In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import eli5

from sklearn import metrics
from eli5.sklearn import PermutationImportance
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool

# loading data

In [4]:
# Load the competition's training and test tables from the Kaggle input directory.
train_df = pd.read_csv('../input/tabular-playground-series-dec-2021/train.csv')
test_df = pd.read_csv('../input/tabular-playground-series-dec-2021/test.csv')

# Model inputs: every training column except the row identifier and the label.
FEATURES = train_df.columns[~train_df.columns.isin(['Id', 'Cover_Type'])].tolist()

# Metadata
- To make the data easier to manage, we store meta-information about each variable (its role, measurement level, whether it is kept, and its dtype) in a DataFrame. This is helpful when we want to select specific variables for analysis, visualization, or modeling.

In [5]:
# Build one metadata record per training column: its role in the model, its
# measurement level, whether it is kept for modelling, and its storage dtype.
data = []

for f in train_df.columns:
    col_dtype = train_df[f].dtype

    # Role: the label to predict, the row identifier, or an ordinary input.
    # (The names must match the real column labels 'Cover_Type' and 'Id'
    # exactly, otherwise every column silently ends up as 'input'.)
    if f == 'Cover_Type':
        role = 'target'
    elif f == 'Id':
        role = 'id'
    else:
        role = 'input'

    # Level: columns whose name contains 'Type', 'Area' or 'cat', and the row
    # identifier, are nominal; the remaining columns are classified by dtype.
    if 'Type' in f or 'Area' in f or 'cat' in f or f == 'Id':
        level = 'nominal'
    elif pd.api.types.is_float_dtype(col_dtype):
        level = 'interval'
    elif pd.api.types.is_integer_dtype(col_dtype):
        # Width-independent check: matches int8..int64 on every platform.
        level = 'ordinal'
    else:
        # Explicit fallback so a column of any other dtype never inherits the
        # level of the previous column (or raises NameError on the first one).
        level = 'nominal'

    data.append({
        'varname': f,
        'role': role,
        'level': level,
        # The row identifier is the only column flagged as not kept.
        'keep': f != 'Id',
        'dtype': col_dtype,
    })

# One row per variable, indexed by variable name.
meta = pd.DataFrame(data, columns=['varname', 'role', 'level', 'keep', 'dtype'])
meta = meta.set_index('varname')

In [6]:
# Display the per-variable metadata table (role, level, keep flag, dtype).
meta

# reduce memory

In [7]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype spanning their range.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64
    are considered; all other columns are left untouched. Columns are
    reassigned on ``df`` itself, and that same frame is returned.

    NOTE: float columns are narrowed on value *range* only. float16/float32
    carry fewer significant digits than float64, so fractional values may be
    rounded by the downcast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; modified in place.
    verbose : bool, default True
        When True, print the memory footprint after conversion and the
        percentage saved. When False, nothing is printed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes from narrowest to widest, paired with their value limits.
    int_limits = [(t, np.iinfo(t)) for t in (np.int8, np.int16, np.int32, np.int64)]
    float_limits = [(t, np.finfo(t)) for t in (np.float16, np.float32)]

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            candidates, fallback = int_limits, None
        else:
            # Anything outside the float32 range (including inf / all-NaN
            # columns, whose comparisons are all False) stays float64.
            candidates, fallback = float_limits, np.float64

        # Inclusive bounds: a value exactly equal to a dtype's min or max is
        # still representable in that dtype.
        new_type = next(
            (t for t, lim in candidates if lim.min <= c_min and c_max <= lim.max),
            fallback,
        )
        if new_type is not None:
            df[col] = df[col].astype(new_type)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        # Guard the ratio so a frame reporting zero bytes does not divide by zero.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [8]:
# Downcast both frames (training first, then test); each call reports its own savings.
train_df, test_df = map(reduce_mem_usage, (train_df, test_df))

# Data at first sight

In [9]:
# Summarise the training frame (columns, non-null counts, dtypes, memory) after downcasting.
train_df.info()

In [10]:
# Cardinality of every training column (missing values are not counted).
for column in train_df.columns:
    n_distinct = train_df[column].nunique()
    print('Variables {:>40} has {} distinct values'.format(column, n_distinct))

In [11]:
# Cardinality of every test column (missing values are not counted).
for column in test_df.columns:
    n_distinct = test_df[column].nunique()
    print('Variables {:>40} has {} distinct values'.format(column, n_distinct))

In [12]:
# Count missing values per training column in one pass, then report each
# column's count followed by the grand total.
null_counts = train_df.isnull().sum()
for column, n_null in null_counts.items():
    print("Variables : {:>30}\t missings : {}".format(column, n_null))
missing = null_counts.sum()
print("Sum of missing_value : {}".format(missing))

In [13]:
# Names of the kept variables whose measurement level is nominal.
keep_nominal = (meta['level'] == 'nominal') & meta['keep']
v = meta.loc[keep_nominal].index
# Summary statistics for those columns.
train_df[v].describe()

In [14]:
# List the currently selected variable names, one per line.
for varname in v:
    print(varname)

In [15]:
# Names of the kept variables whose measurement level is ordinal; `v` is
# reused by the plotting cells that follow.
keep_ordinal = (meta['level'] == 'ordinal') & meta['keep']
v = meta.loc[keep_ordinal].index
for varname in v:
    print(varname)

# visualization

In [16]:
# Draw a 20% random sample of each frame for plotting. A fixed random_state
# makes the samples — and therefore the figures — identical on every run.
s1 = train_df.sample(frac=0.2, random_state=42)
s2 = test_df.sample(frac=0.2, random_state=42)

In [17]:
# Overlay the sampled train and test distributions of every variable in `v`,
# one panel per variable. The grid grows with len(v) instead of assuming
# exactly ten variables, and no extra empty figure is created.
n_cols = 5
n_rows = max(1, -(-len(v) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 6 * n_rows), squeeze=False)

for ax, f in zip(axes.ravel(), v):
    sns.histplot(s1[f], color="blue", kde=True, bins=100, label='train_'+f, ax=ax)
    sns.histplot(s2[f], color="olive", kde=True, bins=100, label='test_'+f, ax=ax)
    ax.set_xlabel(f, fontsize=9)
    ax.legend()

# Hide any panels left over when len(v) is not a multiple of n_cols.
for ax in axes.ravel()[len(v):]:
    ax.set_visible(False)

plt.show()

In [18]:
def corr_heatmap(v):
    """Show an annotated heatmap of the pairwise correlations among columns ``v``.

    Reads the module-level ``train_df``; ``v`` is a column selector accepted by
    ``train_df[...]``. The figure is displayed and nothing is returned.
    """
    corr_matrix = train_df[v].corr()

    # Diverging colour map between hues 220 and 10, centred on zero below.
    palette = sns.diverging_palette(220, 10, as_cmap=True)

    fig, ax = plt.subplots(figsize=(30, 10))
    sns.heatmap(
        corr_matrix,
        cmap=palette,
        vmax=1.0,
        center=0,
        fmt='.2f',
        square=True,
        linewidths=.5,
        annot=True,
        cbar_kws={"shrink": .75},
    )
    plt.show()


# Correlations among the variables currently selected in `v`.
corr_heatmap(v)

In [19]:
# Class balance of the label: number of training rows per Cover_Type value.
train_df['Cover_Type'].value_counts()

In [20]:
# Bar chart of the number of training rows in each Cover_Type class.
# NOTE(review): newer seaborn releases may warn when `palette` is passed without `hue` — confirm against the installed version.
sns.catplot(x="Cover_Type", kind="count", palette="ch:.25", data=train_df)

In [21]:
# Show the test frame's column labels.
test_df.columns

In [22]:
# train data
target = train_df['Cover_Type']
train_df.drop(columns=['Id', 'Cover_Type', 'Soil_Type7', 'Soil_Type15'], axis=1, inplace=True)

# test data 
test_df.drop(columns=['Id', 'Soil_Type7', 'Soil_Type15'], axis=1, inplace=True)
FEATURES.remove('Soil_Type7')
FEATURES.remove('Soil_Type15')

In [24]:
# Add row-wise summary statistics over the current FEATURES columns to both
# frames, under identical column names and in the same order.
for frame in (train_df, test_df):
    base = frame[FEATURES]
    frame["mean"] = base.mean(axis=1)
    frame["std"] = base.std(axis=1)
    frame["min"] = base.min(axis=1)
    frame["max"] = base.max(axis=1)

# Register the new columns as model inputs only after the aggregates are
# computed, so they are not fed into their own calculation.
FEATURES += ['mean', 'std', 'min', 'max']

In [25]:
# Peek at the first rows, including the newly added row-wise aggregate columns.
train_df.head()

In [26]:
# Standardise the features: the scaling statistics are learned from the
# training frame only and then applied to both frames. From here on
# `train_df` and `test_df` refer to the transformer's output.
scaler = StandardScaler().fit(train_df)
train_df = scaler.transform(train_df)
test_df = scaler.transform(test_df)

In [27]:
# Hold out 20% of the training rows for validation. A fixed random_state makes
# the shuffled split — and therefore the reported scores — repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train_df, target, test_size=0.2, shuffle=True, random_state=42
)

In [28]:
# Fit a CatBoost classifier (library defaults, GPU training) on the training split.
model = CatBoostClassifier(silent=True, task_type='GPU')
model.fit(X_train, y_train)

print("train set accuracy : ", accuracy_score(y_train, model.predict(X_train)))
# X_val / y_val are the held-out validation split, not the competition test
# set, so the score is labelled accordingly.
print("validation set accuracy : ", accuracy_score(y_val, model.predict(X_val)))



In [30]:
# Fill the sample submission with the model's predictions for the test rows.
submission = pd.read_csv('../input/tabular-playground-series-dec-2021/sample_submission.csv')
# Flatten the predictions to 1-D before assigning them to a single column,
# rather than relying on pandas to coerce a 2-D array.
# NOTE(review): CatBoost is expected to return one label per row here — confirm the output shape.
submission['Cover_Type'] = np.ravel(model.predict(test_df))
submission.to_csv("submission.csv", index=False)
