In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

# Loading data

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-dec-2021/train.csv',
        '../input/tabular-playground-series-dec-2021/test.csv',
    )
)

# Data at first sight

In [None]:
# Preview the first rows of the training table.
train_df.head()

In [None]:
# Column dtypes, non-null counts and memory footprint of the training table.
train_df.info()

In [None]:
# Report how many distinct (non-null) values each training column takes.
for column, n_distinct in train_df.nunique().items():
    print('Variables {:>40} has {} distinct values'.format(column, n_distinct))

In [None]:
# Same distinct-value report for the test table, to compare against train.
distinct_per_column = {name: test_df[name].nunique() for name in test_df.columns}
for name, n_distinct in distinct_per_column.items():
    print('Variables {:>40} has {} distinct values'.format(name, n_distinct))

In [None]:
# Count nulls per training column in one vectorised pass, then print the
# per-column figures followed by the grand total.
null_counts = train_df.isnull().sum()
for column, n_missing in null_counts.items():
    print("Variables : {:>30}\t missings : {}".format(column, n_missing))
missing = null_counts.sum()
print("Sum of missing_value : {}".format(missing))

In [None]:
# Remove Soil_Type15 and Soil_Type7 from the training table (the same two
# columns are dropped from the test table before modelling).
# NOTE(review): presumably these are constant columns per the distinct-value
# listing — confirm.
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
train_df = train_df.drop(columns=['Soil_Type15', 'Soil_Type7'], errors='ignore')

In [None]:
# Preview the training table again after the column drop.
train_df.head()

# Metadata
- To facilitate data management, we'll store meta-information about the variables in a DataFrame. This will be helpful when we want to select specific variables for analysis, visualization, modeling, etc.

In [None]:
# Build one metadata record per training column:
#   role  - 'target' for Cover_Type, 'id' for Id, 'input' for everything else
#   level - 'nominal' for Id and the Type / Area / cat columns, 'interval' for
#           floats, 'ordinal' for integers
#   keep  - False only for Id
#   dtype - the column's pandas dtype
data = []

for f in train_df.columns:
    # Role of the variable (column names are case-sensitive: 'Cover_Type', 'Id').
    if f == 'Cover_Type':
        role = 'target'
    elif f == 'Id':
        role = 'id'
    else:
        role = 'input'

    dtype = train_df[f].dtype

    # Measurement level. The dtype checks accept any integer/float width so the
    # cell gives the same answer whether or not the frame has been downcast.
    if 'Type' in f or 'Area' in f or 'cat' in f or f == 'Id':
        level = 'nominal'
    elif pd.api.types.is_float_dtype(dtype):
        level = 'interval'
    elif pd.api.types.is_integer_dtype(dtype):
        level = 'ordinal'
    else:
        # Explicit fallback instead of silently reusing the previous column's level.
        level = None

    # Only the identifier column is excluded from downstream selections.
    keep = f != 'Id'

    data.append({
        'varname': f,
        'role': role,
        'level': level,
        'keep': keep,
        'dtype': dtype,
    })

meta = pd.DataFrame(
    data, columns=['varname', 'role', 'level', 'keep', 'dtype']
).set_index('varname')

In [None]:
# Display the metadata table (indexed by variable name).
meta

In [None]:
# Summary statistics for the nominal variables that are flagged to keep.
nominal_mask = (meta['level'] == 'nominal') & meta['keep']
v = meta.loc[nominal_mask].index
train_df[v].describe()

In [None]:
# List the names of the kept nominal variables, one per line.
v = meta.loc[(meta['level'] == 'nominal') & meta['keep']].index
for varname in v:
    print(varname)

In [None]:
# List the names of the kept ordinal variables, one per line.
v = meta.loc[(meta['level'] == 'ordinal') & meta['keep']].index
for varname in v:
    print(varname)

# Visualization

In [None]:
# Draw 20% samples of train and test for plotting. A fixed seed makes the
# sampled rows, and therefore the figures, identical on every run.
s1 = train_df.sample(frac=0.2, random_state=42)
s2 = test_df.sample(frac=0.2, random_state=42)

In [None]:
# Overlay the sampled train (blue) and test (olive) histograms for every kept
# ordinal variable, one panel per variable.
v = meta[(meta.level == 'ordinal') & meta.keep].index

# Size the grid from the number of variables instead of assuming exactly ten.
n_cols = 5
n_rows = max(1, -(-len(v) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 6 * n_rows), squeeze=False)
panels = axes.ravel()

for ax, f in zip(panels, v):
    # Draw on the explicit axes rather than relying on pyplot's "current axes".
    sns.histplot(s1[f], color="blue", kde=True, bins=100, label='train_'+f, ax=ax)
    sns.histplot(s2[f], color="olive", kde=True, bins=100, label='test_'+f, ax=ax)
    ax.set_xlabel(f, fontsize=9)
    ax.legend()

# Hide panels left over when the variable count is not a multiple of n_cols.
for ax in panels[len(v):]:
    ax.set_visible(False)

plt.show()

In [None]:
def corr_heatmap(v):
    """Plot an annotated correlation heatmap for the columns `v` of `train_df`.

    `v` is any column selector accepted by `train_df[...]`. The function reads
    the notebook-level `train_df`, draws the figure and shows it; it returns
    nothing.
    """
    corr_matrix = train_df[v].corr()

    fig, ax = plt.subplots(figsize=(30,10))

    heatmap_options = dict(
        # Diverging colour map between two hues
        cmap=sns.diverging_palette(220, 10, as_cmap=True),
        vmax=1.0,
        center=0,
        fmt='.2f',
        square=True,
        linewidths=.5,
        annot=True,
        cbar_kws={"shrink": .75},
    )
    sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
    plt.show()
    
# Correlation heatmap across the kept ordinal variables.
ordinal_mask = (meta['level'] == 'ordinal') & meta['keep']
v = meta.loc[ordinal_mask].index
corr_heatmap(v)

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of `df` to the narrowest dtype that holds their range.

    Integer columns (int16/int32/int64) are cast to the first of
    int8/int16/int32/int64 whose [min, max] range contains the column's values.
    Float columns are cast to float16 or float32 when the value range fits,
    otherwise float64. Only the value *range* is checked for floats, so casting
    to a narrower float type can reduce precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When True, print the memory usage after optimisation and the
        percentage saved. When False, print nothing.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a value equal to a type's min/max still fits.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range too wide for float32, or min/max is NaN (comparisons fail).
                df[col] = df[col].astype(np.float64)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        # Guard the percentage against a zero-sized starting frame.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [None]:
# Downcast both tables (train first, then test) to cut memory use.
train_df, test_df = reduce_mem_usage(train_df), reduce_mem_usage(test_df)

In [None]:
# Number of training rows per Cover_Type class.
train_df['Cover_Type'].value_counts()

In [None]:
# Bar chart of the Cover_Type class counts in the training table.
sns.catplot(
    data=train_df,
    x="Cover_Type",
    kind="count",
    palette="ch:.25",
)

In [None]:
# Inspect the test table's column names before selecting model features.
test_df.columns

In [None]:
# train data
x = train_df.drop(columns=['Id', 'Cover_Type'])
y = train_df['Cover_Type']
# test data 
test_df = test_df.drop(columns=['Id', 'Soil_Type7', 'Soil_Type15'])

In [None]:
# Standardise the features: learn the scaling statistics on the training
# features, then apply the same transform to train and test.
# NOTE(review): the scaler is fitted before the train/validation split, so the
# validation rows contribute to the statistics — confirm this is acceptable.
scaler = StandardScaler().fit(x)
x = scaler.transform(x)
test_df = scaler.transform(test_df)

In [None]:
# Hold out 20% of the rows for validation. The fixed random_state makes the
# shuffle, and therefore the reported validation accuracy, reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
# Train a CatBoost classifier on the training split (GPU task type).
model = CatBoostClassifier(task_type='GPU')
model.fit(X=X_train, y=y_train)

In [None]:
# Predict classes for the held-out validation rows.
y_pred = model.predict(X_val)

In [None]:
# Fraction of validation rows whose predicted class matches the true class.
val_accuracy = accuracy_score(y_val, y_pred)
print('Accuracy Score : ', val_accuracy)

In [None]:
# Predict on the test features. This rebinds y_pred, replacing the validation
# predictions held under the same name.
y_pred = model.predict(test_df)

In [None]:
# Fill the sample submission with the test predictions and write it out.
submission = pd.read_csv('../input/tabular-playground-series-dec-2021/sample_submission.csv')

# Class predictions may come back as an (n, 1) column rather than a flat
# vector; flatten so the assignment is a plain 1-D column either way.
test_predictions = np.ravel(y_pred)
assert len(test_predictions) == len(submission), (
    "prediction count does not match the number of submission rows"
)

submission['Cover_Type'] = test_predictions
submission.to_csv("submission.csv",index=False)
