#### Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from lazypredict.Supervised import LazyClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV



## Tabular Playground Series - Dec 2021 (Kaggle Competition)
Competition to predict a categorical target based on a number of feature columns given in the data.

* Available here: [Tabular Playground Series - Dec 2021](https://www.kaggle.com/c/tabular-playground-series-dec-2021/overview)

### Data Imports and Data Inspection

**Findings:**

* Wildernes_area feature already OneHotEncoded
* Soil_type feature already OneHotEncoded


In [2]:
df = pd.read_csv('data/train.csv')
df.head() 

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,0,3189,40,8,30,13,3270,206,234,193,...,0,0,0,0,0,0,0,0,0,1
1,1,3026,182,5,280,29,3270,233,240,106,...,0,0,0,0,0,0,0,0,0,2
2,2,3106,13,7,351,37,2914,208,234,137,...,0,0,0,0,0,0,0,0,0,1
3,3,3022,276,13,192,16,3034,207,238,156,...,0,0,0,0,0,0,0,0,0,2
4,4,2906,186,13,266,22,2916,231,231,154,...,0,0,0,0,0,0,0,0,0,2


In [8]:
#df.info()

#### Features and Target for model

In [3]:
y = df['Cover_Type']
X = df.drop(columns = ['Id','Cover_Type'])

- Check for duplicate values

In [7]:
X.duplicated().sum()

0

#### Check for class imbalance
- Need to balance classes, will do that after getting model baseline

In [8]:
y.value_counts()

2    2262087
1    1468136
3     195712
7      62261
6      11426
4        377
5          1
Name: Cover_Type, dtype: int64

#### Split train/test 

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.20, random_state = 42)

### Create Baseline Model

In [13]:
model = RandomForestClassifier(n_estimators=30)

model.fit(X_train,y_train)

RandomForestClassifier(n_estimators=30)

In [14]:
model.score(X_test, y_test)

0.95650875

### Manage Class Imbalance 

- SMOTE does not handle class inbalance correctly for classes with only 1 value. Removing outlier

In [41]:
outlier_index = y_train[y_train == 5].index[0]

In [43]:
X_train.drop(labels = outlier_index, inplace = True)

In [44]:
y_train = y_train[y_train != 5]

In [45]:
y_train.value_counts()

2    1809408
1    1175000
3     156317
7      49823
6       9155
4        296
Name: Cover_Type, dtype: int64

- Implementing SMOTE

In [59]:
class_dict = {2: 1809408, 1:1175000, 3: 156317, 7:100000, 6: 100000, 4: 100000}

In [60]:
oversample = SMOTE(sampling_strategy = class_dict, k_neighbors = 7)
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [61]:
pd.Series(y_train).value_counts()

2    1809408
1    1175000
3     156317
7     100000
6     100000
4     100000
Name: Cover_Type, dtype: int64

### Train new model with balanced classes

- Training new RandomForest model with 50 estimators using the newly balanced class

In [62]:
model = RandomForestClassifier(n_estimators=50)

model.fit(X_train,y_train)

RandomForestClassifier(n_estimators=50)

In [None]:
model.score(X_test, y_test)

0.9550825

### Train model with scaled features

In [29]:
scaler = StandardScaler()

columns_to_scale = ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
                    'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
                    'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points']

X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])

In [31]:
model = RandomForestClassifier(n_estimators=60)

model.fit(X_train,y_train)

RandomForestClassifier(n_estimators=60)

In [33]:
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

model.score(X_test, y_test)

0.41250625

**_Using scaling impacts negatively and considerably the performance of the model. Will revert to model without scaling._**

### Lazy Predict

In [None]:
clf = LazyClassifier(verbose=1,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

  3%|██▍                                                                     | 1/29 [05:58<2:47:14, 358.36s/it]

{'Model': 'AdaBoostClassifier', 'Accuracy': 0.87357375, 'Balanced Accuracy': 0.3140973943463404, 'ROC AUC': None, 'F1 Score': 0.8429344531028092, 'Time taken': 358.3599491119385}


  7%|████▉                                                                   | 2/29 [13:52<3:11:47, 426.20s/it]

{'Model': 'BaggingClassifier', 'Accuracy': 0.95537625, 'Balanced Accuracy': 0.6938082699751155, 'ROC AUC': None, 'F1 Score': 0.9548439018740096, 'Time taken': 473.6722729206085}


 10%|███████▍                                                                | 3/29 [14:14<1:44:42, 241.64s/it]

{'Model': 'BernoulliNB', 'Accuracy': 0.8409075, 'Balanced Accuracy': 0.43597242717710927, 'ROC AUC': None, 'F1 Score': 0.83151644152671, 'Time taken': 22.001189947128296}


### SVM with Grid Search

In [None]:
svc = SVC()
param_grid = {'C': [1,10], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']}
clf = GridSearchCV(svc, param_grid)
clf.fit(X_train, y_train)

### SGD Classification

In [None]:
sgd = SGDClassifier()
sgd.fit(X_train, y_train)

SGDClassifier()

In [None]:
sgd.score(X_test, y_test)

### Generate labels and file for submission

In [53]:
submission_df = pd.read_csv('data/test.csv')
X_submission = submission_df.drop(columns = ['Id'])
labels = model.predict(X_submission)

In [54]:
labels_pred_dict = {'Id': submission_df['Id'], 'Cover_Type': labels}

In [55]:
labels_predict_df = pd.DataFrame.from_dict(labels_pred_dict).set_index('Id') 

In [56]:
labels_predict_df.to_csv('data/submission2.csv')

### Submission Log

- Dec 15th, 2021 --> Score: **0.94865**
- Dec 16th, 2021 --> Score: **0.94756** - w/SMOTE