# Test Models

There are many different algorithms that can be used for classification problems. In this work, we aim to compare results by some of the main algorithms and evaluate the impact of our features treatment in each of them.

### Objectives
1. Evaluate different algorithms in terms of accuracy
2. Evaluate the impact of data categorization in algorithms

#### Tested Algorithms 
1. Logistic Regression
2. Gaussian Naive Bayes
3. KNN Classifier
4. Random Forest Classifier

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


## Algorithms with partially categorized features
Our first batch of tests will input datasets without full categorization. After that, we'll compare results with and without categorization.

In [2]:
results = pd.DataFrame([])

In [3]:
df_train = pd.read_csv('../data/interim/train.csv')
df_test = pd.read_csv('../data/interim/test.csv')

#### Prepare to train

In [4]:
X_train = df_train.drop(['Survived', 'PassengerId'], axis=1)
Y_train = df_train['Survived']

X_test = df_test.drop('PassengerId', axis=1)

X_train.head()

Unnamed: 0.1,Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,0
1,1,1,1,38.0,1,0,71.2833,2
2,2,3,1,26.0,0,0,7.925,0
3,3,1,1,35.0,1,0,53.1,0
4,4,3,0,35.0,0,0,8.05,0


#### Cross validation 70/30

In [5]:
b_fold = round(X_train.shape[0] * 0.3)

X_train_70 = X_train[:-b_fold]
Y_train_70 = Y_train[:-b_fold]
X_train_30 = X_train[-b_fold:]
Y_train_30 = Y_train[-b_fold:]

### Test Logistic Regression

In [6]:
log_regressor = LogisticRegression()
log_regressor.fit(X_train_70,Y_train_70)
prd = log_regressor.predict(X_test)

score = log_regressor.score(X_train_30,Y_train_30)

print(f"Accuracy: {score:.02%}")

results = results.append([['Logistic Regression', score, 'Regular Features']], ignore_index=True)

Accuracy: 79.03%


### Test Naive Bayes

In [7]:
nb_regressor = GaussianNB()
nb_regressor.fit(X_train_70,Y_train_70)
prd = nb_regressor.predict(X_test)

score = nb_regressor.score(X_train_30,Y_train_30)

print(f"Accuracy: {score:.02%}")

results = results.append([['Gaussian Naive Bayes', score, 'Regular Features']], ignore_index=True)

Accuracy: 80.90%


### Test KNN

In [8]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_70,Y_train_70)
prd = knn.predict(X_test)

score = knn.score(X_train_30,Y_train_30)

print(f"Accuracy: {score:.02%}")

results = results.append([['KNN', score, 'Regular Features']], ignore_index=True)

Accuracy: 70.04%


### Test Random Forest

In [9]:
rf_regressor = RandomForestClassifier(n_estimators=100)
rf_regressor.fit(X_train_70,Y_train_70)
prd = rf_regressor.predict(X_test)

score = rf_regressor.score(X_train_30,Y_train_30)

print(f"Accuracy: {score:.02%}")

results = results.append([['Random Forest', score, 'Regular Features']], ignore_index=True)

Accuracy: 84.64%


## Algorithms with totally categorized features and feature engineering

In [10]:
df_train_fe = pd.read_csv('../data/interim/train_fe.csv')
df_test_fe = pd.read_csv('../data/interim/test_fe.csv')

In [11]:
X_train_fe = df_train_fe.drop(['Survived', 'PassengerId'], axis=1)
Y_train_fe = df_train_fe['Survived']

X_test_fe = df_test_fe.drop('PassengerId', axis=1)

X_train_fe.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,hasSibSp,hasParch,AgeGroupI,AgeGroupII,AgeGroupIII
0,3,0,22,1,0,7.25,0,1,0,3.0,1.0,0.0
1,1,1,38,1,0,71.2833,2,1,0,4.0,4.0,3.0
2,3,1,26,0,0,7.925,0,0,0,3.0,2.0,1.0
3,1,1,35,1,0,53.1,0,1,0,4.0,3.0,2.0
4,3,0,35,0,0,8.05,0,0,0,4.0,3.0,2.0


In [12]:
X_test_fe.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,AgeGroupI,AgeGroupII,AgeGroupIII,hasSibSp,hasParch
0,3,0,34,0,0,7.8292,1,4.0,3.0,2.0,0,0
1,3,1,47,1,0,7.0,0,4.0,4.0,3.0,1,0
2,2,0,62,0,0,9.6875,1,5.0,5.0,3.0,0,0
3,3,0,27,0,0,8.6625,0,3.0,2.0,1.0,0,0
4,3,1,22,1,1,12.2875,0,3.0,1.0,0.0,1,1


In [13]:
b_fold = round(X_train_fe.shape[0] * 0.3)

X_train_fe_70 = X_train_fe[:-b_fold]
Y_train_fe_70 = Y_train_fe[:-b_fold]
X_train_fe_30 = X_train_fe[-b_fold:]
Y_train_fe_30 = Y_train_fe[-b_fold:]

### Test Logistic Regression

In [14]:
log_regressor = LogisticRegression()
log_regressor.fit(X_train_fe_70,Y_train_fe_70)
prd = log_regressor.predict(X_test_fe)

score = log_regressor.score(X_train_fe_30,Y_train_fe_30)

print(f"Accuracy: {score:.02%}")

results = results.append([['Logistic Regression', score, 'Feature Engineering']], ignore_index=True)

Accuracy: 82.40%


### Test Naive Bayes

In [15]:
nb_regressor = GaussianNB()
nb_regressor.fit(X_train_fe_70,Y_train_fe_70)
prd = nb_regressor.predict(X_test_fe)

score = nb_regressor.score(X_train_fe_30,Y_train_fe_30)

print(f"Accuracy: {score:.02%}")

results = results.append([['Gaussian Naive Bayes', score, 'Feature Engineering']], ignore_index=True)

Accuracy: 80.52%


### Test KNN

In [16]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_fe_70,Y_train_fe_70)
prd = knn.predict(X_test_fe)

score = knn.score(X_train_fe_30,Y_train_fe_30)

print(f"Accuracy: {score:.02%}")

results = results.append([['KNN', score,'Feature Engineering']], ignore_index=True)

Accuracy: 73.78%


### Test Random Forest

In [17]:
rf_regressor = RandomForestClassifier(n_estimators=100)
rf_regressor.fit(X_train_fe_70,Y_train_fe_70)
prd = rf_regressor.predict(X_test_fe)

score = rf_regressor.score(X_train_fe_30,Y_train_fe_30)

print(f"Accuracy: {score:.02%}")

results = results.append([['Random Forest', score,'Feature Engineering']], ignore_index=True)

Accuracy: 82.40%


## Using only categorical features

In [18]:
X_train_fe_c = df_train_fe.drop(['Survived', 'PassengerId','Age', 'Fare'], axis=1)
Y_train_fe_c = df_train_fe['Survived']

X_test_fe_c = df_test_fe.drop(['PassengerId','Age', 'Fare'], axis=1)

X_train_fe_c.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,hasSibSp,hasParch,AgeGroupI,AgeGroupII,AgeGroupIII
0,3,0,1,0,0,1,0,3.0,1.0,0.0
1,1,1,1,0,2,1,0,4.0,4.0,3.0
2,3,1,0,0,0,0,0,3.0,2.0,1.0
3,1,1,1,0,0,1,0,4.0,3.0,2.0
4,3,0,0,0,0,0,0,4.0,3.0,2.0


In [19]:
b_fold = round(X_train_fe_c.shape[0] * 0.3)

X_train_fe_c_70 = X_train_fe_c[:-b_fold]
Y_train_fe_c_70 = Y_train_fe_c[:-b_fold]
X_train_fe_c_30 = X_train_fe_c[-b_fold:]
Y_train_fe_c_30 = Y_train_fe_c[-b_fold:]

### Test Logistic Regression

In [20]:
log_regressor = LogisticRegression()
log_regressor.fit(X_train_fe_c_70,Y_train_fe_c_70)
prd = log_regressor.predict(X_test_fe_c)

score = log_regressor.score(X_train_fe_c_30,Y_train_fe_c_30)

print(f"Accuracy: {score:.02%}")

results = results.append([['Logistic Regression', score, 'Only categories']], ignore_index=True)

Accuracy: 81.65%


### Test Naive Bayes

In [21]:
nb_regressor = GaussianNB()
nb_regressor.fit(X_train_fe_c_70,Y_train_fe_c_70)
prd = nb_regressor.predict(X_test_fe_c)

score = nb_regressor.score(X_train_fe_c_30,Y_train_fe_c_30)

print(f"Accuracy: {score:.02%}")

results = results.append([['Gaussian Naive Bayes', score, 'Only categories']], ignore_index=True)

Accuracy: 81.65%


### Test KNN

In [22]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_fe_c_70,Y_train_fe_c_70)
prd = knn.predict(X_test_fe_c)

score = knn.score(X_train_fe_c_70,Y_train_fe_c_70)

print(f"Accuracy: {score:.02%}")

results = results.append([['KNN', score,'Only categories']], ignore_index=True)

Accuracy: 85.10%


### Test Random Forest

In [23]:
rf_regressor = RandomForestClassifier(n_estimators=100)
rf_regressor.fit(X_train_fe_c_70,Y_train_fe_c_70)
prd = rf_regressor.predict(X_test_fe_c)

score = rf_regressor.score(X_train_fe_c_30,Y_train_fe_c_30)

print(f"Accuracy: {score:.02%}")

results = results.append([['Random Forest', score, 'Only categories']], ignore_index=True)

Accuracy: 83.15%


## Conclusion

In those tests, KNN result in better score.

In [24]:
results = results.rename(columns={0:'Algorithm', 
                        1:'Score', 
                        2:'Features'})
        
results.sort_values('Score', ascending=False)

Unnamed: 0,Algorithm,Score,Features
10,KNN,0.850962,Only categories
3,Random Forest,0.846442,Regular Features
11,Random Forest,0.831461,Only categories
4,Logistic Regression,0.82397,Feature Engineering
7,Random Forest,0.82397,Feature Engineering
8,Logistic Regression,0.816479,Only categories
9,Gaussian Naive Bayes,0.816479,Only categories
1,Gaussian Naive Bayes,0.808989,Regular Features
5,Gaussian Naive Bayes,0.805243,Feature Engineering
0,Logistic Regression,0.790262,Regular Features


## Finally, let's run our best predictions and submit!

In [26]:
rf_regressor = RandomForestClassifier(n_estimators=100)
rf_regressor.fit(X_train,Y_train)
prd = rf_regressor.predict(X_test)

df_pred = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': prd})

df_pred.to_csv('../data/processed/submission_RF.csv',index=False)

In [27]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_fe_c,Y_train_fe_c)
prd = knn.predict(X_test_fe_c)

df_pred = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': prd})

df_pred.to_csv('../data/processed/submission_knn.csv',index=False)