Kaggle Score: 0.76315

In [71]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# need this to be able to import processor module
import os
import sys
sys.path.insert(0, os.path.abspath('../modules'))
import preprocess as pp

## Notebook objectives


1. try Age, Fare, Ticket, Deck features. 
2. predict test age based on train data age medians

In [72]:
df_raw = pd.read_csv("../data/train.csv")

df = df_raw.copy()
df["Family"] = df.SibSp+df.Parch
df.Embarked.fillna("X", inplace=True)
df[['LastName','TitleFirstName']] = df.Name.str.split(',', expand=True)
df['Title'] = df.TitleFirstName.apply(lambda x: x.split('.') [0])
        
df = pp.infer_cabin_features(df, mark_missing=False)
df = df.drop(['Name', 'SibSp', "Parch", 'TitleFirstName', 'LastName', 'Cabin', 'PassengerId'], axis=1)

df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
Ticket        0
Fare          0
Embarked      0
Family        0
Title         0
Deck          0
dtype: int64

In [73]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Ticket,Fare,Embarked,Family,Title,Deck
0,0,3,male,22.0,A/5 21171,7.25,S,1,Mr,unknown
1,1,1,female,38.0,PC 17599,71.2833,C,1,Mrs,C
2,1,3,female,26.0,STON/O2. 3101282,7.925,S,0,Miss,unknown
3,1,1,female,35.0,113803,53.1,S,1,Mrs,C
4,0,3,male,35.0,373450,8.05,S,0,Mr,unknown


In [74]:
num_df = df.copy()
num_df = pp.numerify_categorical_columns_0(num_df, columns=["Sex", "Embarked", "Title", "Ticket", "Deck",])
num_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Ticket,Fare,Embarked,Family,Title,Deck
0,0,3,1,22.0,523,7.25,2,1,11,16
1,1,1,0,38.0,596,71.2833,0,1,12,5
2,1,3,0,26.0,669,7.925,2,0,8,16
3,1,1,0,35.0,49,53.1,2,1,12,5
4,0,3,1,35.0,472,8.05,2,0,11,16


In [75]:
sex_arr = num_df.Sex.unique()
fam_arr = np.arange(0, 11, 1)
median_ages = np.zeros(( len(sex_arr),len(fam_arr) ))


for sex in sex_arr:
    sex_median = num_df.loc[num_df.Sex == sex].Age.median()
    for family in fam_arr:
        pred_age = num_df.loc[((num_df.Family == family) & (num_df.Sex == sex))].Age.median()
        if pred_age > 0:
            median_ages[sex][family] = pred_age
            #print(f"sex {sex}, fam {family} --> age {median_ages[sex][family]}")
        else:
            median_ages[sex][family] = sex_median
            #print(f"sex {sex}, fam {family}* --> age {median_ages[sex][family]}")

median_ages

def fill_na_with_median(row):
    if row.Age > 0:
        return row.Age
    else:
        return median_ages[int(row.Sex)][int(row.Family)]


median_ages

array([[28.5, 28. , 28.5, 23. , 21. , 24. , 10. , 29.5, 27. , 27. , 27. ],
       [30. , 31. , 24. , 23. ,  9. ,  7.5,  6.5, 10. , 29. , 29. , 29. ]])

In [76]:
num_df.Age = num_df.apply(lambda row: fill_na_with_median(row), axis=1 )

In [77]:
num_df.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
Ticket      0
Fare        0
Embarked    0
Family      0
Title       0
Deck        0
dtype: int64

In [78]:
num_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Ticket,Fare,Embarked,Family,Title,Deck
0,0,3,1,22.0,523,7.25,2,1,11,16
1,1,1,0,38.0,596,71.2833,0,1,12,5
2,1,3,0,26.0,669,7.925,2,0,8,16
3,1,1,0,35.0,49,53.1,2,1,12,5
4,0,3,1,35.0,472,8.05,2,0,11,16


# Unstandardized Data KNN

In [79]:

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report

np.random.seed(35)

x = num_df.drop("Survived", axis=1)
y = num_df.Survived

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2)

In [80]:
from sklearn.neighbors import KNeighborsClassifier, KNeighborsTransformer

grid= {"n_neighbors": np.arange(8, 20, 2), 
        "leaf_size": np.arange(1, 7, 2),
        "weights": ["uniform", "distance"], 
        'p': [1,2],
        'metric': ['minkowski', 'chebyshev', 'correlation', 'dice']}

gs_model = GridSearchCV(KNeighborsClassifier(), 
                        param_grid=grid,
                        cv= 5)

gs_model.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'leaf_size': array([1, 3, 5]),
                         'metric': ['minkowski', 'chebyshev', 'correlation',
                                    'dice'],
                         'n_neighbors': array([ 8, 10, 12, 14, 16, 18]),
                         'p': [1, 2], 'weights': ['uniform', 'distance']})

In [81]:
models = {"Plain KNN": {
    "data_transform": "none",
    "best_score": gs_model.best_score_,
    "test_score": gs_model.score(x_test, y_test),
    "best_params": gs_model.best_params_
}}

best_score = gs_model.best_score_

models

{'Plain KNN': {'data_transform': 'none',
  'best_score': 0.792169802028957,
  'test_score': 0.7653631284916201,
  'best_params': {'leaf_size': 1,
   'metric': 'dice',
   'n_neighbors': 16,
   'p': 1,
   'weights': 'uniform'}}}

In [82]:
y_preds = gs_model.predict(x_test)
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.83      0.76      0.79       106
           1       0.69      0.77      0.73        73

    accuracy                           0.77       179
   macro avg       0.76      0.77      0.76       179
weighted avg       0.77      0.77      0.77       179



# Min-max scaler 

let's do this one just for comparison b/c it improved the scores last time

In [83]:
# Scale it!!
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

mm_scale_df = num_df.copy()
scale_arr = scaler.fit_transform(mm_scale_df)
mm_scale_df = pd.DataFrame(scale_arr)
mm_scale_df.columns = num_df.columns
mm_scale_df.head()



Unnamed: 0,Survived,Pclass,Sex,Age,Ticket,Fare,Embarked,Family,Title,Deck
0,0.0,1.0,1.0,0.271174,0.769118,0.014151,0.666667,0.1,0.6875,1.0
1,1.0,0.0,0.0,0.472229,0.876471,0.139136,0.0,0.1,0.75,0.3125
2,1.0,1.0,0.0,0.321438,0.983824,0.015469,0.666667,0.0,0.5,1.0
3,1.0,0.0,0.0,0.434531,0.072059,0.103644,0.666667,0.1,0.75,0.3125
4,0.0,1.0,1.0,0.434531,0.694118,0.015713,0.666667,0.0,0.6875,1.0


In [84]:
mms_df_x = mm_scale_df.drop("Survived", axis=1)
mms_df_y = mm_scale_df.Survived

mms_x_train, mms_x_test, mms_y_train, mms_y_test = train_test_split(mms_df_x, mms_df_y, test_size=.2 )

mms_gs_model = GridSearchCV(KNeighborsClassifier(), 
                        param_grid=grid,
                        cv= 5)

mms_gs_model.fit(mms_x_train, mms_y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'leaf_size': array([1, 3, 5]),
                         'metric': ['minkowski', 'chebyshev', 'correlation',
                                    'dice'],
                         'n_neighbors': array([ 8, 10, 12, 14, 16, 18]),
                         'p': [1, 2], 'weights': ['uniform', 'distance']})

In [85]:
mms_test_score = mms_gs_model.score(mms_x_test, mms_y_test)
models["MinMax Scaled"] =  {
    "data_transform": "MinMax",
    "best_score": mms_gs_model.best_score_,
    "test_score": mms_test_score,
    "best_params": mms_gs_model.best_params_
}

if mms_test_score > best_score:
    best_score = mms_test_score
    
models

{'Plain KNN': {'data_transform': 'none',
  'best_score': 0.792169802028957,
  'test_score': 0.7653631284916201,
  'best_params': {'leaf_size': 1,
   'metric': 'dice',
   'n_neighbors': 16,
   'p': 1,
   'weights': 'uniform'}},
 'MinMax Scaled': {'data_transform': 'MinMax',
  'best_score': 0.8145769723234512,
  'test_score': 0.8156424581005587,
  'best_params': {'leaf_size': 1,
   'metric': 'minkowski',
   'n_neighbors': 18,
   'p': 1,
   'weights': 'uniform'}}}

In [86]:
y_preds = gs_model.predict(mms_x_test)
print(classification_report(mms_y_test, y_preds))

              precision    recall  f1-score   support

         0.0       0.80      0.84      0.82       110
         1.0       0.72      0.67      0.69        69

    accuracy                           0.77       179
   macro avg       0.76      0.75      0.75       179
weighted avg       0.77      0.77      0.77       179



# Standard Scaler

In [87]:
# Scale it!!

from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()

num_df_x = num_df.drop("Survived", axis=1)
ss_df_y = num_df.Survived

scale_arr = std_scaler.fit_transform(num_df_x)
ss_df_x = pd.DataFrame(scale_arr)
ss_df_x.columns = num_df_x.columns
ss_df_x.head()



Unnamed: 0,Pclass,Sex,Age,Ticket,Fare,Embarked,Family,Title,Deck
0,0.827377,0.737695,-0.580613,0.918966,-0.502445,0.581114,0.05916,0.414641,0.500962
1,-1.566107,-1.355574,0.646611,1.282625,0.786845,-1.93846,0.05916,0.961158,-1.851663
2,0.827377,-1.355574,-0.273807,1.646283,-0.488854,0.581114,-0.560975,-1.224909,0.500962
3,-1.566107,-1.355574,0.416507,-1.442322,0.42073,0.581114,0.05916,0.961158,-1.851663
4,0.827377,0.737695,0.416507,0.664904,-0.486337,0.581114,-0.560975,0.414641,0.500962


In [88]:
ss_x_train, ss_x_test, ss_y_train, ss_y_test = train_test_split(ss_df_x, ss_df_y, test_size=.2 )


ss_gs_model = GridSearchCV(KNeighborsClassifier(), 
                        param_grid=grid,
                        cv= 5)

ss_gs_model.fit(ss_x_train, ss_y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'leaf_size': array([1, 3, 5]),
                         'metric': ['minkowski', 'chebyshev', 'correlation',
                                    'dice'],
                         'n_neighbors': array([ 8, 10, 12, 14, 16, 18]),
                         'p': [1, 2], 'weights': ['uniform', 'distance']})

In [89]:
ss_test_score = ss_gs_model.score(ss_x_test, ss_y_test)

models["Standard Scaled"] =  {
    "data_transform": "Standard",
    "best_score": ss_gs_model.best_score_,
    "test_score": ss_test_score,
    "best_params": ss_gs_model.best_params_
}


if ss_test_score > best_score:
    best_score = ss_test_score
    
models

{'Plain KNN': {'data_transform': 'none',
  'best_score': 0.792169802028957,
  'test_score': 0.7653631284916201,
  'best_params': {'leaf_size': 1,
   'metric': 'dice',
   'n_neighbors': 16,
   'p': 1,
   'weights': 'uniform'}},
 'MinMax Scaled': {'data_transform': 'MinMax',
  'best_score': 0.8145769723234512,
  'test_score': 0.8156424581005587,
  'best_params': {'leaf_size': 1,
   'metric': 'minkowski',
   'n_neighbors': 18,
   'p': 1,
   'weights': 'uniform'}},
 'Standard Scaled': {'data_transform': 'Standard',
  'best_score': 0.8020092583472864,
  'test_score': 0.8547486033519553,
  'best_params': {'leaf_size': 1,
   'metric': 'minkowski',
   'n_neighbors': 10,
   'p': 2,
   'weights': 'uniform'}}}

In [90]:
y_preds = ss_gs_model.predict(ss_x_test)
print(classification_report(ss_y_test, y_preds))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89       119
           1       0.77      0.80      0.79        60

    accuracy                           0.85       179
   macro avg       0.84      0.84      0.84       179
weighted avg       0.86      0.85      0.86       179



# Kaggle Test data...

we'll do standard scaled knn

In [91]:
raw_test_df = pd.read_csv("../data/test.csv")
test_PassengerIDs = raw_test_df.PassengerId


test_df = raw_test_df.copy()
test_df["Family"] = test_df.SibSp+test_df.Parch
test_df.Embarked.fillna("X", inplace=True)
test_df[['LastName','TitleFirstName']] = test_df.Name.str.split(',', expand=True)
test_df['Title'] = test_df.TitleFirstName.apply(lambda x: x.split('.') [0])
        
test_df = pp.infer_cabin_features(test_df, mark_missing=False)
test_df = test_df.drop(['Name', 'SibSp', "Parch", 'TitleFirstName', 'LastName', 'Cabin', 'PassengerId'], axis=1)
test_df.head()



Unnamed: 0,Pclass,Sex,Age,Ticket,Fare,Embarked,Family,Title,Deck
0,3,male,34.5,330911,7.8292,Q,0,Mr,unknown
1,3,female,47.0,363272,7.0,S,1,Mrs,unknown
2,2,male,62.0,240276,9.6875,Q,0,Mr,unknown
3,3,male,27.0,315154,8.6625,S,0,Mr,unknown
4,3,female,22.0,3101298,12.2875,S,2,Mrs,unknown


In [92]:
test_df.Fare.fillna(num_df.Fare.median(), inplace=True)
test_df = pp.numerify_categorical_columns_0(test_df, columns=["Sex", "Embarked", "Title", "Ticket", "Deck",])
test_df.Age = test_df.apply(lambda row: fill_na_with_median(row), axis=1 )
test_df.head()

Unnamed: 0,Pclass,Sex,Age,Ticket,Fare,Embarked,Family,Title,Deck
0,3,1,34.5,152,7.8292,1,0,5,16
1,3,0,47.0,221,7.0,2,1,6,16
2,2,1,62.0,73,9.6875,1,0,5,16
3,3,1,27.0,147,8.6625,2,0,5,16
4,3,0,22.0,138,12.2875,2,2,6,16


In [93]:
test_df.isna().sum()

Pclass      0
Sex         0
Age         0
Ticket      0
Fare        0
Embarked    0
Family      0
Title       0
Deck        0
dtype: int64

### let's start with standard scaler, since its test score was so high

In [94]:
std_scaler = StandardScaler()

ss_test_df = test_df.copy()
test_scale_arr = std_scaler.fit_transform(ss_test_df)
ss_test_df = pd.DataFrame(test_scale_arr)
ss_test_df.columns = test_df.columns
ss_test_df.head()


Unnamed: 0,Pclass,Sex,Age,Ticket,Fare,Embarked,Family,Title,Deck
0,0.873482,0.755929,0.353767,-0.269494,-0.497413,-0.470915,-0.553443,0.155072,0.48788
1,0.873482,-1.322876,1.341178,0.372934,-0.512278,0.700767,0.105643,1.292265,0.48788
2,-0.315819,0.755929,2.526071,-1.005027,-0.4641,-0.470915,-0.553443,0.155072,0.48788
3,0.873482,0.755929,-0.238679,-0.316046,-0.482475,0.700767,-0.553443,0.155072,0.48788
4,0.873482,-1.322876,-0.633644,-0.399841,-0.417492,0.700767,0.764728,1.292265,0.48788


In [95]:
test_preds = ss_gs_model.predict(ss_test_df)

In [96]:
kaggle_data = pd.DataFrame()
kaggle_data["PassengerId"] = test_PassengerIDs
kaggle_data["Survived"] = test_preds.astype(int)

In [97]:
ss_score_percent = round(best_score, 3)*100
kaggle_data.to_csv("../result-csv/KNNv2-std-"+str(ss_score_percent)+"p_accuracy.csv", index=False)

### let's try the min-max scaler... see what that does

In [98]:
mm_scaler = MinMaxScaler()

mm_test_df = test_df.copy()
test_scale_arr = mm_scaler.fit_transform(mm_test_df)
mm_test_df = pd.DataFrame(test_scale_arr)
mm_test_df.columns = test_df.columns
mm_test_df.head()


Unnamed: 0,Pclass,Sex,Age,Ticket,Fare,Embarked,Family,Title,Deck
0,1.0,1.0,0.452723,0.41989,0.015282,0.5,0.0,0.625,1.0
1,1.0,0.0,0.617566,0.610497,0.013663,1.0,0.1,0.75,1.0
2,0.5,1.0,0.815377,0.201657,0.018909,0.5,0.0,0.625,1.0
3,1.0,1.0,0.353818,0.406077,0.016908,1.0,0.0,0.625,1.0
4,1.0,0.0,0.287881,0.381215,0.023984,1.0,0.2,0.75,1.0


In [99]:
test_preds = mms_gs_model.predict(mm_test_df)

In [100]:
kaggle_data = pd.DataFrame()
kaggle_data["PassengerId"] = test_PassengerIDs
kaggle_data["Survived"] = test_preds.astype(int)

In [101]:
mms_score_percent = round(models["MinMax Scaled"]["test_score"], 3)*100
kaggle_data.to_csv("../result-csv/KNNv2-mms-"+str(mms_score_percent)+"p_accuracy.csv", index=False)