Kaggle Score: ...

Key differences in this notebook: 
- using train + test data **together** for feature imputation
- using the full train set to train the model, no validation set 
- using the age from title method from here: https://www.kaggle.com/code/vivovinco/titanic-endless-predictions 

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# need this to be able to import processor module
import os
import sys
sys.path.insert(0, os.path.abspath('../modules'))
import preprocess as pp

In [2]:

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report

np.random.seed(35)

from sklearn.neighbors import KNeighborsClassifier, KNeighborsTransformer

grid= {"n_neighbors": np.arange(8, 20, 2), 
        "leaf_size": np.arange(1, 40, 4),
        "weights": ["uniform", "distance"], 
        'p': [1,2],
        'metric': ['minkowski', 'chebyshev', 'correlation', 'dice']}

gs_model = GridSearchCV(KNeighborsClassifier(), 
                        param_grid=grid,
                        cv= 5)

In [3]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
mm_scaler = MinMaxScaler()
std_scaler = StandardScaler()

## Import Data & Infer Features

In [4]:
df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")

test_PassengerIDs = test_df.PassengerId

df = df.append(test_df)
df["Family"] = df.SibSp+df.Parch
df.Embarked.fillna("X", inplace=True)
df.Fare.fillna(df.Fare.median(), inplace=True)
df[['LastName','TitleFirstName']] = df.Name.str.split(',', expand=True)
df['Title'] = df.TitleFirstName.apply(lambda x: x.split('.') [0])
df.Title = df.Title.str.replace(' ', '')
              
df = pp.infer_cabin_features(df, mark_missing=False)

df.replace(['male','female'],[0,1],inplace=True)

df.isna().sum()

  df = df.append(test_df)


PassengerId         0
Survived          418
Pclass              0
Name                0
Sex                 0
Age               263
SibSp               0
Parch               0
Ticket              0
Fare                0
Cabin               0
Embarked            0
Family              0
LastName            0
TitleFirstName      0
Title               0
Deck                0
dtype: int64

In [5]:
# Replacing rare titles with more common ones
mapping = {'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Mme': 'Miss',
          'Jonkheer': 'Mr', 'Lady': 'Mrs', 'Capt': 'Mr', 'Countess': 'Mrs', 'Ms': 'Miss', 'Dona': 'Mrs'}
df.replace({'Title': mapping}, inplace=True)

titles = ['Dr', 'Master', 'Miss', 'Mr', 'Mrs', 'Rev']

for title in titles:
    age_to_impute = df.groupby('Title')['Age'].median()[titles.index(title)]
    df.loc[(df.Age.isna() & (df.Title == title)), 'Age'] = age_to_impute
    

df = pp.set_age_groups(df)
df = pp.set_fare_groups(df)
df.isna().sum()

PassengerId         0
Survived          418
Pclass              0
Name                0
Sex                 0
Age                 0
SibSp               0
Parch               0
Ticket              0
Fare                0
Cabin               0
Embarked            0
Family              0
LastName            0
TitleFirstName      0
Title               0
Deck                0
AgeGroup            0
FareGroup           0
dtype: int64

In [6]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family,LastName,TitleFirstName,Title,Deck,AgeGroup,FareGroup
0,1,0.0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,unknown,S,1,Braund,Mr. Owen Harris,Mr,unknown,2,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1,Cumings,Mrs. John Bradley (Florence Briggs Thayer),Mrs,C,5,5
2,3,1.0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,unknown,S,0,Heikkinen,Miss. Laina,Miss,unknown,3,1
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,1,Futrelle,Mrs. Jacques Heath (Lily May Peel),Mrs,C,4,5
4,5,0.0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,unknown,S,0,Allen,Mr. William Henry,Mr,unknown,4,1


## Experiment 1: Sub age and fare with categories

Take the features that I think make sense to include:
- Pclass
- Sex
- Embarked
- Family
- Age Group
- Fare Group
- Deck


In [7]:
df1 = df.copy()
df1 = pp.numerify_categorical_columns_0(df1, columns=['Embarked', 'Title', 'Deck'])
df1 = df1.drop(['PassengerId', 'Name', 'Age', 'SibSp', 'Parch', 'Fare', 'Ticket', 'LastName','Cabin', 'TitleFirstName'], axis=1)

In [8]:
# scale with Min-Max Scaler 
df1_mm = df1.copy()
scale_arr = mm_scaler.fit_transform(df1_mm)
df1_mm = pd.DataFrame(scale_arr)
df1_mm.columns = df1.columns
df1_mm.head()

Unnamed: 0,Survived,Pclass,Sex,Embarked,Family,Title,Deck,AgeGroup,FareGroup
0,0.0,1.0,0.0,0.666667,0.1,0.5,1.0,0.2,0.0
1,1.0,0.0,1.0,0.0,0.1,0.666667,0.294118,0.8,1.0
2,1.0,1.0,1.0,0.666667,0.0,0.333333,1.0,0.4,0.0
3,1.0,0.0,1.0,0.666667,0.1,0.666667,0.294118,0.6,1.0
4,0.0,1.0,0.0,0.666667,0.0,0.5,1.0,0.6,0.0


In [9]:
# train with Min-Max Scaler 

df1_mm_train = df1_mm[:891]
df1_mm_test = df1_mm[891:]
df1_mm_test = df1_mm_test.drop("Survived", axis=1)

df1_mm_x = df1_mm_train.drop("Survived", axis=1)
df1_mm_y = df1_mm_train.Survived

gs_model.fit(df1_mm_x, df1_mm_y)

gs_model.best_score_, gs_model.best_params_

(0.8282970309459545,
 {'leaf_size': 1,
  'metric': 'minkowski',
  'n_neighbors': 12,
  'p': 1,
  'weights': 'uniform'})

In [19]:
# scale with Standard Scaler
df1_std = df1.copy()

# split out the survived col
df1_std_x = df1_std.drop("Survived", axis=1)
df1_std_y = df1_std.Survived
original_x_col_names = df1_std_x.columns

# grab Y
df1_std_y = df1_std_y[:891]

# scale features ONLY
scale_arr = std_scaler.fit_transform(df1_std_x)
df1_std_x = pd.DataFrame(scale_arr)
df1_std_x.columns = original_x_col_names

# split scaled x into train and test
df1_std_x = df1_std_x[:891]
df1_std_test = df1_std_x[891:]

df1_std_x.head()

Unnamed: 0,Pclass,Sex,Embarked,Family,Title,Deck,AgeGroup,FareGroup
0,0.841916,-0.743497,0.618968,0.073352,0.189485,0.501055,-0.916634,-0.956908
1,-1.546098,1.344995,-1.832558,0.073352,1.474648,-1.888312,0.951358,1.796043
2,0.841916,1.344995,0.618968,-0.558346,-1.095677,0.501055,-0.29397,-0.956908
3,-1.546098,1.344995,0.618968,0.073352,1.474648,-1.888312,0.328694,1.796043
4,0.841916,-0.743497,0.618968,-0.558346,0.189485,0.501055,0.328694,-0.956908


In [20]:
# train with Standard Scaler

gs_model.fit(df1_std_x, df1_std_y)

gs_model.best_score_, gs_model.best_params_

(0.8350260498399347,
 {'leaf_size': 13,
  'metric': 'minkowski',
  'n_neighbors': 12,
  'p': 2,
  'weights': 'uniform'})

## Experiment 2: take the 5 most highly correlated features to survival

1. Sex
2. Pclass
3. Fare Group
4. Deck
5. Title

In [26]:
df2 = df.copy()
df2 = pp.numerify_categorical_columns_0(df2, columns=['Deck', 'Title'])
df2 = df2.drop(['PassengerId', 'Name', 'Age', 'SibSp', 'Parch', 'Fare', 'Family', 'AgeGroup', 'Embarked', 'Cabin', 'Ticket', 'LastName', 'TitleFirstName'], axis=1)

In [27]:
# scale with Min-Max Scaler 
df2_mm = df2.copy()
scale_arr = mm_scaler.fit_transform(df2_mm)
df2_mm = pd.DataFrame(scale_arr)
df2_mm.columns = df2.columns
df2_mm.head()

Unnamed: 0,Survived,Pclass,Sex,Title,Deck,FareGroup
0,0.0,1.0,0.0,0.5,1.0,0.0
1,1.0,0.0,1.0,0.666667,0.294118,1.0
2,1.0,1.0,1.0,0.333333,1.0,0.0
3,1.0,0.0,1.0,0.666667,0.294118,1.0
4,0.0,1.0,0.0,0.5,1.0,0.0


In [28]:
# train with Min-Max Scaler 

df2_mm_train = df2_mm[:891]
df2_mm_test = df2_mm[891:]

df2_mm_x = df2_mm_train.drop("Survived", axis=1)
df2_mm_y = df2_mm_train.Survived

df2_mm_test = df2_mm_test.drop("Survived", axis=1)

gs_model.fit(df2_mm_x, df2_mm_y)

gs_model.best_score_, gs_model.best_params_

(0.8215491808423827,
 {'leaf_size': 9,
  'metric': 'minkowski',
  'n_neighbors': 12,
  'p': 1,
  'weights': 'distance'})

In [30]:
# scale with Standard Scaler
df2_std = df2.copy()

# split out the survived col
df2_std_x = df2_std.drop("Survived", axis=1)
df2_std_y = df2_std.Survived
original_x_col_names = df2_std_x.columns

# grab Y
df2_std_y = df2_std_y[:891]

# scale features ONLY
scale_arr = std_scaler.fit_transform(df2_std_x)
df2_std_x = pd.DataFrame(scale_arr)
df2_std_x.columns = original_x_col_names

# split scaled x into train and test
df2_std_x = df2_std_x[:891]
df2_std_test = df2_std_x[891:]

df2_std_x.head()

Unnamed: 0,Pclass,Sex,Title,Deck,FareGroup
0,0.841916,-0.743497,0.189485,0.501055,-0.956908
1,-1.546098,1.344995,1.474648,-1.888312,1.796043
2,0.841916,1.344995,-1.095677,0.501055,-0.956908
3,-1.546098,1.344995,1.474648,-1.888312,1.796043
4,0.841916,-0.743497,0.189485,0.501055,-0.956908


In [31]:
# train with Standard Scaler
gs_model.fit(df2_std_x, df2_std_y)

gs_model.best_score_, gs_model.best_params_

(0.821542903772519,
 {'leaf_size': 25,
  'metric': 'minkowski',
  'n_neighbors': 12,
  'p': 2,
  'weights': 'distance'})

## Test Step

Take the better model params and run model with them against the corresponding test data

best score: 0.8350260498399347

best model params:
 {'leaf_size': 13,
  'metric': 'minkowski',
  'n_neighbors': 12,
  'p': 2,
  'weights': 'uniform'}

data transform: df_std_1 - standard scaled with Pclass,	Sex,	Embarked,	Family,	Title,	Deck,	AgeGroup,	FareGroup

In [33]:
df1_std_test.head()

Unnamed: 0,Pclass,Sex,Title,Deck,FareGroup


In [32]:
model = KNeighborsClassifier(leaf_size=13, metric='minkowski', n_neighbors=12, p=2, weights='uniform')

model.fit(df1_std_x, df1_std_y)
preds = model.predict(df1_std_test)

Feature names seen at fit time, yet now missing:
- AgeGroup
- Embarked
- Family



ValueError: Found array with 0 sample(s) (shape=(0, 5)) while a minimum of 1 is required.

In [None]:
best_score = 0.8237900947837549

In [None]:
kaggle_data = pd.DataFrame()
kaggle_data["PassengerId"] = test_PassengerIDs
kaggle_data["Survived"] = preds.astype(int)

In [None]:
ss_score_percent = round(best_score, 3)*100
#kaggle_data.to_csv("../result-csv/KNNv3-std-"+str(ss_score_percent)+"p_accuracy.csv", index=False)