In [26]:
import util

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [28]:
# Read the training and test CSV files, using the first column of each as the index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=[0]) for csv_path in ('train.csv', 'test.csv')
)

# Stack both frames row-wise (training rows first) and summarise the combined frame.
df = pd.concat([df_train, df_test])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    float64
 1   Pclass    1309 non-null   int64  
 2   Name      1309 non-null   object 
 3   Sex       1309 non-null   object 
 4   Age       1046 non-null   float64
 5   SibSp     1309 non-null   int64  
 6   Parch     1309 non-null   int64  
 7   Ticket    1309 non-null   object 
 8   Fare      1308 non-null   float64
 9   Cabin     295 non-null    object 
 10  Embarked  1307 non-null   object 
dtypes: float64(3), int64(3), object(5)
memory usage: 122.7+ KB


In [96]:
def get_target_avg_2features(df, features=(), target=None, aggregate='median'):
    """Summarise ``target`` for every combination of values of two features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the two feature columns and the target column.
    features : sequence of str
        Column names; only the first two entries are used.
    target : str
        Name of the column to aggregate.
    aggregate : {'median', 'mean', 'mode'}, default 'median'
        'median' and 'mean' are rounded to one decimal place. 'mode' returns the
        first value of ``Series.mode()`` unrounded.

    Returns
    -------
    pandas.DataFrame
        One row per (feature_1 value, feature_2 value) pair, iterating the unique
        values of each feature in order of first appearance. Columns are
        ``[feature_1, feature_2, f'{target}_Average']``. A combination with no rows
        (or, for 'mode', no non-null target values) gets NaN instead of raising.

    Raises
    ------
    IndexError
        If ``features`` has fewer than two entries.
    ValueError
        If ``aggregate`` is not one of the supported names. This is checked before
        any data is touched, so it is raised even for an empty ``df``.
    """
    feature_1, feature_2 = features[0], features[1]

    def _first_mode(values):
        # Series.mode() is empty when there are no non-null values, so indexing
        # its first element unconditionally would raise.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else float('nan')

    # Resolve the aggregation once, up front, so a bad name always fails fast.
    if aggregate == 'median':
        summarise = lambda values: round(values.median(), 1)
    elif aggregate == 'mean':
        summarise = lambda values: round(values.mean(), 1)
    elif aggregate == 'mode':
        summarise = _first_mode
    else:
        raise ValueError(f'[{aggregate}] is an invalid perameter')

    # Loop-invariant work is hoisted out of the inner loop.
    second_values = df[feature_2].unique()
    results = []
    for i in df[feature_1].unique():
        matches_first = df[feature_1] == i
        for j in second_values:
            target_values = df.loc[matches_first & (df[feature_2] == j), target]
            results.append([i, j, summarise(target_values)])

    return pd.DataFrame(results, columns=[feature_1, feature_2, f'{target}_Average'])
             
        
# Median Age (the default aggregate) for each Pclass / Survived combination in the training set.
get_target_avg_2features(
    df_train,
    features=['Pclass', 'Survived'],
    target='Age',
)

Unnamed: 0,Pclass,Survived,Age_Average
0,3,0,25.0
1,3,1,22.0
2,1,0,45.2
3,1,1,35.0
4,2,0,30.5
5,2,1,28.0


In [75]:
# Print the training frame's column dtypes and non-null counts.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [105]:
df_clean = df.copy()

# Encode sex as an integer (male -> 1, female -> 0). The result is assigned back to
# the column explicitly: `df_clean.Sex.replace(..., inplace=True)` mutates a Series
# selected from the frame, which is not guaranteed to write through to `df_clean`
# (it is a no-op under pandas Copy-on-Write). Any value other than 'male'/'female'
# becomes NaN.
df_clean['Sex'] = df_clean['Sex'].map({'male': 1, 'female': 0})

# Extract the title from the name: take the second ', '-separated piece and keep
# the text before its first '.' (e.g. 'Braund, Mr. Owen Harris' -> 'Mr').
df_clean['Title'] = df_clean['Name'].str.split(', ').str[1].str.split('.').str[0]

# Collapse the less frequent titles into a single 'uncommon' category.
uncommon_titles = [
    'Rev', 'Dr', 'Col', 'Major', 'Mlle', 'Ms', 'Sir', 'Capt', 'Mme',
    'Jonkheer', 'Lady', 'the Countess', 'Don', 'Dona',
]
df_clean['Title'] = df_clean['Title'].mask(df_clean['Title'].isin(uncommon_titles), 'uncommon')


# Impute missing ages with the median age of the rows that share the same Pclass and
# Sex, rounded to one decimal place. The medians are computed over every row of
# df_clean; rows that already have an age are left unchanged.
group_median_age = df_clean.groupby(['Pclass', 'Sex'])['Age'].transform('median').round(1)
df_clean['Age'] = df_clean['Age'].fillna(group_median_age)


# Encode the port of embarkation as a numeric code (S -> 1, C -> 2, Q -> 3) and fill
# missing values with 1 (the code for 'S'). Assigned back explicitly because chained
# `df_clean.Embarked.<method>(..., inplace=True)` calls are not guaranteed to modify
# the frame (no-op under pandas Copy-on-Write).
df_clean['Embarked'] = df_clean['Embarked'].map({'S': 1, 'C': 2, 'Q': 3}).fillna(1)

# df_clean.drop('Name', axis=1, inplace=True)  # No more 'value' to be extracted from name
df_clean = df_clean.drop(columns=[
    'Cabin',   # 77% NaN anyway, could impute data based on fare or title?
    'Ticket',  # Unsure of how to use this feature at the moment
])

# Author's observation: SibSp, Parch and Fare stand out here (could be helped by
# dropping outliers). `numeric_only=True` is required because the frame still holds
# text columns (Name, Title), which make kurtosis() raise on recent pandas versions.
# NOTE: this is not the last expression of the cell, so its result is not displayed.
df_clean.kurtosis(numeric_only=True)

# Age-band indicator columns (1 = passenger's age falls in the band, else 0).
# Each band is a half-open interval [lower, upper). Interval comparisons are used
# instead of `age in range(a, b)` because a range only contains whole numbers, so a
# fractional age between 1 and 40 (e.g. 28.5, or a rounded imputed median) would
# otherwise match no band at all. Missing ages get 0 in every band.
age_bands = {
    'Infant': (-np.inf, 1),
    'Child': (1, 13),
    'Teen': (13, 19),
    'YoungAdult': (19, 30),
    'Adult': (30, 40),
    'Age40+': (40, np.inf),
}
passenger_age = df_clean['Age']
for band_name, (lower, upper) in age_bands.items():
    df_clean[band_name] = ((passenger_age >= lower) & (passenger_age < upper)).astype(int)

util.outliers_percentage(df_clean)

Survived       0.00
Pclass         0.00
Sex            0.00
Age            7.56
SibSp          2.83
Parch         10.47
Fare          15.74
Embarked       0.00
Infant         0.00
Child          0.00
Teen           0.00
YoungAdult     0.00
Adult          0.00
Age40+         0.00
dtype: float64

In [None]:
# One-hot encode the remaining text columns. `Name` is excluded first: it is still in
# the frame (its drop in the cleaning cell is commented out) and get_dummies would
# otherwise expand it into one indicator column per distinct name.
# `errors='ignore'` keeps this cell re-runnable once the column is gone.
df_clean = pd.get_dummies(df_clean.drop(columns='Name', errors='ignore'), drop_first=True)

# Split back into training and test rows by position: `df` was built as
# concat([df_train, df_test]), so the first len(df_train) rows are the training rows.
n_train = len(df_train)
train = df_clean.iloc[:n_train].copy()  # copy so later edits don't touch df_clean
# `Survived` is float64 in the combined frame because the test rows carry no label;
# every training row has one, so restore an integer 0/1 target.
train['Survived'] = train['Survived'].astype(int)
test = df_clean.iloc[n_train:].drop(columns='Survived')

## Testing model performance

In [1]:
# Disabled experiment (left commented out): compares cross_val_score results (cv=10)
# for four classifiers on `train` and tabulates summary statistics of the fold scores.
# x = train.drop('Survived', axis=1)
# y = train.Survived

# cv = 10
# log_reg_scores = cross_val_score(LogisticRegression(solver='liblinear'), x, y, cv=cv)
# tree_classifier_scores = cross_val_score(DecisionTreeClassifier(max_depth=4), x, y, cv=cv)
# rand_forest_scores = cross_val_score(RandomForestClassifier(max_depth=10), x, y, cv=cv)
# k_neighbors = cross_val_score(KNeighborsClassifier(n_neighbors=5), x, y, cv=cv)

# scores_df = pd.DataFrame({
#             'LogisticRegression':log_reg_scores,
#             'DecisionTreeClassifier':tree_classifier_scores,
#             'RandomForestClassifier':rand_forest_scores,
#             'KNeighborsClassifier':k_neighbors
#                             })
# scores_df.describe().drop('count')

In [2]:
# Disabled experiment (left commented out): hold out 10% of `train`, apply a variance
# threshold and standard scaling fitted on the training split, then fit a logistic
# regression and print a classification report.
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report
# from sklearn.feature_selection import VarianceThreshold

# x = train.drop(['Survived'], axis=1)
# y = train.Survived

# NOTE(review): no random_state is passed, so the split differs on every run.
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.1)

# #Feature selection Variance
# var_threshold = VarianceThreshold(0.6).fit(x_train)
# x_train = var_threshold.transform(x_train)
# x_test = var_threshold.transform(x_test)

# stand_scaler = StandardScaler().fit(x_train)

# x_train_stand = stand_scaler.transform(x_train)
# x_test_stand = stand_scaler.transform(x_test)
# #------------------------------------------------------------

# NOTE(review): the model is fitted on the scaled training data (x_train_stand) but
# predicts on the unscaled x_test; this should use x_test_stand if re-enabled.
# log_reg = LogisticRegression().fit(x_train_stand, y_train)
# test_pred = log_reg.predict(x_test)


# print(classification_report(y_test, test_pred, target_names=['Died', 'Survived']))