In [314]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Data Processing (dropped na's)

In [709]:
train_df = pd.read_csv('train.csv')

In [710]:
train_df = train_df.drop(columns=['Name', 'PassengerId', 'Ticket', 'Cabin'])

In [711]:
train_df.isna().any()

Survived    False
Pclass      False
Sex         False
Age          True
SibSp       False
Parch       False
Fare        False
Embarked     True
dtype: bool

In [712]:
train_df = train_df.dropna()

In [713]:
train_df.loc[:, 'Sex'] = train_df['Sex'].replace(['male', 'female'], [1, 0])

In [714]:
train_df.loc[:, 'Embarked'] = train_df['Embarked'].replace(['S', 'C', 'Q'], [0, 1, 2])

In [716]:
X = train_df.loc[:, 'Pclass':'Embarked'].values
y = train_df['Survived'].values

# Data processing without dropping na's

## Prepping datasets

In [736]:
train_df = pd.read_csv('train.csv')

In [737]:
train_df = train_df.drop(columns=['Name', 'PassengerId', 'Ticket', 'Cabin', 'Survived'])
train_df['female'] = train_df['Sex'].replace(['female', 'male'], [1, 0])
train_df['male'] = train_df['Sex'].replace(['female', 'male'], [0, 1])
train_df['third_class'] = train_df['Pclass'].replace([1, 2, 3], [0, 0, 1])
train_df['second_class'] = train_df['Pclass'].replace([1, 2, 3], [0, 1, 0])
train_df['first_class'] = train_df['Pclass'].replace([1, 2, 3], [1, 0, 0])
train_df = train_df.drop(columns=['Pclass', 'Sex'])
train_df.loc[:, 'Embarked'] = train_df['Embarked'].replace(['S', 'C', 'Q'], [int(0), int(1), int(2)])

In [738]:
train_df.isna().any()

Age              True
SibSp           False
Parch           False
Fare            False
Embarked         True
female          False
male            False
third_class     False
second_class    False
first_class     False
dtype: bool

In [742]:
pred_age = train_df[train_df['Age'].isna()].drop(columns=['Embarked', 'Age'])
pred_emb = train_df[train_df['Embarked'].isna()].drop(columns=['Embarked', 'Age'])

In [743]:
train_age = train_df.drop(index=pred_df_age.index).drop(columns=['Embarked'])
train_emb = train_df.drop(index=pred_df_emb.index).drop(columns=['Age'])

In [741]:
train_emb['Embarked'] = train_emb['Embarked'].astype('int64')

## Filling gaps with KNN

In [744]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [496]:
X_train_age = train_age.drop(columns='Age').values
y_train_age = train_age['Age'].values
X_train_emb = train_emb.drop(columns='Embarked').values
y_train_emb = train_emb['Embarked'].values

X_pred_age = pred_age.values
X_pred_emb = pred_emb.values

In [513]:
knc = KNeighborsClassifier()
knr = KNeighborsRegressor()
lnr = LinearRegression()

In [748]:
params = [{'n_neighbors': [1, 2, 3, 4, 5, 6]}]

In [531]:
gsc = GridSearchCV(estimator=knc, param_grid=params, scoring='accuracy', cv=25)
gsc.fit(X_train_emb, y_train_emb)
print(gsc.best_params_)
print(gsc.score(X_train_emb, y_train_emb))

{'n_neighbors': 1}
0.9493813273340832


In [537]:
gsr = GridSearchCV(estimator=knr, param_grid=params, scoring='r2', cv=25)
gsr.fit(X_train_age, y_train_age)
print(gsr.best_params_)
print(gsr.score(X_train_age, y_train_age))

{'n_neighbors': 5}
0.3817154025791778


In [544]:
sub_vals_emb = pd.Series(gsc.predict(X_pred_emb), index=pred_emb.index)

In [555]:
filled_emb_col = train_df['Embarked'].fillna(sub_vals_emb)

In [558]:
sub_vals_age = pd.Series(gsr.predict(X_pred_age), index=pred_age.index)

In [560]:
filled_age_col = train_df['Age'].fillna(sub_vals_age)

In [561]:
train_no_na = train_df.copy()
train_no_na['Age'] = filled_age_col
train_no_na['Embarked'] = filled_emb_col

In [565]:
train_no_na.isna().any()

Age             False
SibSp           False
Parch           False
Fare            False
Embarked        False
female          False
male            False
third_class     False
second_class    False
first_class     False
dtype: bool

In [745]:
X_train_fare = train_no_na.drop(columns='Fare').values
y_train_fare = train_no_na['Fare'].values

In [752]:
gsr_fare = GridSearchCV(estimator=knr, param_grid=params, scoring='r2', cv=25)
gsr_fare.fit(X_train_fare, y_train_fare)
print(gsr_fare.best_params_)
print(gsr_fare.score(X_train_fare, y_train_fare))

{'n_neighbors': 6}
0.48209705464948893


# Random Forest Classifier (filled na's)

## One-hot classes

In [770]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

train_df_1 = pd.read_csv('train.csv')

In [771]:
X = train_no_na.values
y = train_df_1['Survived'].values

In [772]:
temp_X = np.zeros((len(X), 2))
temp_X[:, 0] = X[:, 0]
temp_X[:, 1] = X[:, 3]

In [773]:
scaler = StandardScaler().fit(temp_X)

In [774]:
temp_X = scaler.transform(temp_X)

In [775]:
scaled_X = np.copy(X)
scaled_X[:, 0] = temp_X[:, 0]
scaled_X[:, 3] = temp_X[:, 1]

In [776]:
params = [{'max_depth': [4, 6, 8], 'n_estimators': [100], 
          'min_samples_split':[6, 7, 8]}]

In [777]:
rfc = RandomForestClassifier()

gs = GridSearchCV(estimator=rfc, param_grid=params, scoring='accuracy', cv=25)
gs.fit(scaled_X, y)
print(gs.best_params_)
print(gs.best_score_)

{'max_depth': 8, 'min_samples_split': 7, 'n_estimators': 100}
0.8350793650793651


In [778]:
gs.score(scaled_X, y) #REMEMBER THAT THIS IS FOR TRAINING SET, don't overfit.

0.8978675645342312

## Regular classes

In [700]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

train_df_1 = pd.read_csv('train.csv')

In [701]:
train_reg = train_no_na.copy()
train_reg['Class'] = train_reg['first_class'] + 2*train_reg['second_class'] + 3*train_reg['third_class']
train_reg['Gender'] = train_reg['male']
train_reg = train_reg.drop(columns=['female', 'male', 'first_class', 'second_class', 'third_class'])

In [703]:
X = train_reg.values
y = train_df_1['Survived'].values

In [704]:
temp_X = np.zeros((len(X), 2))
temp_X[:, 0] = X[:, 0]
temp_X[:, 1] = X[:, 3]

In [705]:
scaler = StandardScaler().fit(temp_X)
temp_X = scaler.transform(temp_X)
scaled_X = np.copy(X)
scaled_X[:, 0] = temp_X[:, 0]
scaled_X[:, 3] = temp_X[:, 1]

In [706]:
params = [{'max_depth': [4, 6, 8], 'n_estimators': [100], 
          'min_samples_split':[6, 7, 8]}]

In [707]:
rfc = RandomForestClassifier()

gs = GridSearchCV(estimator=rfc, param_grid=params, scoring='accuracy', cv=25)
gs.fit(scaled_X, y)
print(gs.best_params_)
print(gs.best_score_)

{'max_depth': 8, 'min_samples_split': 6, 'n_estimators': 100}
0.8327936507936508


In [708]:
gs.score(scaled_X, y)

0.9023569023569024

# Random Forest Classifier (dropped na's)

In [723]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [724]:
scaler = StandardScaler().fit(X_train[:, 5].reshape((-1, 1)))

In [725]:
X[:, 5] = scaler.transform(X[:, 5].reshape((-1, 1))).reshape(len(X[:, 5]))

In [726]:
from sklearn.ensemble import RandomForestClassifier

In [727]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [728]:
params = [{'max_depth': [4, 6, 8], 'n_estimators': [100], 
          'min_samples_split':[6, 7, 8]}]

In [729]:
rfc = RandomForestClassifier()

gs = GridSearchCV(estimator=rfc, param_grid=params, scoring='accuracy', cv=25)
gs.fit(X, y)
print(gs.best_params_)
print(gs.best_score_)

{'max_depth': 4, 'min_samples_split': 6, 'n_estimators': 100}
0.8189655172413792


In [730]:
gs.score(X, y)

0.8384831460674157

# Data processing: One-hot encoding categoricals (dropped na's)

In [179]:
train_df = pd.read_csv('train.csv')
train_df = train_df.drop(columns=['Name', 'PassengerId', 'Ticket', 'Cabin'])
train_df = train_df.dropna()

In [180]:
train_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
889,1,1,male,26.0,0,0,30.0000,C


In [193]:
train_df['female'] = train_df['Sex'].replace(['female', 'male'], [1, 0])
train_df['male'] = train_df['Sex'].replace(['female', 'male'], [0, 1])
train_df['third_class'] = train_df['Pclass'].replace([1, 2, 3], [0, 0, 1])
train_df['second_class'] = train_df['Pclass'].replace([1, 2, 3], [0, 1, 0])
train_df['first_class'] = train_df['Pclass'].replace([1, 2, 3], [1, 0, 0])
train_df['s'] = train_df['Embarked'].replace(['S', 'C', 'Q'], [1, 0, 0])
train_df['c'] = train_df['Embarked'].replace(['S', 'C', 'Q'], [0, 1, 0])
train_df['q'] = train_df['Embarked'].replace(['S', 'C', 'Q'], [0, 0, 1])
train_df = train_df.drop(columns=['Pclass', 'Sex', 'Embarked'])

In [194]:
train_df

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,female,male,third_class,second_class,first_class,s,c,q
0,0,22.0,1,0,7.2500,0,1,1,0,0,1,0,0
1,1,38.0,1,0,71.2833,1,0,0,0,1,0,1,0
2,1,26.0,0,0,7.9250,1,0,1,0,0,1,0,0
3,1,35.0,1,0,53.1000,1,0,0,0,1,1,0,0
4,0,35.0,0,0,8.0500,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,39.0,0,5,29.1250,1,0,1,0,0,0,0,1
886,0,27.0,0,0,13.0000,0,1,0,1,0,1,0,0
887,1,19.0,0,0,30.0000,1,0,0,0,1,1,0,0
889,1,26.0,0,0,30.0000,0,1,0,0,1,0,1,0


In [195]:
X = train_df.loc[:, 'Age':'q'].values
y = train_df['Survived'].values

# SVM (dropped na's)

In [196]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [197]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=692)

In [208]:
X_scale = np.vstack((X_train[:, 0], X_train[:, 3])).T
X_scale_test = np.vstack((X_test[:, 0], X_test[:, 3])).T

In [209]:
scaler = StandardScaler().fit(X_scale)

In [210]:
X_train[:, 0] = scaler.transform(X_scale)[:, 0]
X_train[:, 3] = scaler.transform(X_scale)[:, 1]
X_test[:, 0] = scaler.transform(X_scale_test)[:, 0]
X_test[:, 3] = scaler.transform(X_scale_test)[:, 1]

In [261]:
params = np.linspace(0.01, 0.5, 1000)
max = 0.0

for i in params:
    svm = SVC(C=i)
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    #print(acc)
    
    if acc > max: 
        max = acc
        best = i
        
        
print(max, best)

0.875 0.06346346346346346


In [265]:
svm = SVC(C=0.06346346346346346)
svm.fit(X_train, y_train)
accuracy_score(y_train, svm.predict(X_train))

0.8078125

In [254]:
params = np.linspace(0.0001, 0.5, 1000)
max = 0.0

for i in params:
    svm = SVC(C=i, kernel='linear')
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    #print(acc)
    
    if acc > max: 
        max = acc
        best = i
        
        
print(max, best)

0.8611111111111112 0.005604404404404405


In [260]:
params = [np.linspace(0.0001, 0.3, 100), [2, 3, 4, 5, 6, 7, 8]]
max = 0.0

for i in params[0]:
    for j in params[1]:
        svm = SVC(C=i, kernel='poly', degree=j)
        svm.fit(X_train, y_train)
        y_pred = svm.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        #print(acc)

        if acc > max: 
            max = acc
            best = [i, j]
        
        
print(max, best)

0.8194444444444444 [0.1576232323232323, 2]


# Testing RFC

In [793]:
test_df = pd.read_csv('test.csv')
test_df_1 = pd.read_csv('test.csv')

In [732]:
test_df = test_df.drop(columns=['Name', 'PassengerId', 'Ticket', 'Cabin'])
test_df['female'] = test_df['Sex'].replace(['female', 'male'], [1, 0])
test_df['male'] = test_df['Sex'].replace(['female', 'male'], [0, 1])
test_df['third_class'] = test_df['Pclass'].replace([1, 2, 3], [0, 0, 1])
test_df['second_class'] = test_df['Pclass'].replace([1, 2, 3], [0, 1, 0])
test_df['first_class'] = test_df['Pclass'].replace([1, 2, 3], [1, 0, 0])
test_df = test_df.drop(columns=['Pclass', 'Sex'])
test_df.loc[:, 'Embarked'] = test_df['Embarked'].replace(['S', 'C', 'Q'], [int(0), int(1), int(2)])

In [735]:
test_df.isna().sum()

Age             86
SibSp            0
Parch            0
Fare             1
Embarked         0
female           0
male             0
third_class      0
second_class     0
first_class      0
dtype: int64

In [761]:
missing_fare = test_df[test_df['Fare'].isna()].drop(columns='Fare')
missing_age = test_df[test_df['Age'].isna()].drop(columns=['Embarked', 'Age'])

In [765]:
sub_vals_fare = pd.Series(gsr_fare.predict(missing_fare.values), index=missing_fare.index)
filled_fare_col = test_df['Fare'].fillna(sub_vals_fare)
sub_vals_age = pd.Series(gsr.predict(missing_age.values), index=missing_age.index)
filled_age_col = test_df['Age'].fillna(sub_vals_age)

In [767]:
test_no_na = test_df.copy()
test_no_na['Age'] = filled_age_col
test_no_na['Fare'] = filled_fare_col

In [769]:
test_no_na.isna().any()

Age             False
SibSp           False
Parch           False
Fare            False
Embarked        False
female          False
male            False
third_class     False
second_class    False
first_class     False
dtype: bool

In [780]:
test_no_na.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Embarked,female,male,third_class,second_class,first_class
0,34.5,0,0,7.8292,2,0,1,1,0,0
1,47.0,1,0,7.0,0,1,0,1,0,0
2,62.0,0,0,9.6875,2,0,1,0,1,0
3,27.0,0,0,8.6625,0,0,1,1,0,0
4,22.0,1,1,12.2875,0,1,0,1,0,0


In [781]:
train_no_na.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Embarked,female,male,third_class,second_class,first_class
0,22.0,1,0,7.25,0.0,0,1,1,0,0
1,38.0,1,0,71.2833,1.0,1,0,0,0,1
2,26.0,0,0,7.925,0.0,1,0,1,0,0
3,35.0,1,0,53.1,0.0,1,0,0,0,1
4,35.0,0,0,8.05,0.0,0,1,1,0,0


In [802]:
final = test_df_1.copy()
final['Survived'] = pd.Series(gs.predict(test_no_na.values), index=test_no_na.index)
final = final.drop(columns=final.columns.difference(['PassengerId', 'Survived']))

In [803]:
final

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [805]:
final.to_csv('output.csv', index=False)