In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [97]:
trainfile = '../data/processed/train_1.0.csv'
testfile = '../data/processed/test_1.0.csv'

In [98]:
df_train = pd.read_csv(trainfile)
df_test = pd.read_csv(testfile)

In [99]:
df_train.drop('Unnamed: 0', axis=1,inplace=True)

In [100]:
df_train.sort_values(['Pclass','Name']).head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Female,FamilySize,Agem_0-16,Agem_16-25,Agem_25-40,Agem_40-60,Agem_60+,from_C
730,731,1,1,"Allen, Miss. Elisabeth Walton",1,1,0,0,1,0,0,0
305,306,1,1,"Allison, Master. Hudson Trevor",0,4,1,0,0,0,0,0
297,298,0,1,"Allison, Miss. Helen Loraine",1,4,1,0,0,0,0,0
498,499,0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",1,4,0,1,0,0,0,0
460,461,1,1,"Anderson, Mr. Harry",0,1,0,0,0,1,0,0


In [101]:
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [102]:
X_train = df_train[['Pclass','Female',"FamilySize",
                          'Agem_0-16',
                          'Agem_16-25',
                          'Agem_25-40',
                          'Agem_40-60'
                          ]]
Y_train = df_train["Survived"]
X_test  = df_test[['Pclass','Female',"FamilySize",
                          'Agem_0-16',
                          'Agem_16-25',
                          'Agem_25-40',
                          'Agem_40-60'
                          ]].copy()
X_train.shape, Y_train.shape, X_test.shape

((891, 7), (891,), (418, 7))

In [103]:
logreg = LogisticRegression()
logreg.fit(X_train,Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log

80.25

In [104]:
pd.Series(logreg.coef_[0])

0   -1.052133
1    2.731275
2   -0.259295
3    1.919548
4    0.416715
5    0.500682
6   -0.014193
dtype: float64

In [105]:
coef = pd.DataFrame(['Pclass','Female',"FamilySize",
                          'Agem_0-16',
                          'Agem_16-25',
                          'Agem_25-40',
                          'Agem_40-60',
                          'from_C'])
coef.columns = ['Feature']
coef["Correlation"] = pd.Series(logreg.coef_[0])

coef.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
1,Female,2.731275
3,Agem_0-16,1.919548
5,Agem_25-40,0.500682
4,Agem_16-25,0.416715
6,Agem_40-60,-0.014193
2,FamilySize,-0.259295
0,Pclass,-1.052133
7,from_C,


In [106]:
# Support Vector Machines

svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc

83.049999999999997

In [107]:
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn

82.489999999999995

In [108]:
# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

84.060000000000002

In [109]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

84.060000000000002

In [110]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
3,Random Forest,84.06
4,Decision Tree,84.06
0,Support Vector Machines,83.05
1,KNN,82.49
2,Logistic Regression,80.25


In [111]:
submission = pd.DataFrame({
        "PassengerId": df_test["PassengerId"],
        "Survived": Y_pred
    })
submission.to_csv('../data/processed/submission.csv', index=False)