In [894]:
import pandas as pd
import numpy as np
import itertools as it
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [895]:
# Read train.csv into `data` (used for fitting/evaluation) and test.csv into
# `submission_input` (the frame that is later scored for the submission file).
train_path, test_path = "train.csv", "test.csv"
data = pd.read_csv(train_path)
submission_input = pd.read_csv(test_path)

In [896]:
# Preview the first five rows of the training frame as loaded from train.csv.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [897]:
def clean(data_frame):
    """Prepare a passenger frame for modelling, modifying it in place.

    Steps performed:
      * drop the columns the model does not use
        (``Cabin``, ``Name``, ``Ticket``, ``Embarked``, ``Fare``);
      * fill missing ``Age`` values with the median age of *this* frame;
      * replace ``Sex`` by the single indicator column that
        ``pd.get_dummies(..., drop_first=True)`` produces (for the values
        ``female``/``male`` this keeps the ``male`` indicator).

    No rows are removed, so the row count and index are unchanged.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame containing at least ``Age`` and ``Sex`` columns. Mutated in place.

    Returns
    -------
    None
    """
    unused_columns = ['Cabin', 'Name', 'Ticket', 'Embarked', 'Fare']
    # errors='ignore' makes the call safe to repeat on an already-cleaned
    # frame (re-running the notebook cell no longer raises KeyError).
    data_frame.drop(columns=unused_columns, inplace=True, errors='ignore')

    # The previous `data_frame.dropna()` call discarded its result and was a
    # no-op; it has been removed. Rows are deliberately kept: missing ages are
    # imputed instead, so every input row still gets a prediction.
    data_frame['Age'] = data_frame['Age'].fillna(data_frame['Age'].median())

    # Take the one remaining dummy column as a Series so a plain column
    # assignment is performed rather than a DataFrame-to-column assignment.
    data_frame['Sex'] = pd.get_dummies(data_frame['Sex'], drop_first=True).iloc[:, 0]


In [898]:
# Apply the same in-place cleaning to the submission frame and the training frame.
for frame_to_clean in (submission_input, data):
    clean(frame_to_clean)

In [1526]:
def shuffled_data(data, test_size=0.2, random_state=None):
    """Randomly split a labelled frame into train/test features and targets.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``Survived`` column (the target) plus feature columns.
    test_size : float, default 0.2
        Fraction of rows placed in the test split (forwarded to
        ``train_test_split``).
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the previous
        behaviour of a different random split on every call; pass an integer
        for a reproducible split.

    Returns
    -------
    tuple
        ``(train_x, train_y, test_x, test_y)`` where the ``*_x`` items are the
        frames without ``Survived`` and the ``*_y`` items are the ``Survived``
        columns of the corresponding rows.
    """
    features = data.drop(columns=['Survived'])
    target = data['Survived']
    # train_test_split shuffles by default, so the extra data.sample(frac=1)
    # that used to precede it was redundant and has been dropped.
    train_x, test_x, train_y, test_y = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    return (train_x, train_y, test_x, test_y)

In [1521]:
# Results of the hyper-parameter sweep: maps a (max_depth, criterion,
# min_weight_fraction_leaf) tuple to [mean accuracy, median accuracy].
accuracies = {}

In [1522]:
def train_model(m,c,k):
    """Fit one decision tree on a fresh random split and return its test accuracy.

    The split is drawn from the notebook-level ``data`` frame via
    ``shuffled_data``; every column other than ``Survived`` is used as a feature.

    Parameters
    ----------
    m : int
        ``max_depth`` of the tree.
    c : str
        Split ``criterion`` passed to ``DecisionTreeClassifier``.
    k : float
        ``min_weight_fraction_leaf`` of the tree.

    Returns
    -------
    float
        Accuracy of the fitted tree on the held-out part of the split.
    """
    model = DecisionTreeClassifier(max_depth=m, criterion=c, min_weight_fraction_leaf=k)
    x, y, test_x, test_y = shuffled_data(data)
    model.fit(x, y)
    predictions = model.predict(test_x)
    # accuracy_score is documented as (y_true, y_pred); the arguments used to
    # be passed the other way round.
    return accuracy_score(test_y, predictions)

In [1524]:
# Sweep every (max_depth, criterion, min_weight_fraction_leaf) combination.
# Each combination is trained on 10 independent random splits; the mean and
# median of those 10 accuracies are stored under the parameter tuple.
param_grid = it.product(
    range(1, 10),
    ['gini', 'entropy', 'log_loss'],
    [0.0, 0.001, 0.01, 0.1],
)
for a in param_grid:
    res = [train_model(a[0], a[1], a[2]) for _ in range(10)]
    mean_acc = np.average(res)
    median_acc = np.median(res)
    accuracies[a] = [mean_acc, median_acc]


In [1525]:
# Print one (parameter tuple, [mean, median]) pair per line.
for params, scores in accuracies.items():
    print((params, scores))

((1, 'gini', 0.0), [0.7737430167597765, 0.7681564245810055])
((1, 'gini', 0.001), [0.7860335195530725, 0.7960893854748603])
((1, 'gini', 0.01), [0.7927374301675978, 0.8044692737430168])
((1, 'gini', 0.1), [0.7787709497206704, 0.770949720670391])
((1, 'entropy', 0.0), [0.7748603351955309, 0.776536312849162])
((1, 'entropy', 0.001), [0.7960893854748603, 0.7932960893854749])
((1, 'entropy', 0.01), [0.7793296089385475, 0.776536312849162])
((1, 'entropy', 0.1), [0.7865921787709498, 0.7877094972067039])
((1, 'log_loss', 0.0), [0.7893854748603351, 0.7877094972067039])
((1, 'log_loss', 0.001), [0.7854748603351955, 0.7793296089385475])
((1, 'log_loss', 0.01), [0.788826815642458, 0.7905027932960894])
((1, 'log_loss', 0.1), [0.8011173184357542, 0.8044692737430168])
((2, 'gini', 0.0), [0.7798882681564245, 0.776536312849162])
((2, 'gini', 0.001), [0.7955307262569832, 0.7877094972067039])
((2, 'gini', 0.01), [0.7871508379888269, 0.7877094972067039])
((2, 'gini', 0.1), [0.7698324022346368, 0.76256983

In [2797]:
# Repeatedly fit a depth-6 entropy tree on fresh random splits until one split
# yields a hold-out accuracy of at least TARGET_ACCURACY.
#
# NOTE(review): `acc` is the best score over many random splits, so it
# overstates the accuracy to expect on unseen data; treat it as a selection
# criterion, not as an unbiased estimate.
TARGET_ACCURACY = 0.9
# Upper bound on retries so the cell always terminates; the search used to be
# an unbounded `while` loop that would spin forever if no split reached the
# target.
MAX_ATTEMPTS = 10_000

acc = 0
model = None
for _attempt in range(MAX_ATTEMPTS):
    candidate = DecisionTreeClassifier(max_depth=6,criterion='entropy',min_weight_fraction_leaf=0.01)
    x, y, x_test, y_test = shuffled_data(data)
    candidate.fit(x,y)
    pre = candidate.predict(x_test)
    # accuracy_score is documented as (y_true, y_pred).
    candidate_acc = accuracy_score(y_test, pre)
    # Keep the best model seen so far so that `model` is always a fitted
    # estimator, even when the target is never reached.
    if model is None or candidate_acc > acc:
        model, acc = candidate, candidate_acc
    if acc >= TARGET_ACCURACY:
        break
else:
    print(f"No split reached {TARGET_ACCURACY} in {MAX_ATTEMPTS} attempts; keeping the best model found.")
acc

0.9050279329608939

In [2793]:
# Pair each PassengerId of the submission frame with the model's prediction.
# Both pieces are concatenated column-wise, i.e. aligned on their index.
predicted_survival = pd.DataFrame({"Survived": model.predict(submission_input)})
submission = pd.concat([submission_input.PassengerId, predicted_survival], axis=1)

In [2794]:
# Write the predictions to submission.csv (overwriting any existing file),
# without the DataFrame index column.
submission.to_csv("submission.csv", index=False)

In [2795]:
# Display the submission frame (PassengerId and predicted Survived).
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
