In [674]:
import pandas as pd
import numpy as np
import random as rd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [675]:
data = pd.read_csv("train.csv")
submission_input = pd.read_csv("test.csv")

In [676]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [677]:
def clean(data):
    data.drop(['Cabin','Name','Ticket','Embarked','Fare'], axis=1, inplace=True)
    data.dropna()
    data.Age = data.Age.fillna(data.Age.median())
    data.Sex = pd.get_dummies(data.Sex, drop_first=True)


In [678]:
clean(submission_input)
clean(data)

In [679]:
train, test = train_test_split(data, test_size=0.2)
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch
0,1,0,3,1,22.0,1,0
1,2,1,1,0,38.0,1,0
2,3,1,3,0,26.0,0,0
3,4,1,1,0,35.0,1,0
4,5,0,3,1,35.0,0,0


In [680]:
accuracies = {}
models = []
y = train['Survived']
test_y = test['Survived']
x = train.drop(['Survived'],axis=1)
test_x = test.drop(['Survived'], axis=1)

In [681]:
def train_model(m,c,k):
    model = DecisionTreeClassifier(max_depth=m,criterion=c,min_weight_fraction_leaf=k, splitter="random")
    model.fit(x, y)
    pre = model.predict(test_x)
    res = accuracy_score(pre,test_y)
    accuracies[f"{m}, {c}, {k}"] = res
    return res

In [682]:
for i in range(1,10):
    for j in ['gini', 'entropy', 'log_loss']:
        for k in [0,0.001,0.01,0.02,0.03,0.1,0.5]:
            res = []
            for n in range(0,10):
                res.append(train_model(i,j,k))
            print(np.mean(res), np.median(res))

0.7821229050279329 0.7821229050279329
0.7821229050279329 0.7821229050279329
0.7821229050279329 0.7821229050279329
0.7821229050279329 0.7821229050279329
0.7821229050279329 0.7821229050279329
0.7821229050279329 0.7821229050279329
0.5865921787709497 0.5865921787709497
0.7821229050279329 0.7821229050279329
0.7821229050279329 0.7821229050279329
0.7821229050279329 0.7821229050279329
0.7821229050279329 0.7821229050279329
0.7821229050279329 0.7821229050279329
0.7821229050279329 0.7821229050279329
0.5865921787709497 0.5865921787709497
0.7821229050279329 0.7821229050279329
0.7821229050279329 0.7821229050279329
0.7821229050279329 0.7821229050279329
0.7821229050279329 0.7821229050279329
0.7821229050279329 0.7821229050279329
0.7821229050279329 0.7821229050279329
0.5865921787709497 0.5865921787709497
0.7642458100558658 0.7597765363128491
0.7687150837988825 0.7597765363128491
0.770949720670391 0.770949720670391
0.7798882681564245 0.7821229050279329
0.7687150837988825 0.7597765363128491
0.773184357541

In [683]:
for k in accuracies.items():
    print(k)

('1, gini, 0', 0.7821229050279329)
('1, gini, 0.001', 0.7821229050279329)
('1, gini, 0.01', 0.7821229050279329)
('1, gini, 0.02', 0.7821229050279329)
('1, gini, 0.03', 0.7821229050279329)
('1, gini, 0.1', 0.7821229050279329)
('1, gini, 0.5', 0.5865921787709497)
('1, entropy, 0', 0.7821229050279329)
('1, entropy, 0.001', 0.7821229050279329)
('1, entropy, 0.01', 0.7821229050279329)
('1, entropy, 0.02', 0.7821229050279329)
('1, entropy, 0.03', 0.7821229050279329)
('1, entropy, 0.1', 0.7821229050279329)
('1, entropy, 0.5', 0.5865921787709497)
('1, log_loss, 0', 0.7821229050279329)
('1, log_loss, 0.001', 0.7821229050279329)
('1, log_loss, 0.01', 0.7821229050279329)
('1, log_loss, 0.02', 0.7821229050279329)
('1, log_loss, 0.03', 0.7821229050279329)
('1, log_loss, 0.1', 0.7821229050279329)
('1, log_loss, 0.5', 0.5865921787709497)
('2, gini, 0', 0.7597765363128491)
('2, gini, 0.001', 0.7821229050279329)
('2, gini, 0.01', 0.7597765363128491)
('2, gini, 0.02', 0.7597765363128491)
('2, gini, 0.03

In [684]:
arr=[(a, b) for (a,b) in zip(np.asfarray(x),y)]
rd.shuffle(arr)
arr = np.transpose(arr)

  result = getattr(asarray(obj), method)(*args, **kwds)


In [685]:
x_test = np.transpose(np.reshape(np.concatenate(arr[0]), (len(arr[0]),6)))
x_test = pd.DataFrame(dict(zip(x.columns, x_test)))


In [686]:
y_test = pd.Series(arr[1],name="Survived", dtype=np.int64)

In [687]:
model = DecisionTreeClassifier(max_depth=6,criterion='entropy',min_weight_fraction_leaf=0.01)
model.fit(x,y)
pre = model.predict(x_test)
accuracy_score(pre,y_test)

0.8581460674157303

In [688]:

submission = pd.concat([submission_input.PassengerId,pd.DataFrame({"Survived":model.predict(submission_input)})], axis=1)

In [689]:
submission.to_csv("submission.csv", mode='w', index=False)

In [690]:
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
