In [318]:
import pandas as pd
import numpy as np
import itertools as it
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [319]:
# Read the training data and the rows that will be predicted for the submission.
data, submission_input = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

In [320]:
# Preview the first rows of the raw training frame, then print its row count.
print(data.head(), len(data["Parch"]), sep="\n")

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
891

In [321]:
def convert_cabin(x):
    """Encode a cabin identifier as an integer based on its first character.

    The result is the offset of the first character from ``'B'``, so a value
    starting with ``'A'`` gives -1, ``'B'`` gives 0, ``'C'`` gives 1, and so on.

    Args:
        x: A cabin string such as ``'C85'``, or a missing value (NaN, None,
            ``pd.NA``) or empty string.

    Returns:
        int: 0 when ``x`` is missing or empty, otherwise
        ``ord(x[0]) - ord('B')``.

    Note:
        Missing values and cabins starting with ``'B'`` both map to 0; this
        matches the original encoding and is kept for compatibility.
    """
    # Anything that is not a non-empty string counts as "missing". This covers
    # float NaN (the original `x != x` case) and also None, pd.NA and '',
    # which previously raised TypeError / IndexError.
    if not isinstance(x, str) or not x:
        return 0
    return ord(x[0]) - ord('B')

def convert_embarked(x):
    """Map an embarkation code to an integer.

    Returns 1 for ``'Q'``, 2 for ``'S'`` and 0 for every other value
    (including missing values).
    """
    for port, code in (('Q', 1), ('S', 2)):
        if x == port:
            return code
    return 0


In [322]:
def clean(data_frame: pd.DataFrame):
    """Prepare a passenger frame for modelling, modifying it in place.

    Steps:
      * drop the columns that are not used as features
        (Name, Ticket, PassengerId, Cabin, Embarked, Fare);
      * fill missing ``Age`` values with the median age of this frame;
      * encode ``Sex`` as an integer column: 1 for ``'male'``, 0 otherwise.

    No rows are removed, so the frame keeps its original length and index.
    The function can be called again on an already cleaned frame without
    raising or changing the values.

    Args:
        data_frame: Frame with at least ``Age`` and ``Sex`` columns.

    Returns:
        None. ``data_frame`` is mutated.
    """
    unused_columns = ['Name', 'Ticket', 'PassengerId', 'Cabin', 'Embarked', 'Fare']
    # errors='ignore' keeps a second run (columns already gone) from raising KeyError.
    data_frame.drop(columns=unused_columns, errors='ignore', inplace=True)

    # The former bare `data_frame.dropna()` discarded its result and was a no-op.
    # It is removed rather than "fixed": dropping rows would misalign the
    # predictions with the passenger ids saved before cleaning.
    data_frame['Age'] = data_frame['Age'].fillna(data_frame['Age'].median())

    # Explicit 0/1 encoding instead of assigning the DataFrame returned by
    # get_dummies(drop_first=True): same values (male -> 1), but the dtype does
    # not depend on the pandas version and it works when only one sex is present.
    # Skipped when the column is already numeric so a re-run does not zero it out.
    if not pd.api.types.is_numeric_dtype(data_frame['Sex']):
        data_frame['Sex'] = (data_frame['Sex'] == 'male').astype(int)
   


In [323]:
# Keep the passenger ids first: clean() removes the PassengerId column.
pId = submission_input["PassengerId"]
for frame in (submission_input, data):
    clean(frame)

In [324]:
# Preview the first rows of the cleaned training frame, then print its row count.
print(data.head(), len(data["Parch"]), sep="\n")

   Survived  Pclass  Sex   Age  SibSp  Parch
0         0       3    1  22.0      1      0
1         1       1    0  38.0      1      0
2         1       3    0  26.0      0      0
3         1       1    0  35.0      1      0
4         0       3    1  35.0      0      0
891


In [325]:
def shuffled_data(data, test_size=0.2, random_state=None):
    """Randomly split a frame into train/test features and labels.

    Args:
        data: DataFrame containing a ``Survived`` label column plus the
            feature columns.
        test_size: Passed to ``train_test_split``; 0.2 (the previous
            hard-coded value) by default.
        random_state: Passed to ``train_test_split``. Leave as ``None`` for a
            different split on every call (the previous behaviour), or set an
            int for a reproducible split.

    Returns:
        tuple: ``(train_x, train_y, test_x, test_y)`` where the ``*_x`` items
        are the frames without ``Survived`` and the ``*_y`` items are the
        ``Survived`` columns.
    """
    # train_test_split shuffles by default, so the former
    # `data.sample(frac=1)` pre-shuffle was redundant work.
    train, test = train_test_split(data, test_size=test_size, random_state=random_state)
    return (
        train.drop(columns=['Survived']),
        train['Survived'],
        test.drop(columns=['Survived']),
        test['Survived'],
    )

In [326]:
# Results of the hyper-parameter sweep; filled in by the search loop.
accuracies = dict()

In [327]:
def train_model(m,c,k):
    """Fit a decision tree on a fresh random split and return its hold-out accuracy.

    The split is taken from the module-level ``data`` frame via
    ``shuffled_data``, so every call trains and evaluates on different rows.

    Args:
        m: Value for ``DecisionTreeClassifier(max_depth=...)``.
        c: Value for ``DecisionTreeClassifier(criterion=...)``.
        k: Value for ``DecisionTreeClassifier(min_weight_fraction_leaf=...)``.

    Returns:
        The ``accuracy_score`` of the fitted model on the test part of the split.
    """
    model = DecisionTreeClassifier(max_depth=m, criterion=c, min_weight_fraction_leaf=k)
    train_x, train_y, test_x, test_y = shuffled_data(data)
    model.fit(train_x, train_y)
    predictions = model.predict(test_x)
    # accuracy_score is documented as (y_true, y_pred). The value is the same
    # either way for accuracy, but the documented order avoids surprises if the
    # metric is ever swapped for an asymmetric one.
    return accuracy_score(test_y, predictions)

In [328]:
# Try every (max_depth, criterion, min_weight_fraction_leaf) combination.
# Each one is trained 10 times on different random splits; the mean and
# median accuracy of those runs are stored under the parameter tuple.
depths = range(1, 10)
criteria = ['gini', 'entropy', 'log_loss']
leaf_fractions = [0.0, 0.001, 0.01, 0.1]
for params in it.product(depths, criteria, leaf_fractions):
    scores = []
    for _ in range(10):
        scores.append(train_model(*params))
    accuracies[params] = [np.average(scores), np.median(scores)]


In [329]:
# One line per parameter tuple: (params, [mean accuracy, median accuracy]).
for entry in accuracies.items():
    print(entry)

((1, 'gini', 0.0), [0.7849162011173185, 0.7793296089385475])
((1, 'gini', 0.001), [0.7793296089385475, 0.7793296089385475])
((1, 'gini', 0.01), [0.782122905027933, 0.7905027932960894])
((1, 'gini', 0.1), [0.7804469273743017, 0.776536312849162])
((1, 'entropy', 0.0), [0.777094972067039, 0.7681564245810055])
((1, 'entropy', 0.001), [0.7932960893854749, 0.7960893854748603])
((1, 'entropy', 0.01), [0.78268156424581, 0.7877094972067039])
((1, 'entropy', 0.1), [0.78268156424581, 0.7737430167597765])
((1, 'log_loss', 0.0), [0.788268156424581, 0.7932960893854749])
((1, 'log_loss', 0.001), [0.7893854748603352, 0.7877094972067039])
((1, 'log_loss', 0.01), [0.8, 0.8016759776536313])
((1, 'log_loss', 0.1), [0.7905027932960894, 0.7877094972067039])
((2, 'gini', 0.0), [0.7670391061452514, 0.7653631284916201])
((2, 'gini', 0.001), [0.7787709497206704, 0.7849162011173184])
((2, 'gini', 0.01), [0.7860335195530727, 0.7932960893854748])
((2, 'gini', 0.1), [0.7787709497206704, 0.776536312849162])
((2, 'en

In [392]:
# Refit the tree on fresh random splits until one split reaches the target
# hold-out accuracy, giving up after MAX_ATTEMPTS so the cell cannot spin forever.
# NOTE(review): keeping the first model that happens to score well on its own
# random hold-out is selection on the test split; the resulting `acc` is an
# optimistic estimate, not a measure of generalisation.
TARGET_ACCURACY = 0.9
MAX_ATTEMPTS = 10_000

acc = 0
model = DecisionTreeClassifier(max_depth=6, criterion='entropy', min_weight_fraction_leaf=0.01)
for _ in range(MAX_ATTEMPTS):
    x, y, x_test, y_test = shuffled_data(data)
    model.fit(x, y)
    pre = model.predict(x_test)
    # accuracy_score is documented as (y_true, y_pred).
    acc = accuracy_score(y_test, pre)
    if acc >= TARGET_ACCURACY:
        break
acc

0.9050279329608939

In [403]:
# Score the already fitted `model` on a new random split. It is deliberately
# not refitted here, so this checks the model kept by the previous cell.
# NOTE(review): the new test rows are drawn from the same frame the model was
# trained on, so some of them were likely seen during fitting and this score
# is still optimistic.
x, y, x_test, y_test = shuffled_data(data)
pre = model.predict(x_test)
# accuracy_score is documented as (y_true, y_pred).
accuracy_score(y_test, pre)

0.8547486033519553

In [None]:
# Pair each saved passenger id with the model's prediction for that row.
submission = pd.DataFrame(
    dict(
        PassengerId=pId,
        Survived=model.predict(submission_input),
    )
)

In [None]:
# Write the predictions to disk, replacing any existing file and omitting the index.
submission.to_csv(
    "submission.csv",
    index=False,
    mode='w',
)

In [None]:
# Show the submission frame as the cell output for a visual check.
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
