In [1]:
import numpy as np
import os
import pandas as pd
from sklearn import tree, model_selection

In [2]:
# Load the Titanic train and test CSV files into two DataFrames.
df_train, df_test = map(
    pd.read_csv,
    ('datasets/titanic/train.csv', 'datasets/titanic/test.csv'),
)

In [3]:
def cleanData(data):
    """Impute missing values and numerically encode ``Sex``, in place.

    Mutates ``data`` directly and returns ``None``:

    * ``Fare`` and ``Age``: missing entries are replaced by the median of the
      values present in that same column of ``data``.
    * ``Sex``: ``"male"`` becomes ``0`` and ``"female"`` becomes ``1``. Any
      other value is left untouched, so calling the function a second time on
      an already cleaned frame changes nothing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``Fare``, ``Age`` and ``Sex`` columns.
    """
    sex_codes = {"male": 0, "female": 1}

    for column in ("Fare", "Age"):
        # Series.median() skips NaN by default, so no explicit dropna() is needed.
        data[column] = data[column].fillna(data[column].median())

    # Assign a freshly mapped Series instead of writing ints into the string
    # column through .loc: that way pandas infers an integer dtype when every
    # value gets encoded, rather than keeping an object-dtype column that
    # propagates into the feature matrix taken with .values.
    data["Sex"] = data["Sex"].map(lambda value: sex_codes.get(value, value))

def writePrediction(prediction, name, passenger_ids = None):
    """Write predictions to a CSV with ``PassengerId`` and ``Survived`` columns.

    Parameters
    ----------
    prediction : array-like
        One predicted value per passenger, in the same order as the ids.
    name : str or path-like
        Destination of the CSV file. When it is a filesystem path, a missing
        parent directory is created first.
    passenger_ids : array-like, optional
        Ids to pair with ``prediction``. When omitted, the ``PassengerId``
        column of the notebook-level ``df_test`` frame is used.
    """
    if passenger_ids is None:
        # Default: fall back to the ids of the globally loaded test frame.
        passenger_ids = df_test["PassengerId"]
    PassengerId = np.array(passenger_ids).astype(int)

    # to_csv does not create directories; make sure the target folder exists
    # so a fresh checkout without the output folder does not fail here.
    if isinstance(name, (str, os.PathLike)):
        out_dir = os.path.dirname(os.fspath(name))
        if out_dir:
            os.makedirs(out_dir, exist_ok = True)

    solution = pd.DataFrame(prediction, PassengerId, columns = ["Survived"])
    solution.to_csv(name, index_label = ["PassengerId"])

In [4]:
# Clean both frames in place (fills missing Fare/Age, encodes gender as an int).
for frame in (df_train, df_test):
    cleanData(frame)

In [5]:
# Target vector: the "Survived" column of the training frame.
goal = df_train.loc[:, "Survived"].values

# Feature matrix: six predictor columns, in a fixed order.
factors = df_train.loc[
    :, ["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch"]
].values

In [6]:
# Build and train the tree in one expression; fit() hands back the fitted
# estimator, which is what gets bound to decision_tree.
decision_tree = tree.DecisionTreeClassifier(random_state = 1).fit(factors, goal)

# Feature importances first, then the score on the same data used for fitting.
print(decision_tree.feature_importances_)
print(decision_tree.score(factors, goal))

[0.10931463 0.23630162 0.31088095 0.27638921 0.04249874 0.02461487]
0.9797979797979798


In [7]:
# Predict on the test frame using the same six columns the tree was fitted on,
# then write the result to disk.
test_factors = df_test.loc[
    :, ["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch"]
].values
prediction = decision_tree.predict(test_factors)
writePrediction(name = "results/decision_tree.csv", prediction = prediction)

In [8]:
# Fix overfitting: fit a second tree whose depth is capped.
factor_names = ["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch"]
factors_two = df_train.loc[:, factor_names].values

decision_tree_two = tree.DecisionTreeClassifier(
    random_state = 1,
    max_depth = 7,
    min_samples_split = 2,
).fit(factors_two, goal)

# Feature importances first, then the score on the same data used for fitting.
print(decision_tree_two.feature_importances_)
print(decision_tree_two.score(factors_two, goal))

[0.16641327 0.14839983 0.47879072 0.14075294 0.05194614 0.01369711]
0.8787878787878788


In [9]:
# Cross-validated accuracy (cv=10) of the depth-capped tree on the training data.
scores = model_selection.cross_val_score(
    decision_tree_two,
    factors_two,
    goal,
    cv = 10,
    scoring = 'accuracy',
)
print(scores)
print(np.mean(scores))

[0.74444444 0.81111111 0.75280899 0.86516854 0.87640449 0.80898876
 0.83146067 0.75280899 0.83146067 0.86363636]
0.8138293042787426


In [10]:
# Select the test features through factor_names, the same list that was used
# to build factors_two for fitting decision_tree_two, so the train and test
# columns (and their order) cannot drift apart.
test_factors_two = df_test[factor_names].values
prediction_two = decision_tree_two.predict(test_factors_two)
writePrediction(prediction_two, "results/decision_tree_two.csv")