In [57]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from env import host, username, password
from pydataset import data
import os
from env import get_db_url
from acquire import get_titanic_data
from prepare import prep_titanic, titanic_split, impute_mean_age
import scipy

In [76]:
# Load the Titanic data from a local CSV file and preview the first rows.
# NOTE(review): the file carries an 'Unnamed: 0' column (dropped later in the
# notebook) that looks like a previously saved index — confirm how the CSV was written.
df = pd.read_csv('titanic_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [77]:
# Show the frame itself; the rendered output elides the middle rows.
df

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,1
887,887,887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,1
888,888,888,0,3,female,,1,2,23.4500,S,Third,,Southampton,0
889,889,889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,1


In [78]:

# One-hot encode the two categorical features. drop_first=True omits the first
# level of each, so that level is implied when all of its dummy columns are 0.
dummy1, dummy2 = (
    pd.get_dummies(df[[column]], drop_first=True)
    for column in ('embark_town', 'class')
)

# Boolean flag standing in for the string-valued `sex` column.
df['is_female'] = df['sex'].eq('female')

In [79]:
# Append the dummy columns to the right of the existing columns (aligned on the index).
df = pd.concat(objs=[df, dummy1, dummy2], axis='columns')

In [80]:
# Confirm that is_female and the dummy columns were appended.
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,is_female,embark_town_Queenstown,embark_town_Southampton,class_Second,class_Third
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0,False,0,1,0,1
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0,True,0,0,0,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1,True,0,1,0,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0,True,0,1,0,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1,False,0,1,0,1


In [81]:
# Reduce the frame to the target plus model-ready features.
df = df.drop(
    columns=[
        'Unnamed: 0', 'passenger_id',    # row identifiers
        'age', 'deck',                   # both show missing values in the preview
        'sex', 'class', 'embark_town',   # replaced by is_female and the dummy columns
        'pclass', 'embarked',            # appear to duplicate class / embark_town — TODO confirm
        'sibsp', 'parch',                # not used as features here
    ]
)

In [82]:
# Preview the remaining columns: the target plus the model features.
df.head()

Unnamed: 0,survived,fare,alone,is_female,embark_town_Queenstown,embark_town_Southampton,class_Second,class_Third
0,0,7.25,0,False,0,1,0,1
1,1,71.2833,0,True,0,0,0,0
2,1,7.925,1,True,0,1,0,1
3,1,53.1,0,True,0,1,0,0
4,0,8.05,1,False,0,1,0,1


In [87]:

# Remove any rows that still contain missing values. Reassigning the result
# (instead of inplace=True) avoids mutating an object that may be referenced
# elsewhere and keeps the step chainable.
df = df.dropna()
df.head()

Unnamed: 0,survived,fare,alone,is_female,embark_town_Queenstown,embark_town_Southampton,class_Second,class_Third
0,0,7.25,0,False,0,1,0,1
1,1,71.2833,0,True,0,0,0,0
2,1,7.925,1,True,0,1,0,1
3,1,53.1,0,True,0,1,0,0
4,0,8.05,1,False,0,1,0,1


In [88]:
from sklearn.model_selection import train_test_split

def train_validate_test_split(df, target, seed=123, test_size=0.2, validate_size=0.3):
    '''
    Split a dataframe into train, validate and test dataframes, stratifying
    both splits on the target column.

    Parameters
    ----------
    df : DataFrame to split.
    target : name of the target column, used for stratification.
    seed : integer random state applied to both splits.
    test_size : fraction of `df` held out as test (default 0.2).
    validate_size : fraction of the remaining (non-test) rows held out as
        validate (default 0.3).

    With the defaults, test is 20% of the original dataset, validate is
    .30*.80 = 24% and train is .70*.80 = 56%.

    Returns
    -------
    train, validate, test dataframes, in that order.
    '''
    # First carve off the test set from the full data.
    train_validate, test = train_test_split(
        df,
        test_size=test_size,
        random_state=seed,
        stratify=df[target],
    )
    # Then split what is left into train and validate.
    train, validate = train_test_split(
        train_validate,
        test_size=validate_size,
        random_state=seed,
        stratify=train_validate[target],
    )
    return train, validate, test

In [89]:
# Stratified three-way split on the target column.
train, validate, test = train_validate_test_split(df, target='survived', seed=123)

# For each partition, X holds every column except the target and y is the
# target as a Series.
X_train, y_train = train.drop(columns=['survived']), train['survived']
X_validate, y_validate = validate.drop(columns=['survived']), validate['survived']
X_test, y_test = test.drop(columns=['survived']), test['survived']

In [90]:
# Sanity check: (rows, columns) of each partition, in train/validate/test order.
tuple(part.shape for part in (train, validate, test))

((498, 8), (214, 8), (179, 8))

In [91]:
# Baseline: always predict the most frequent class in the training target.
# Deriving it from the data (rather than hard-coding 0) keeps the baseline
# correct if the split or the data changes; mode() breaks ties by taking the
# smallest label.
train['baseline_prediction'] = train['survived'].mode()[0]
accuracy_score(train['survived'], train['baseline_prediction'])

0.6164658634538153

In [92]:
# Fit an unconstrained decision tree. random_state is fixed (same seed as the
# data split) because the tree permutes features at each split, so tied splits
# can otherwise resolve differently from run to run.
model1 = DecisionTreeClassifier(random_state=123)
model1.fit(X_train, y_train)

# Compare in-sample and out-of-sample accuracy; a large gap indicates overfitting.
print(f'training score: {model1.score(X_train, y_train):.2%}')
print(f'validate score: {model1.score(X_validate, y_validate):.2%}')

training score: 93.78%
validate score: 76.64%


In [95]:
# Store model1's predictions for the training rows next to the actual labels
# so individual hits and misses can be inspected.
train['model1'] = model1.predict(X_train)
train.head()

Unnamed: 0,survived,fare,alone,is_female,embark_town_Queenstown,embark_town_Southampton,class_Second,class_Third,baseline_prediction,model1
583,0,40.125,1,False,0,0,0,0,0,0
165,1,20.525,0,False,0,1,0,1,0,0
50,0,39.6875,0,False,0,1,0,1,0,0
259,1,26.0,0,True,0,1,1,0,0,1
306,1,110.8833,1,True,0,0,0,0,0,1


In [96]:
# Training-set predictions as a standalone array, used by the evaluation cells.
y_pred = model1.predict(X_train)

In [97]:
# Accuracy on the validate set via the estimator's built-in score() method.
model1.score(X_validate,y_validate)

0.7663551401869159

In [98]:
# Training-set confusion matrix; scikit-learn puts actual labels on the rows
# and predicted labels on the columns.
confusion_matrix(y_train, y_pred)

array([[305,   2],
       [ 29, 162]])

In [99]:
# Class counts of the training target (0 = did not survive, 1 = survived).
y_train.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [103]:
# Normalized confusion matrix: rows are actual labels, columns are predicted
# labels, and each cell is that pair's share of all training rows.
pd.crosstab(index=y_train, columns=y_pred, normalize='all')

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.61245,0.004016
1,0.058233,0.325301


In [101]:
# Per-class precision, recall and F1 on the training set. The report is a
# formatted string, so print() is used to render its line breaks.
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95       307
           1       0.99      0.85      0.91       191

    accuracy                           0.94       498
   macro avg       0.95      0.92      0.93       498
weighted avg       0.94      0.94      0.94       498

