### Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:

In [47]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings as warn
import sys
import env
import numpy as np
import graphviz

sys.path.append(env.util_repo)

from acquire import get_titanic_data
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from graphviz import Graph

warn.filterwarnings("ignore")

### 1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [34]:
titanic_df = get_titanic_data()
titanic_df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [35]:
titanic_df.shape

(891, 14)

In [36]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    891 non-null    int64  
 1   passenger_id  891 non-null    int64  
 2   survived      891 non-null    int64  
 3   pclass        891 non-null    int64  
 4   sex           891 non-null    object 
 5   age           714 non-null    float64
 6   sibsp         891 non-null    int64  
 7   parch         891 non-null    int64  
 8   fare          891 non-null    float64
 9   embarked      889 non-null    object 
 10  class         891 non-null    object 
 11  deck          203 non-null    object 
 12  embark_town   889 non-null    object 
 13  alone         891 non-null    int64  
dtypes: float64(2), int64(7), object(5)
memory usage: 97.6+ KB


In [37]:
# Drop these columns due to redundancy, missing values, irrelevant
titanic_df.drop(columns=['Unnamed: 0', 'passenger_id', 'age', 'sibsp', 'parch', 'deck', 'embarked', 'class'], inplace=True)
titanic_df.head()

Unnamed: 0,survived,pclass,sex,fare,embark_town,alone
0,0,3,male,7.25,Southampton,0
1,1,1,female,71.2833,Cherbourg,0
2,1,3,female,7.925,Southampton,1
3,1,1,female,53.1,Southampton,0
4,0,3,male,8.05,Southampton,1


In [38]:
titanic_df.embark_town.value_counts()
titanic_df.fillna(value="Southampton", inplace=True)
titanic_df.embark_town.value_counts()

Southampton    646
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

In [39]:
sex_dummies = pd.get_dummies(titanic_df.sex, dummy_na=False, drop_first=False)
sex_dummies.head()

embark_dummies = pd.get_dummies(titanic_df.embark_town, dummy_na=False, drop_first=False)
embark_dummies.head()

titanic_df = pd.concat([titanic_df, sex_dummies, embark_dummies], axis=1)
titanic_df.head()

titanic_df.drop(columns=['sex', 'embark_town'], inplace=True)
titanic_df.head()

Unnamed: 0,survived,pclass,fare,alone,female,male,Cherbourg,Queenstown,Southampton
0,0,3,7.25,0,0,1,0,0,1
1,1,1,71.2833,0,1,0,1,0,0
2,1,3,7.925,1,1,0,0,0,1
3,1,1,53.1,0,1,0,0,0,1
4,0,3,8.05,1,0,1,0,0,1


In [89]:
# Split the data
train, test = train_test_split(titanic_df, test_size=.2, random_state=1414, stratify=titanic_df['survived'])
train, validate = train_test_split(train, test_size=.3, random_state=1414, stratify=train['survived'])

In [90]:
train.survived.mean()

0.38353413654618473

In [91]:
# Baseline prediction is that passenger did NOT survive (survived = 0)
train['baseline_survived'] = 0
train.head()

Unnamed: 0,survived,pclass,fare,alone,female,male,Cherbourg,Queenstown,Southampton,baseline_survived
824,0,3,39.6875,0,0,1,0,0,1,0
822,0,1,0.0,1,0,1,0,0,1,0
149,0,2,13.0,1,0,1,0,0,1,0
752,0,3,9.5,1,0,1,0,0,1,0
94,0,3,7.25,1,0,1,0,0,1,0


In [92]:
baseline_accuracy = (train.survived == train.baseline_survived).mean()
print(f"baseline accuracy:  {baseline_accuracy:.2%}")

baseline accuracy:  61.65%


### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [110]:
# Prepare inputs (X) and targets (y)
X_train = train.drop(columns=['survived', 'baseline_survived'])
y_train = train.survived

X_validate = validate.drop(columns='survived')
y_validate = validate.survived

X_test = test.drop(columns='survived')
y_test = test.survived

In [113]:
# Construct classification tree object
clf = DecisionTreeClassifier(max_depth=3, random_state=1414)

In [114]:
# Fit the model to the training data
clf = clf.fit(X_train, y_train)

In [115]:
# Graphviz visualization
dot_data = export_graphviz(clf, feature_names=X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data)

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

In [116]:
# Predictions
y_pred = clf.predict(X_train)
y_pred[0:14]

array([0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0])

In [117]:
# Probability of outcome
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:14]

array([[0.88546256, 0.11453744],
       [1.        , 0.        ],
       [0.88546256, 0.11453744],
       [0.88546256, 0.11453744],
       [0.88546256, 0.11453744],
       [0.31818182, 0.68181818],
       [0.31818182, 0.68181818],
       [0.62686567, 0.37313433],
       [0.31818182, 0.68181818],
       [0.        , 1.        ],
       [0.73076923, 0.26923077],
       [0.31818182, 0.68181818],
       [0.88546256, 0.11453744],
       [0.62686567, 0.37313433]])

### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [118]:
# Accuracy
print(f"Accuracy for training sample:  {clf.score(X_train, y_train):.2%}")

Accuracy for training sample:  83.73%


In [119]:
# Confusion matrix
pd.DataFrame(confusion_matrix(y_train, y_pred, labels=(0, 1)), index=['acutal died', 'actual survived'], columns=['predicted died', 'predicted survived'])

Unnamed: 0,predicted died,predicted survived
acutal died,285,22
actual survived,59,132


In [120]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.93      0.88       307
           1       0.86      0.69      0.77       191

    accuracy                           0.84       498
   macro avg       0.84      0.81      0.82       498
weighted avg       0.84      0.84      0.83       498



### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [122]:
# Validate set accuracy
print(f"Accuracy for validate sample:  {clf.score(X_validate, y_validate):.2%}")

Accuracy for validate sample:  77.10%


In [123]:
# Predict outcomes of validate sample with model
y_validate_pred = clf.predict(X_validate)

In [127]:
# Confusion matrix
pd.DataFrame(confusion_matrix(y_validate, y_validate_pred, labels=(0, 1)), index=['acutal died', 'actual survived'], columns=['predicted died', 'predicted survived'])

Unnamed: 0,predicted died,predicted survived
acutal died,110,22
actual survived,27,55


In [124]:
print(classification_report(y_validate, y_validate_pred))

              precision    recall  f1-score   support

           0       0.80      0.83      0.82       132
           1       0.71      0.67      0.69        82

    accuracy                           0.77       214
   macro avg       0.76      0.75      0.75       214
weighted avg       0.77      0.77      0.77       214



### 5. Run through steps 2-4 using a different max_depth value.

In [128]:
# Construct classification tree object
clf = DecisionTreeClassifier(max_depth=7, random_state=1414)

In [129]:
# Fit the model to the training data
clf = clf.fit(X_train, y_train)

In [130]:
# Graphviz visualization
dot_data = export_graphviz(clf, feature_names=X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data)

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

In [131]:
# Predictions
y_pred = clf.predict(X_train)
y_pred[0:14]

array([0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0])

In [132]:
# Accuracy
print(f"Accuracy for training sample:  {clf.score(X_train, y_train):.2%}")

Accuracy for training sample:  89.16%


In [133]:
# Confusion matrix
pd.DataFrame(confusion_matrix(y_train, y_pred, labels=(0, 1)), index=['acutal died', 'actual survived'], columns=['predicted died', 'predicted survived'])

Unnamed: 0,predicted died,predicted survived
acutal died,284,23
actual survived,31,160


In [134]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.93      0.91       307
           1       0.87      0.84      0.86       191

    accuracy                           0.89       498
   macro avg       0.89      0.88      0.88       498
weighted avg       0.89      0.89      0.89       498



In [135]:
# Validate set accuracy
print(f"Accuracy for validate sample:  {clf.score(X_validate, y_validate):.2%}")

Accuracy for validate sample:  73.83%


In [136]:
# Predict outcomes of validate sample with model
y_validate_pred = clf.predict(X_validate)

In [137]:
# Confusion matrix
pd.DataFrame(confusion_matrix(y_validate, y_validate_pred, labels=(0, 1)), index=['acutal died', 'actual survived'], columns=['predicted died', 'predicted survived'])

Unnamed: 0,predicted died,predicted survived
acutal died,100,32
actual survived,24,58


In [138]:
print(classification_report(y_validate, y_validate_pred))

              precision    recall  f1-score   support

           0       0.81      0.76      0.78       132
           1       0.64      0.71      0.67        82

    accuracy                           0.74       214
   macro avg       0.73      0.73      0.73       214
weighted avg       0.74      0.74      0.74       214



### 6. Which model performs better on your in-sample data?

<p>The model with a higher max_depth value performs better on training data

### 7. Which model performs best on your out-of-sample data, the validate set?

<p>The model with max_depth=3 performs better on the validate set.