In [71]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd 

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

In [72]:
import acquire

### Decicion Tree with the Titanic Data

In [73]:
# Aquire: 

In [74]:
titanic_df = acquire.get_titanic_data()
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [75]:
# Preoare: 

In [76]:
def prep_titanic(titanic_df):
    '''
    This function will clean the titanic data...
    '''
    titanic_df = titanic_df.drop_duplicates()
    cols_to_drop = ['deck', 'embarked', 'class', 'age']
    titanic_df = titanic_df.drop(columns=cols_to_drop)
    titanic_df.embark_town = titanic_df.embark_town.fillna(value=titanic_df.embark_town.mode())
    dummy_df = pd.get_dummies(titanic_df[['sex', 'embark_town']], dummy_na=False, drop_first=[True, True])
    titanic_df = pd.concat([titanic_df, dummy_df], axis=1)
    return titanic_df

In [77]:
titanic_df = prep_titanic(titanic_df)
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,male,1,0,7.25,Southampton,0,1,0,1
1,1,1,1,female,1,0,71.2833,Cherbourg,0,0,0,0
2,2,1,3,female,0,0,7.925,Southampton,1,0,0,1
3,3,1,1,female,1,0,53.1,Southampton,0,0,0,1
4,4,0,3,male,0,0,8.05,Southampton,1,1,0,1


In [78]:
titanic_df = titanic_df.drop(columns=['sex', 'embark_town'])

In [79]:
titanic_df.shape

(891, 10)

In [80]:
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,1,0,7.25,0,1,0,1
1,1,1,1,1,0,71.2833,0,0,0,0
2,2,1,3,0,0,7.925,1,0,0,1
3,3,1,1,1,0,53.1,0,0,0,1
4,4,0,3,0,0,8.05,1,1,0,1


In [81]:
# Split:

In [82]:
def split_data(titanic_df):
    '''
    Takes in a dataframe and return train, validate, test subset dataframes
    '''
    train, test = train_test_split(titanic_df, test_size = .2, random_state=123, stratify=titanic_df.survived)
    train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train.survived)
    return train, validate, test

In [83]:
train, validate, test = split_data(titanic_df)

In [84]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,583,0,1,0,0,40.125,1,1,0,0
165,165,1,3,0,2,20.525,0,1,0,1
50,50,0,3,4,1,39.6875,0,1,0,1
259,259,1,2,0,1,26.0,0,0,0,1
306,306,1,1,0,0,110.8833,1,0,0,0


In [85]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

x_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [86]:
train.shape

(498, 10)

In [87]:
validate.shape

(214, 10)

In [88]:
test.shape

(179, 10)

In [89]:
X_train.shape

(498, 9)

1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [90]:
# baseline prediction = most common value

In [91]:
baseline = y_train.mode()

In [92]:
baseline

0    0
dtype: int64

In [93]:
match_bsl_prediction = y_train == 0

In [94]:
baseline_accuracy = match_bsl_prediction.mean()

In [95]:
baseline_accuracy

0.6164658634538153

In [96]:
# basline accuracy = 62%

2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)


In [97]:
tree1_clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [98]:
tree1_clf = tree1_clf.fit(X_train, y_train)

In [100]:
#import graphviz
#from graphviz import Graph

#dot_data = export_graphviz(tree1_clf, feature_names= X_train.columns, class_names=tree1_clf.classes_, rounded=True, filled=True, out_file=None)
#graph = graphviz.Source(dot_data) 

#graph.render('titanic_decision_tree', view=True)

In [101]:
y_pred = tree1_clf.predict(X_train)
y_pred[0:5]

array([0, 0, 0, 1, 1])

In [104]:
y_pred_proba = tree1_clf.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.62222222, 0.37777778],
       [0.62222222, 0.37777778],
       [0.89285714, 0.10714286],
       [0.14814815, 0.85185185],
       [0.        , 1.        ]])

3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.


In [108]:
# accuracy:

In [105]:
print('Accuracy of Decision Tree 1 classifier on training set: {:.2f}'
      .format(tree1_clf.score(X_train, y_train)))

Accuracy of Decision Tree 1 classifier on training set: 0.82


In [106]:
# confusion matrix:

In [107]:
confusion_matrix(y_train, y_pred)

array([[274,  33],
       [ 56, 135]])

In [110]:
y_train.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [111]:
labels = sorted(y_train.unique())
print('Actual on the left, predicted on the top')
pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Actual on the left, predicted on the top


Unnamed: 0,0,1
0,274,33
1,56,135


In [109]:
# classification report: 

In [116]:
#print(classification_report(y_train, y_pred))

In [117]:
class_report = classification_report(y_train, y_pred, output_dict=(True))
print("Tree1 depth")
pd.DataFrame(class_report)

Tree1 depth


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.830303,0.803571,0.821285,0.816937,0.820051
recall,0.892508,0.706806,0.821285,0.799657,0.821285
f1-score,0.860283,0.752089,0.821285,0.806186,0.818787
support,307.0,191.0,0.821285,498.0,498.0


4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.



In [118]:
class_report = classification_report(y_train, y_pred, output_dict=(True))
print("Tree1 depth")
pd.DataFrame(class_report)

Tree1 depth


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.830303,0.803571,0.821285,0.816937,0.820051
recall,0.892508,0.706806,0.821285,0.799657,0.821285
f1-score,0.860283,0.752089,0.821285,0.806186,0.818787
support,307.0,191.0,0.821285,498.0,498.0


5. Run through steps 2-4 using a different max_depth value.



In [122]:
for i in range(4, 11):
    tree =  DecisionTreeClassifier(max_depth=3, random_state=123)
    tree1_clf.fit(X_train, y_train)
    y_pred = tree1_clf.predict(X_train)
    class_report = classification_report(y_train, y_pred, output_dict=(True))
    print(f"Tree with max depth of {i}")
    print(pd.DataFrame(class_report))
    print()

Tree with max depth of 4
                    0           1  accuracy   macro avg  weighted avg
precision    0.830303    0.803571  0.821285    0.816937      0.820051
recall       0.892508    0.706806  0.821285    0.799657      0.821285
f1-score     0.860283    0.752089  0.821285    0.806186      0.818787
support    307.000000  191.000000  0.821285  498.000000    498.000000

Tree with max depth of 5
                    0           1  accuracy   macro avg  weighted avg
precision    0.830303    0.803571  0.821285    0.816937      0.820051
recall       0.892508    0.706806  0.821285    0.799657      0.821285
f1-score     0.860283    0.752089  0.821285    0.806186      0.818787
support    307.000000  191.000000  0.821285  498.000000    498.000000

Tree with max depth of 6
                    0           1  accuracy   macro avg  weighted avg
precision    0.830303    0.803571  0.821285    0.816937      0.820051
recall       0.892508    0.706806  0.821285    0.799657      0.821285
f1-score     

6. Which model performs better on your in-sample data?



7. Which model performs best on your out-of-sample data, the validate set?

### Decicion Tree with the Telco Data

1. Work through these same exercises using the Telco dataset.

2. Experiment with this model on other datasets with a higher number of output classes.

### Decicion Tree with the Other Data