# Decision Tree Exercises

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import acquire 
import prepare

### 1. What is your baseline prediction? What is your baseline accuracy? Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

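#### - My baseline prediction is the most prevalent class in the training data: `survived = 0` (did not survive). My initial hypothesis is that men had a higher mortality rate than women in the sinking of the Titanic.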

In [2]:
# Load the Titanic passenger data through the `acquire` helper module imported above.
df = acquire.get_titanic_data()

In [3]:
# Remove the `deck` column before modeling
# (presumably because it is mostly missing — TODO confirm).
df = df.drop(columns = 'deck')

In [4]:
# `Series.mode()` returns a Series (indexed 0..k-1), and `fillna` aligns a
# Series argument on index labels, so passing it directly leaves missing
# values at every other row label unfilled. Use the scalar first mode instead.
df["embark_town"] = df["embark_town"].fillna(df["embark_town"].mode().iloc[0])

In [5]:
# Impute missing ages with the median age of the full data set.
df["age"] = df["age"].fillna(df["age"].median())

In [6]:
# One-hot encode the two categorical columns. `drop_first` takes a single
# boolean; the previous list argument only worked because a non-empty list is
# truthy, so pass True explicitly.
dummy_df = pd.get_dummies(df[['sex', 'embark_town']], dummy_na=False, drop_first=True)

# Replace the original string columns with their encoded counterparts.
df = df.drop(columns=['sex', 'embark_town'])
df = pd.concat([df, dummy_df], axis=1)

df.head()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,embarked,class,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,22.0,1,0,7.25,S,Third,0,1,0,1
1,1,1,1,38.0,1,0,71.2833,C,First,0,0,0,0
2,2,1,3,26.0,0,0,7.925,S,Third,1,0,0,1
3,3,1,1,35.0,1,0,53.1,S,First,0,0,0,1
4,4,0,3,35.0,0,0,8.05,S,Third,1,1,0,1


In [7]:
# Drop the string columns `embarked` and `class`; they appear to duplicate
# `pclass` and the `embark_town_*` dummies (TODO confirm).
# `columns=` already identifies the axis, so the extra `axis=1` was redundant,
# and reassigning avoids mutating the frame in place.
df = df.drop(columns=['embarked', 'class'])
df

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,22.0,1,0,7.2500,0,1,0,1
1,1,1,1,38.0,1,0,71.2833,0,0,0,0
2,2,1,3,26.0,0,0,7.9250,1,0,0,1
3,3,1,1,35.0,1,0,53.1000,0,0,0,1
4,4,0,3,35.0,0,0,8.0500,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,27.0,0,0,13.0000,1,1,0,1
887,887,1,1,19.0,0,0,30.0000,1,0,0,1
888,888,0,3,28.0,1,2,23.4500,0,0,0,1
889,889,1,1,26.0,0,0,30.0000,1,1,0,0


In [8]:
# Preview the first rows of the prepared frame.
df.head()

Unnamed: 0,passenger_id,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,0,3,22.0,1,0,7.25,0,1,0,1
1,1,1,1,38.0,1,0,71.2833,0,0,0,0
2,2,1,3,26.0,0,0,7.925,1,0,0,1
3,3,1,1,35.0,1,0,53.1,0,0,0,1
4,4,0,3,35.0,0,0,8.05,1,1,0,1


In [9]:
# First hold out 20% of the rows as the test set, then carve 30% of the
# remainder out as the validate set. Both splits are stratified on the target
# and share the same seed.
train_validate, test = train_test_split(
    df,
    test_size=.2,
    random_state=123,
    stratify=df.survived,
)
train, validate = train_test_split(
    train_validate,
    test_size=.3,
    random_state=123,
    stratify=train_validate.survived,
)


In [10]:
# Separate the features (X) from the target `survived` (y) for each split.
# NOTE(review): `passenger_id` remains among the features — confirm whether an
# identifier column should be used for modeling.
X_train, y_train = train.drop(columns="survived"), train["survived"]
X_validate, y_validate = validate.drop(columns="survived"), validate["survived"]
X_test, y_test = test.drop(columns="survived"), test["survived"]

In [11]:
# Peek at the first ten training labels.
y_train.head(10)

583    0
165    1
50     0
259    1
306    1
308    0
314    0
883    0
459    0
180    0
Name: survived, dtype: int64

In [12]:
# Class balance of the target in the training split.
train["survived"].value_counts()

0    307
1    191
Name: survived, dtype: int64

In [13]:
# The baseline model always predicts the most prevalent class in train.
# `mode()` returns a Series, so take its first entry as the predicted class
# instead of hard-coding 0.
base = y_train.mode()
baseline_class = base.iloc[0]

# Boolean mask: True wherever the constant baseline prediction is correct.
base_prediction = (y_train == baseline_class)
base_accuracy = base_prediction.mean()

round(base_accuracy, 2)

0.62

# question 2

In [43]:
# Make the model
# Build a depth-1 decision tree and fit it on the training split only.
tree1 = DecisionTreeClassifier(max_depth=1, random_state=123).fit(X_train, y_train)

# Predictions on the training data, used for the in-sample evaluation below.
y_predictions = tree1.predict(X_train)


In [44]:
# Mean accuracy of the fitted tree on the training split, to two decimals.
print(f'Accuracy of Decision Tree classifier on training set: {tree1.score(X_train, y_train):.2f}')

Accuracy of Decision Tree classifier on training set: 0.80


In [45]:
# Confusion table: rows are the actual labels, columns are the predictions.
pd.crosstab(index=y_train, columns=y_predictions)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,265,42
1,58,133


In [46]:
# Text summary of per-class precision, recall, f1-score and support on train.
print(classification_report(y_true=y_train, y_pred=y_predictions))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.76      0.70      0.73       191

    accuracy                           0.80       498
   macro avg       0.79      0.78      0.78       498
weighted avg       0.80      0.80      0.80       498



In [47]:
# Same report as a dictionary, rendered as a DataFrame for easier reading.
report = classification_report(
    y_true=y_train,
    y_pred=y_predictions,
    output_dict=True,
)
pd.DataFrame(report)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.820433,0.76,0.799197,0.790217,0.797255
recall,0.863192,0.696335,0.799197,0.779764,0.799197
f1-score,0.84127,0.726776,0.799197,0.784023,0.797358
support,307.0,191.0,0.799197,498.0,498.0


## 4 

In [52]:
# Derive the confusion-matrix counts from the predictions instead of copying
# them by hand from a printed table, so they stay correct if the data or the
# model changes.
# Class 0 (did not survive) is treated as the "positive" class, matching the
# convention of the counts previously hard-coded here.
# With labels=[0, 1] the matrix rows are actual classes and the columns are
# predicted classes, so row-major order is: TP, FN, FP, TN.
TP, FN, FP, TN = (
    int(count)
    for count in confusion_matrix(y_train, y_predictions, labels=[0, 1]).ravel()
)
ALL = TP + FP + FN + TN

In [53]:
# Evaluation metrics computed by hand from the confusion-matrix counts.
# "Positive" and "negative" follow whichever class TP/FP/FN/TN were defined for.
support_pos = TP + FN   # number of actual positives
support_neg = FP + TN   # number of actual negatives

accuracy = (TP + TN) / ALL
precision = TP / (TP + FP)
recall = TP / support_pos
tp_rate = recall                 # true positive rate is the same as recall
fp_rate = FP / support_neg
fn_rate = FN / (FN + TP)
tn_rate = TN / (TN + FP)
f1_score = 2*(precision*recall)/(precision+recall)

# Print each metric as "label value", in the original order.
for label, value in [
    ('The accuracy is ', accuracy),
    ('the precision is ', precision),
    ('the recall is ', recall),
    ('the true positive rate is', tp_rate),
    ('the false positive rate is', fp_rate),
    ('the false negative rate is', fn_rate),
    ('the true negative rate is ', tn_rate),
    ('the f1 score is ', f1_score),
    ('support positive :', support_pos),
    ('support neg: ', support_neg),
]:
    print(label, value)

The accuracy is  0.7991967871485943
the precision is  0.8204334365325078
the recall is  0.8631921824104235
the true positive rate is 0.8631921824104235
the false positive rate is 0.3036649214659686
the false negative rate is 0.13680781758957655
the true negative rate is  0.6963350785340314
the f1 score is  0.8412698412698413
support positive : 307
support neg:  191


# 5 

In [58]:
# Fit one tree per max_depth from 2 through 20 and print its classification
# report for the training split.
# NOTE(review): this loop rebinds `tree`, `y_predictions` and `report`, so after
# it runs those names describe the deepest tree rather than `tree1`.
for i in range(2, 21):
    tree = DecisionTreeClassifier(max_depth=i, random_state=123).fit(X_train, y_train)
    y_predictions = tree.predict(X_train)
    report = classification_report(y_train, y_predictions, output_dict=True)

    # Header line, report table, then a blank separator line.
    print(f"Tree with max depth of {i}", pd.DataFrame(report), "", sep="\n")


Tree with max depth of 2
                    0           1  accuracy   macro avg  weighted avg
precision    0.820433    0.760000  0.799197    0.790217      0.797255
recall       0.863192    0.696335  0.799197    0.779764      0.799197
f1-score     0.841270    0.726776  0.799197    0.784023      0.797358
support    307.000000  191.000000  0.799197  498.000000    498.000000

Tree with max depth of 3
                    0           1  accuracy   macro avg  weighted avg
precision    0.829341    0.817073  0.825301    0.823207      0.824636
recall       0.902280    0.701571  0.825301    0.801925      0.825301
f1-score     0.864275    0.754930  0.825301    0.809602      0.822337
support    307.000000  191.000000  0.825301  498.000000    498.000000

Tree with max depth of 4
                    0           1  accuracy   macro avg  weighted avg
precision    0.853211    0.836257   0.84739    0.844734      0.846709
recall       0.908795    0.748691   0.84739    0.828743      0.847390
f1-score     

### 6. Models with a max depth greater than 13 reach more than 99% accuracy on the training set

In [59]:
# Compare in-sample (train) and out-of-sample (validate) accuracy for trees
# with max_depth from 2 through 24. A growing gap between the two suggests
# overfitting.
metrics = []

for i in range(2, 25):
    tree = DecisionTreeClassifier(max_depth=i, random_state=123).fit(X_train, y_train)

    metrics.append({
        "max_depth": i,
        "train_accuracy": tree.score(X_train, y_train),
        "validate_accuracy": tree.score(X_validate, y_validate),
    })

# NOTE(review): `df` is rebound here from the Titanic data to the metrics
# table; earlier cells that use `df` cannot be re-run after this one.
df = pd.DataFrame(metrics)
df["difference"] = df["train_accuracy"] - df["validate_accuracy"]
df


Unnamed: 0,max_depth,train_accuracy,validate_accuracy,difference
0,2,0.799197,0.761682,0.037515
1,3,0.825301,0.799065,0.026236
2,4,0.84739,0.78972,0.05767
3,5,0.861446,0.771028,0.090418
4,6,0.875502,0.780374,0.095128
5,7,0.89759,0.771028,0.126562
6,8,0.923695,0.780374,0.143321
7,9,0.951807,0.766355,0.185452
8,10,0.961847,0.766355,0.195492
9,11,0.977912,0.771028,0.206884
