# Decision Tree Exercises

In [1]:
#ignore warnings
import warnings
warnings.filterwarnings("ignore")


#calculations
import numpy as np
import pandas as pd 

#dataset
from pydataset import data

#sklearn stuff 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

#graphing
import matplotlib.pyplot as plt
import seaborn as sns

#acquire and clean data
import acquire 
import prepare

### 1 What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

#### - my baseline prediction is that men had a higher mortality rate than women in the sinking of the titanic

In [2]:
df = acquire.get_titanic_data()

In [3]:
df = df.drop(columns = ['deck', 'passenger_id'])

In [4]:
df.embark_town = df.embark_town.fillna(value=df.embark_town.mode())

In [5]:
df.age = df.age.fillna(value=df.age.median())

In [6]:
dummy_df = pd.get_dummies(df[['sex', 'embark_town']] , dummy_na = False, drop_first = [True, True])

df = df.drop(columns = ['sex', 'embark_town'])

df = pd.concat([df, dummy_df] , axis = 1)

df.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,embarked,class,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,22.0,1,0,7.25,S,Third,0,1,0,1
1,1,1,38.0,1,0,71.2833,C,First,0,0,0,0
2,1,3,26.0,0,0,7.925,S,Third,1,0,0,1
3,1,1,35.0,1,0,53.1,S,First,0,0,0,1
4,0,3,35.0,0,0,8.05,S,Third,1,1,0,1


In [7]:
df.drop(columns = ['embarked' , 'class'] , inplace = True, axis = 1)
df

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,22.0,1,0,7.2500,0,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0,0
2,1,3,26.0,0,0,7.9250,1,0,0,1
3,1,1,35.0,1,0,53.1000,0,0,0,1
4,0,3,35.0,0,0,8.0500,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.0,0,0,13.0000,1,1,0,1
887,1,1,19.0,0,0,30.0000,1,0,0,1
888,0,3,28.0,1,2,23.4500,0,0,0,1
889,1,1,26.0,0,0,30.0000,1,1,0,0


In [8]:
df.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,22.0,1,0,7.25,0,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0,0
2,1,3,26.0,0,0,7.925,1,0,0,1
3,1,1,35.0,1,0,53.1,0,0,0,1
4,0,3,35.0,0,0,8.05,1,1,0,1


In [9]:
train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df.survived)
train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train.survived)


In [10]:
X_train = train.drop(columns=["survived"])
y_train = train.survived

X_validate = validate.drop(columns=["survived"])
y_validate = validate.survived

X_test = test.drop(columns=["survived"])
y_test = test.survived

In [11]:
X_train.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,1,36.0,0,0,40.125,1,1,0,0
165,3,9.0,0,2,20.525,0,1,0,1
50,3,7.0,4,1,39.6875,0,1,0,1
259,2,50.0,0,1,26.0,0,0,0,1
306,1,28.0,0,0,110.8833,1,0,0,0


In [12]:
train.survived.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [13]:
base = y_train.mode()
base_prediction = (y_train == 0)
base_accuracy = base_prediction.mean()

round (base_accuracy, 2)

0.62

# question 2

In [14]:
# Make the model
tree1 = DecisionTreeClassifier(max_depth=1, random_state=123)

# Fit the model (on train and only train)
tree1 = tree1.fit(X_train, y_train)

# Use the model
# We'll evaluate the model's performance on train, first
y_predictions = tree1.predict(X_train)


In [15]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(tree1.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.80


In [16]:
pd.crosstab(y_train, y_predictions)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,265,42
1,58,133


In [17]:
print (classification_report(y_train, y_predictions))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.76      0.70      0.73       191

    accuracy                           0.80       498
   macro avg       0.79      0.78      0.78       498
weighted avg       0.80      0.80      0.80       498



In [18]:
report = classification_report(y_train, y_predictions, output_dict = True)
pd.DataFrame(report)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.820433,0.76,0.799197,0.790217,0.797255
recall,0.863192,0.696335,0.799197,0.779764,0.799197
f1-score,0.84127,0.726776,0.799197,0.784023,0.797358
support,307.0,191.0,0.799197,498.0,498.0


## 4 

In [19]:
TP = 265
FP = 58
FN = 42
TN = 133
ALL = TP + FP + FN + TN

In [20]:
accuracy = (TP+TN)/ALL
precision = TP / (TP + FP)
recall = TP / (TP + FN)
tp_rate = TP / (TP+FN)
fp_rate = FP / (FP+TN)
fn_rate = FN / (FN+TP)
tn_rate = TN / (TN+FP)
f1_score = 2*(precision*recall)/(precision+recall)
support_pos = TP + FN
support_neg = FP + TN
print ('The accuracy is ' , accuracy)
print('the precision is ', precision)
print ('the recall is ' , recall)
print('the true positive rate is' , tp_rate)
print('the false positive rate is' , fp_rate)
print('the false negative rate is' , fn_rate)
print ('the true negative rate is ', tn_rate)
print('the f1 score is ', f1_score)
print( 'support positive :' , support_pos)
print('support neg: ' , support_neg)

The accuracy is  0.7991967871485943
the precision is  0.8204334365325078
the recall is  0.8631921824104235
the true positive rate is 0.8631921824104235
the false positive rate is 0.3036649214659686
the false negative rate is 0.13680781758957655
the true negative rate is  0.6963350785340314
the f1 score is  0.8412698412698413
support positive : 307
support neg:  191


# 5 

In [21]:
for i in range(2, 21):
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)
    tree = tree.fit(X_train, y_train)
    y_predictions = tree.predict(X_train)

    report = classification_report(y_train, y_predictions, output_dict=True)
    print(f"Tree with max depth of {i}")
    print(pd.DataFrame(report))
    print()


Tree with max depth of 2
                    0           1  accuracy   macro avg  weighted avg
precision    0.820433    0.760000  0.799197    0.790217      0.797255
recall       0.863192    0.696335  0.799197    0.779764      0.799197
f1-score     0.841270    0.726776  0.799197    0.784023      0.797358
support    307.000000  191.000000  0.799197  498.000000    498.000000

Tree with max depth of 3
                    0           1  accuracy   macro avg  weighted avg
precision    0.829341    0.817073  0.825301    0.823207      0.824636
recall       0.902280    0.701571  0.825301    0.801925      0.825301
f1-score     0.864275    0.754930  0.825301    0.809602      0.822337
support    307.000000  191.000000  0.825301  498.000000    498.000000

Tree with max depth of 4
                    0           1  accuracy   macro avg  weighted avg
precision    0.831858    0.842767  0.835341    0.837313      0.836042
recall       0.918567    0.701571  0.835341    0.810069      0.835341
f1-score     

### 6. model of more than 13 produces more than 99% accuracy

In [22]:
metrics = []

for i in range(2, 25):

    tree = DecisionTreeClassifier(max_depth=i, random_state=123)


    tree = tree.fit(X_train, y_train)


    in_sample_accuracy = tree.score(X_train, y_train)
    
    out_of_sample_accuracy = tree.score(X_validate, y_validate)

    output = {
        "max_depth": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
df = pd.DataFrame(metrics)
df["difference"] = df.train_accuracy - df.validate_accuracy
df


Unnamed: 0,max_depth,train_accuracy,validate_accuracy,difference
0,2,0.799197,0.761682,0.037515
1,3,0.825301,0.799065,0.026236
2,4,0.835341,0.794393,0.040949
3,5,0.853414,0.799065,0.054348
4,6,0.865462,0.78972,0.075742
5,7,0.883534,0.780374,0.10316
6,8,0.899598,0.78972,0.109879
7,9,0.917671,0.799065,0.118605
8,10,0.937751,0.785047,0.152704
9,11,0.955823,0.813084,0.142739


# random forest exercises

## 1 Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [23]:
base = y_train.mode()

matches_base = y_train == 0 

base_accuracy = matches_base.mean()

round(base_accuracy, 2)

0.62

In [24]:
random_forest = RandomForestClassifier(max_depth = 1, random_state=123)

random_forest.fit(X_train, y_train)

y_predictions = random_forest.predict(X_train)


report = classification_report(y_train, y_predictions, output_dict = True)

pd.DataFrame(report)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.773481,0.801471,0.781124,0.787476,0.784216
recall,0.912052,0.570681,0.781124,0.741366,0.781124
f1-score,0.83707,0.666667,0.781124,0.751868,0.771715
support,307.0,191.0,0.781124,498.0,498.0


# K- Nearest Neighbor Exercises

### 1 Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)


In [144]:
knn = KNeighborsClassifier(n_neighbors = 1, weights = 'uniform')

In [145]:
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=1)

In [146]:
y_pred = knn.predict(X_train)

In [147]:
y_pred_proba = knn.predict_proba(X_train)

In [148]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.99


# 2 Evaluate your results using the model score, confusion matrix, and classification report.

In [155]:
print(confusion_matrix(y_train, y_pred))


[[306   1]
 [  2 189]]


In [150]:
report = (classification_report(y_train, y_pred, output_dict = True))
pd.DataFrame(report)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.993506,0.994737,0.993976,0.994122,0.993978
recall,0.996743,0.989529,0.993976,0.993136,0.993976
f1-score,0.995122,0.992126,0.993976,0.993624,0.993973
support,307.0,191.0,0.993976,498.0,498.0


### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [151]:
TP = 306
FP = 2
FN = 1
TN = 189
ALL = TP + FP + FN + TN

In [152]:
accuracy = (TP+TN)/ALL
precision = TP / (TP + FP)
recall = TP / (TP + FN)
tp_rate = TP / (TP+FN)
fp_rate = FP / (FP+TN)
fn_rate = FN / (FN+TP)
tn_rate = TN / (TN+FP)
f1_score = 2*(precision*recall)/(precision+recall)
support_pos = TP + FN
support_neg = FP + TN
print ('The accuracy is ' , accuracy)
print('the precision is ', precision)
print ('the recall is ' , recall)
print('the true positive rate is' , tp_rate)
print('the false positive rate is' , fp_rate)
print('the false negative rate is' , fn_rate)
print ('the true negative rate is ', tn_rate)
print('the f1 score is ', f1_score)
print( 'support positive :' , support_pos)
print('support neg: ' , support_neg)

The accuracy is  0.9939759036144579
the precision is  0.9935064935064936
the recall is  0.996742671009772
the true positive rate is 0.996742671009772
the false positive rate is 0.010471204188481676
the false negative rate is 0.003257328990228013
the true negative rate is  0.9895287958115183
the f1 score is  0.9951219512195122
support positive : 307
support neg:  191


In [114]:
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))


Accuracy of KNN classifier on test set: 0.71


### 4 Run through steps 2-4 setting k to 10

In [156]:
knn = KNeighborsClassifier(n_neighbors = 10, weights = 'uniform')

In [157]:
knn.fit(X_train , y_train)

KNeighborsClassifier(n_neighbors=10)

In [158]:
y_pred = knn.predict(X_train)

In [159]:
y_pred_proba = knn.predict_proba(X_train)

In [160]:
print('Accuracy of KNN classifier on training set: {:.2f}'. format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.75


In [161]:
print(confusion_matrix(y_train, y_pred))

[[276  31]
 [ 95  96]]


In [162]:
report = (classification_report(y_train, y_pred, output_dict = True))
pd.DataFrame(report)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.743935,0.755906,0.746988,0.74992,0.748526
recall,0.899023,0.502618,0.746988,0.70082,0.746988
f1-score,0.814159,0.603774,0.746988,0.708966,0.733469
support,307.0,191.0,0.746988,498.0,498.0


In [163]:
TP = 276
FP = 95
FN = 31
TN = 96
ALL = TP + FP + FN + TN

In [164]:
accuracy = (TP+TN)/ALL
precision = TP / (TP + FP)
recall = TP / (TP + FN)
tp_rate = TP / (TP+FN)
fp_rate = FP / (FP+TN)
fn_rate = FN / (FN+TP)
tn_rate = TN / (TN+FP)
f1_score = 2*(precision*recall)/(precision+recall)
support_pos = TP + FN
support_neg = FP + TN
print ('The accuracy is ' , accuracy)
print('the precision is ', precision)
print ('the recall is ' , recall)
print('the true positive rate is' , tp_rate)
print('the false positive rate is' , fp_rate)
print('the false negative rate is' , fn_rate)
print ('the true negative rate is ', tn_rate)
print('the f1 score is ', f1_score)
print( 'support positive :' , support_pos)
print('support neg: ' , support_neg)

The accuracy is  0.7469879518072289
the precision is  0.7439353099730458
the recall is  0.8990228013029316
the true positive rate is 0.8990228013029316
the false positive rate is 0.4973821989528796
the false negative rate is 0.10097719869706841
the true negative rate is  0.5026178010471204
the f1 score is  0.8141592920353982
support positive : 307
support neg:  191


In [165]:
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))


Accuracy of KNN classifier on test set: 0.71


# 5 Run through steps 2-4 setting k to 20 

In [166]:
knn = KNeighborsClassifier(n_neighbors =20  , weights = 'uniform')

In [167]:
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=20)

In [168]:
y_pred = knn.predict(X_train)

In [169]:
y_pred_proba = knn.predict_proba(X_train)

In [170]:
print('Accuracy of KNN classifier on training set:{:.2f}'
     .format(knn.score(X_train, y_train)))

Accuracy of KNN classifier on training set:0.72


In [171]:
print(confusion_matrix(y_train, y_pred))

[[269  38]
 [103  88]]


In [172]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.88      0.79       307
           1       0.70      0.46      0.56       191

    accuracy                           0.72       498
   macro avg       0.71      0.67      0.67       498
weighted avg       0.71      0.72      0.70       498



In [173]:
TP = 269
FP = 103
FN = 38
TN = 88
ALL = TP + FP + FN + TN

In [174]:
accuracy = (TP+TN)/ALL
precision = TP / (TP + FP)
recall = TP / (TP + FN)
tp_rate = TP / (TP+FN)
fp_rate = FP / (FP+TN)
fn_rate = FN / (FN+TP)
tn_rate = TN / (TN+FP)
f1_score = 2*(precision*recall)/(precision+recall)
support_pos = TP + FN
support_neg = FP + TN
print ('The accuracy is ' , accuracy)
print('the precision is ', precision)
print ('the recall is ' , recall)
print('the true positive rate is' , tp_rate)
print('the false positive rate is' , fp_rate)
print('the false negative rate is' , fn_rate)
print ('the true negative rate is ', tn_rate)
print('the f1 score is ', f1_score)
print( 'support positive :' , support_pos)
print('support neg: ' , support_neg)

The accuracy is  0.7168674698795181
the precision is  0.7231182795698925
the recall is  0.8762214983713354
the true positive rate is 0.8762214983713354
the false positive rate is 0.5392670157068062
the false negative rate is 0.1237785016286645
the true negative rate is  0.4607329842931937
the f1 score is  0.7923416789396172
support positive : 307
support neg:  191


In [175]:
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

Accuracy of KNN classifier on test set: 0.72


#### 6 What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?
#### - the sample that does the best in the sample is the 10 nearest neighbors

#### 7 Which model performs best on our out-of-sample data from validate?
#### - the 20 nearest neighbors does the best in the validate set, since it has the smallest difference between the test and validate samples