In [1]:
import acquire
import numpy as np
import pandas as pd
from env import username, password, host
import os
from sklearn import datasets
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

import warnings
warnings.filterwarnings("ignore")

from pydataset import data

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

# Titanic Dataset

### 1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [2]:
titanic = acquire.grab_titanic_data()
titanic.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
titanic = acquire.prep_titanic(titanic)
titanic.rename(columns={'sex': 'isMale'}, inplace=True)
titanic.head()

Unnamed: 0,survived,pclass,isMale,age,sibsp,parch,fare,embark_town,alone
0,0,3,male,22.0,1,0,7.25,1.0,0
1,1,1,female,38.0,1,0,71.2833,2.0,0
2,1,3,female,26.0,0,0,7.925,1.0,1
3,1,1,female,35.0,1,0,53.1,1.0,0
4,0,3,male,35.0,0,0,8.05,1.0,1


In [4]:
print(titanic.age.mean())
print(titanic.age.mode())
print(titanic.age.median())

29.69911764705882
0    24.0
Name: age, dtype: float64
28.0


In [5]:
titanic.age[titanic.age.isnull()] = 28
titanic.head(10)

Unnamed: 0,survived,pclass,isMale,age,sibsp,parch,fare,embark_town,alone
0,0,3,male,22.0,1,0,7.25,1.0,0
1,1,1,female,38.0,1,0,71.2833,2.0,0
2,1,3,female,26.0,0,0,7.925,1.0,1
3,1,1,female,35.0,1,0,53.1,1.0,0
4,0,3,male,35.0,0,0,8.05,1.0,1
5,0,3,male,28.0,0,0,8.4583,3.0,1
6,0,1,male,54.0,0,0,51.8625,1.0,1
7,0,3,male,2.0,3,1,21.075,1.0,0
8,1,3,female,27.0,0,2,11.1333,1.0,0
9,1,2,female,14.0,1,0,30.0708,2.0,0


In [6]:
#titanic[titanic.age == 'NaN'] = 28

In [7]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   isMale       891 non-null    object 
 3   age          891 non-null    float64
 4   sibsp        891 non-null    int64  
 5   parch        891 non-null    int64  
 6   fare         891 non-null    float64
 7   embark_town  889 non-null    float64
 8   alone        891 non-null    int64  
dtypes: float64(3), int64(5), object(1)
memory usage: 62.8+ KB


In [8]:
titanic.embark_town.value_counts()

1.0    644
2.0    168
3.0     77
Name: embark_town, dtype: int64

In [9]:
titanic.embark_town[titanic.embark_town.isnull()] = 1.0

In [10]:
titanic.isMale[titanic.isMale == 'male'] = 1
titanic.isMale[titanic.isMale == 'female'] = 0
titanic.head(10)

Unnamed: 0,survived,pclass,isMale,age,sibsp,parch,fare,embark_town,alone
0,0,3,1,22.0,1,0,7.25,1.0,0
1,1,1,0,38.0,1,0,71.2833,2.0,0
2,1,3,0,26.0,0,0,7.925,1.0,1
3,1,1,0,35.0,1,0,53.1,1.0,0
4,0,3,1,35.0,0,0,8.05,1.0,1
5,0,3,1,28.0,0,0,8.4583,3.0,1
6,0,1,1,54.0,0,0,51.8625,1.0,1
7,0,3,1,2.0,3,1,21.075,1.0,0
8,1,3,0,27.0,0,2,11.1333,1.0,0
9,1,2,0,14.0,1,0,30.0708,2.0,0


In [11]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   isMale       891 non-null    object 
 3   age          891 non-null    float64
 4   sibsp        891 non-null    int64  
 5   parch        891 non-null    int64  
 6   fare         891 non-null    float64
 7   embark_town  891 non-null    float64
 8   alone        891 non-null    int64  
dtypes: float64(3), int64(5), object(1)
memory usage: 62.8+ KB


In [12]:
t_train, t_validate, t_test = acquire.split(titanic)
t_train.shape, t_validate.shape, t_test.shape

((534, 9), (178, 9), (179, 9))

In [13]:
(t_train.survived == 1).mean()

0.40074906367041196

## What is your baseline prediction?  What is your baseline accuracy? 

In [14]:
t_train['baseline'] = 0

In [15]:
t_train.baseline.value_counts()

0    534
Name: baseline, dtype: int64

In [16]:
#t_train = t_train.drop(columns='Unnamed: 0')
t_train.head()

Unnamed: 0,survived,pclass,isMale,age,sibsp,parch,fare,embark_town,alone,baseline
304,0,3,1,28.0,0,0,8.05,1.0,1,0
718,0,3,1,28.0,0,0,15.5,3.0,1,0
102,0,1,1,21.0,0,1,77.2875,1.0,0,0
737,1,1,1,35.0,0,0,512.3292,2.0,1,0
269,1,1,0,35.0,0,0,135.6333,1.0,1,0


In [17]:
print(f'The Baseline accuracy is: {metrics.accuracy_score(t_train.survived, t_train.baseline)}, This is for non-survival')

The Baseline accuracy is: 0.599250936329588, This is for non-survival


### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [18]:
#t_train.sex[t_train.sex == 'male'] = 1
#t_train.sex[t_train.sex == 'female'] = 0
#t_train[t_train.sex == 'female'] = 0
#t_train.rename(columns={'sex': 'isMale'})
t_train.head()

Unnamed: 0,survived,pclass,isMale,age,sibsp,parch,fare,embark_town,alone,baseline
304,0,3,1,28.0,0,0,8.05,1.0,1,0
718,0,3,1,28.0,0,0,15.5,3.0,1,0
102,0,1,1,21.0,0,1,77.2875,1.0,0,0
737,1,1,1,35.0,0,0,512.3292,2.0,1,0
269,1,1,0,35.0,0,0,135.6333,1.0,1,0


In [19]:
#t_train = t_train.rename(columns={'sex': 'isMale'})

In [20]:
x_tr = t_train.drop(columns='survived')
y_tr = t_train.survived

x_val = t_validate.drop(columns='survived')
y_val = t_validate.survived

x_test = t_test.drop(columns='survived')
y_test = t_test.survived

x_tr.head()

Unnamed: 0,pclass,isMale,age,sibsp,parch,fare,embark_town,alone,baseline
304,3,1,28.0,0,0,8.05,1.0,1,0
718,3,1,28.0,0,0,15.5,3.0,1,0
102,1,1,21.0,0,1,77.2875,1.0,0,0
737,1,1,35.0,0,0,512.3292,2.0,1,0
269,1,0,35.0,0,0,135.6333,1.0,1,0


In [21]:
x_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 534 entries, 304 to 133
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pclass       534 non-null    int64  
 1   isMale       534 non-null    object 
 2   age          534 non-null    float64
 3   sibsp        534 non-null    int64  
 4   parch        534 non-null    int64  
 5   fare         534 non-null    float64
 6   embark_town  534 non-null    float64
 7   alone        534 non-null    int64  
 8   baseline     534 non-null    int64  
dtypes: float64(3), int64(5), object(1)
memory usage: 41.7+ KB


In [22]:
t_classf = DecisionTreeClassifier()
t_classf = t_classf.fit(x_tr, y_tr)

y_predict = t_classf.predict(x_tr)
y_predict_prob = t_classf.predict_proba(x_tr)

In [23]:
t_classf.fit(x_tr, y_tr)

DecisionTreeClassifier()

### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [24]:
print(f'Model score: {t_classf.score(x_tr, y_tr)}')

Model score: 0.9868913857677902


In [25]:
cm = confusion_matrix(y_tr, y_predict)
pd.DataFrame(cm)

Unnamed: 0,0,1
0,319,1
1,6,208


In [26]:
pd.DataFrame(classification_report(y_tr, y_predict, output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.981538,0.995215,0.986891,0.988377,0.987019
recall,0.996875,0.971963,0.986891,0.984419,0.986891
f1-score,0.989147,0.983452,0.986891,0.986299,0.986865
support,320.0,214.0,0.986891,534.0,534.0


### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

##### Accuracy:

In [27]:
accuracy_score(y_tr, y_predict)

0.9868913857677902

##### TP, TN, FP, FN rates: 

In [28]:
TP = cm[0,0]
TN = cm[1,1]
FP = cm[0,1]
FN = cm[1,0]
print(f'True Positive rate:  {TP/(TP+TN+FP+FN)}')
print(f'False Positive rate: {FP/(TP+TN+FP+FN)}')
print(f'True Negative rate:  {TN/(TP+TN+FP+FN)}')
print(f'False Negative rate: {FN/(TP+TN+FP+FN)}')

True Positive rate:  0.5973782771535581
False Positive rate: 0.0018726591760299626
True Negative rate:  0.3895131086142322
False Negative rate: 0.011235955056179775


##### Precision, Recall, f1-score and support:

In [29]:
print(f'Precision: {precision_score(y_tr, y_predict)}')
print(f'Recall:    {recall_score(y_tr, y_predict)}')
print(f'f1:        {f1_score(y_tr, y_predict)}')
print(f'Support -  0: {TP+FP}')
print(f'           1: {TN+FN}')

Precision: 0.9952153110047847
Recall:    0.9719626168224299
f1:        0.9834515366430261
Support -  0: 320
           1: 214


### 5. Run through steps 2-4 using a different max_depth value.

In [30]:
t_classf2 = DecisionTreeClassifier(max_depth=4)
t_classf2 = t_classf2.fit(x_tr, y_tr)

y_predict2 = t_classf2.predict(x_tr)
y_predict_prob2 = t_classf2.predict_proba(x_tr)

In [36]:
print(f'Model score: {t_classf2.score(x_tr, y_tr)*100}')

Model score: 82.77153558052434


In [37]:
cm2 = confusion_matrix(y_tr, y_predict2)
pd.DataFrame(cm2)

Unnamed: 0,0,1
0,313,7
1,85,129


In [44]:
pd.DataFrame(classification_report(y_tr, y_predict2, output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.786432,0.948529,0.827715,0.867481,0.851392
recall,0.978125,0.602804,0.827715,0.790464,0.827715
f1-score,0.871866,0.737143,0.827715,0.804505,0.817876
support,320.0,214.0,0.827715,534.0,534.0


In [45]:
print(f'Accuracy: {accuracy_score(y_tr, y_predict2)}')

Accuracy: 0.8277153558052435


In [46]:
TP2 = cm2[0,0]
TN2 = cm2[1,1]
FP2 = cm2[0,1]
FN2 = cm2[1,0]
print('TP, TN, FP, FN rates:')
print(f'True Positive rate:  {TP2/(TP2+TN2+FP2+FN2)}')
print(f'False Positive rate: {FP2/(TP2+TN2+FP2+FN2)}')
print(f'True Negative rate:  {TN2/(TP2+TN2+FP2+FN2)}')
print(f'False Negative rate: {FN2/(TP2+TN2+FP2+FN2)}')

TP, TN, FP, FN rates:
True Positive rate:  0.5861423220973783
False Positive rate: 0.013108614232209739
True Negative rate:  0.24157303370786518
False Negative rate: 0.15917602996254682


In [47]:
print(f'Precision: {precision_score(y_tr, y_predict2)}')
print(f'Recall:    {recall_score(y_tr, y_predict2)}')
print(f'f1:        {f1_score(y_tr, y_predict2)}')
print(f'Support -  0: {TP2+FP2}')
print(f'           1: {TN2+FN2}')

Precision: 0.9485294117647058
Recall:    0.602803738317757
f1:        0.7371428571428571
Support -  0: 320
           1: 214


In [48]:
print(f'Precision: {precision_score(y_tr, y_predict)}')
print(f'Recall:    {recall_score(y_tr, y_predict)}')
print(f'f1:        {f1_score(y_tr, y_predict)}')
print(f'Support -  0: {TP+FP}')
print(f'           1: {TN+FN}')

Precision: 0.9952153110047847
Recall:    0.9719626168224299
f1:        0.9834515366430261
Support -  0: 320
           1: 214


### 6. Which model performs better on your in-sample data?

Model -

### 7. Which model performs best on your out-of-sample data, the validate set?

In [None]:
y_predict = t_classf.predict(x_val)
y_predict2 = t_classf2.predict(x_val)

print(classification_report(y_val, y_predict))

print(classification_report(y_val, y_predict2))

### 1. Work through these same exercises using the Telco dataset.

In [None]:
telco = acquire.grab_telco_data()

In [None]:
telco = acquire.prep_telco(telco)
telco.head()

In [None]:
tel_train, tel_validate, tel_test = acquire.split(telco)
tel_train.shape, tel_validate.shape, tel_test.shape

### 2. Experiment with this model on other datasets with a higher number of output classes.

### Iris:

In [None]:
iris = acquire.grab_iris_data()
iris.head()

In [None]:
iris = acquire.prep_iris(iris)
iris.head()

In [None]:
iris.drop(columns={'species_id', 'Unnamed: 0'})

In [None]:
i_train, i_validate, i_test = acquire.split(iris)
i_train.shape, i_validate.shape, i_test.shape

In [None]:
# find the baseline and the baseline accuracy
baseline = i_train.species.value_counts().idxmax()
print('The most common value (our baseline) is:', baseline)
print('The rate of occurance (our baseline accuracy) is:', len(i_train[i_train.species == baseline]) / len(i_train) * 100)

In [None]:
# split each of our datasets into X and y 
x_train = i_train.drop(columns=('species'))
y_train = i_train.species

x_validate = i_validate.drop(columns=('species'))
y_validate = i_validate.species

x_test = i_test.drop(columns='species')
y_test = i_test.species

In [None]:
# create and fit the models
clf1 = DecisionTreeClassifier(max_depth=8)
clf2 = DecisionTreeClassifier(max_depth=4)

clf1 = clf1.fit(X_train, y_train)
clf2 = clf2.fit(X_train, y_train)

y_pred1 = clf1.predict(X_train)
y_pred2 = clf2.predict(X_train)

y_pred_prob1 = clf1.predict_proba(X_train)
y_pred_prob2 = clf2.predict_proba(X_train)

In [None]:
# model score
print(f'Model 1 training score: {clf1.score(X_train, y_train):.2%}')
print(f'Model 2 training score: {clf2.score(X_train, y_train):.2%}')

In [None]:
# confusion matrix
cm1 = confusion_matrix(y_train, y_pred1)
cm2 = confusion_matrix(y_train, y_pred2)
print('----Model 1----')
print(pd.DataFrame(cm1))
print('')
print('----Model 2----')
print(pd.DataFrame(cm2))

In [None]:
# classification report
print(classification_report(y_train, y_pred1))
print(classification_report(y_train, y_pred2))

## ...and comparing with validation data

In [None]:
# classification report
y_predict1 = clf1.predict(X_validate)
y_predict2 = clf2.predict(X_validate)

print(classification_report(y_val, y_predict1))

print(classification_report(y_val, y_predict2))

# Experiment with this model on other datasets with a higher number of output classes.


# Random Forest

### 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [None]:
titanic2 = acquire.grab_titanic_data()

In [None]:
titanic2 = acquire.prep_titanic(titanic2)
titanic2.head()

In [None]:
titanic2.rename(columns={'sex': 'isMale'}, inplace=True)
titanic2.head(3)

In [None]:
#type(titanic2)

In [None]:
titanic2.isMale[titanic2.isMale == 'male'] = 1
titanic2.isMale[titanic2.isMale == 'female'] = 0
titanic2.head(3)

In [None]:
titanic2.age[titanic2.age.isnull()] = 28
titanic2.embark_town[titanic2.embark_town.isnull()] = 1.0

In [None]:
titanic2.info()

In [None]:
my_forest = RandomForestClassfifier(max_depth=10, random_state=248)

### 2. Evaluate your results using the model score, confusion matrix, and classification report.

### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

### 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

### 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?