In [34]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.tree import plot_tree
from sklearn.metrics import classification_report, accuracy_score , recall_score , precision_score, f1_score


from prepare import prep_titanic , prep_telco, train_val_test
import acquire

seed = 55

## Exercises
Using the titanic data, in your classification-exercises repository, create a notebook, decision_tree.ipynb where you will do the following:

### 1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [2]:
titanic = pd.read_csv('titanic.csv')

In [3]:
titanic = prep_titanic(titanic)

In [4]:
t_titan, v_titan, test_titan = train_val_test(titanic, 'survived')

In [5]:
t_titan.survived.mode()

0    0
Name: survived, dtype: int64

In [6]:
baseline_value = 0

In [7]:
(t_titan.survived == baseline_value).mean()

0.6163723916532905

Baseline accuracy is .62

### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)



In [8]:
t_titan.head(1)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
519,0,3,male,32.0,0,0,7.8958,Southampton,1


In [9]:
t_titan.age.isna().sum()

115

In [10]:
v_titan.info()

<class 'pandas.core.frame.DataFrame'>
Index: 134 entries, 326 to 843
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     134 non-null    int64  
 1   pclass       134 non-null    int64  
 2   sex          134 non-null    object 
 3   age          107 non-null    float64
 4   sibsp        134 non-null    int64  
 5   parch        134 non-null    int64  
 6   fare         134 non-null    float64
 7   embark_town  134 non-null    object 
 8   alone        134 non-null    int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 10.5+ KB


In [11]:
t_titan = pd.get_dummies(t_titan, columns = ['sex', 'embark_town'], drop_first = True) 
v_titan = pd.get_dummies(v_titan, columns = ['sex', 'embark_town'], drop_first = True) 


In [43]:
x_train = t_titan.drop(columns = 'survived')
y_train = t_titan.survived
x_val = v_titan.drop(columns = 'survived')
y_val = v_titan.survived

In [13]:
clf = dtc(random_state = seed)
clf.fit(x_train, y_train)

In [14]:
clf.feature_importances_

array([0.05258224,        nan,        nan,        nan,        nan,
              nan, 0.12920171, 0.00144462, 0.0045266 ])

In [15]:
imp = pd.DataFrame({'cols': x_train.columns,
                   'imps':clf.feature_importances_})

imp.sort_values(by = 'imps' , ascending = False)

Unnamed: 0,cols,imps
6,sex_male,0.129202
0,pclass,0.052582
8,embark_town_Southampton,0.004527
7,embark_town_Queenstown,0.001445
1,age,
2,sibsp,
3,parch,
4,fare,
5,alone,


In [16]:
train_predict = clf.predict(x_train)
train_predict[:5]

array([0, 0, 1, 1, 0])

### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [21]:
clf.score(x_train, y_train)

0.8908507223113965

-- My model has an 89% accuracy on the train dataset when there is no limit.

In [23]:
print(classification_report(y_train, train_predict))

              precision    recall  f1-score   support

           0       0.87      0.97      0.92       384
           1       0.93      0.77      0.84       239

    accuracy                           0.89       623
   macro avg       0.90      0.87      0.88       623
weighted avg       0.90      0.89      0.89       623



In [24]:
pd.crosstab(y_train,train_predict)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,371,13
1,55,184


#### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

True positive = 184

False Posistive = 13

True Negative = 371

False Negative = 55

In [30]:
 recall_score(y_train, train_predict)

0.7698744769874477

In [31]:
accuracy_score(y_train, train_predict)

0.8908507223113965

In [32]:
precision_score(y_train, train_predict)

0.934010152284264

In [35]:
f1_score(y_train, train_predict)

0.8440366972477066

In [37]:
print(classification_report(y_train, train_predict))

              precision    recall  f1-score   support

           0       0.87      0.97      0.92       384
           1       0.93      0.77      0.84       239

    accuracy                           0.89       623
   macro avg       0.90      0.87      0.88       623
weighted avg       0.90      0.89      0.89       623



In [46]:
# Comparing train data to validate data

train_acc = []
val_acc = []
depth = []

for i in range(2, 30):
    clf = dtc(max_depth = i , random_state = seed)
    clf.fit(x_train, y_train)
    
    train_acc.append(clf.score(x_train, y_train))
    val_acc.append(clf.score(x_val,y_val))
    depth.append(clf.score(x_val, y_val))

In [47]:
trees = pd.DataFrame({'max_depth': depth,
                      'train_acc': train_acc,
                      'val_acc': val_acc})
trees

Unnamed: 0,max_depth,train_acc,val_acc
0,0.798507,0.776886,0.798507
1,0.783582,0.797753,0.783582
2,0.783582,0.805778,0.783582
3,0.768657,0.820225,0.768657
4,0.783582,0.837881,0.783582
5,0.791045,0.852327,0.791045
6,0.791045,0.857143,0.791045
7,0.761194,0.865169,0.761194
8,0.768657,0.868379,0.768657
9,0.776119,0.868379,0.776119


#### 5. Run through steps 2-4 using a different max_depth value.

In [49]:
clf = dtc(random_state = seed, max_depth=2)

In [50]:
clf.fit(x_train, y_train)

In [51]:
train_predict = clf.predict(x_train)
train_predict[:5]

array([0, 0, 1, 1, 0])

In [54]:
val_predict = clf.predict(x_val)
val_predict[:5]

array([0, 0, 1, 0, 0])

In [53]:
pd.crosstab(y_train,train_predict)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,323,61
1,78,161


In [55]:
pd.crosstab(y_val,val_predict)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,73,9
1,18,34


Train, Validate True positive == 161 , 34

Train, Validate False positive == 61 , 9

Train, Validate True negative == 323 , 73

Train, Validate False negative = 78 , 18

In [57]:
print(classification_report(y_train, train_predict))

              precision    recall  f1-score   support

           0       0.81      0.84      0.82       384
           1       0.73      0.67      0.70       239

    accuracy                           0.78       623
   macro avg       0.77      0.76      0.76       623
weighted avg       0.77      0.78      0.78       623



In [58]:
print(classification_report(y_val, val_predict))

              precision    recall  f1-score   support

           0       0.80      0.89      0.84        82
           1       0.79      0.65      0.72        52

    accuracy                           0.80       134
   macro avg       0.80      0.77      0.78       134
weighted avg       0.80      0.80      0.79       134



6. Which model performs better on your in-sample data?

The first one. 89% accuracy against in sample data.

7. Which model performs best on your out-of-sample data, the validate set?

Second one. I chose the depth of two because it is the best on validate and closely mirrors training.

### Telco Dataset

1. Work through these same exercises using the Telco dataset.

In [66]:
telco = acquire.get_telco_data()

In [68]:
telco = prep_telco(telco)

In [73]:
telco.churn.value_counts()

churn
No     5174
Yes    1869
Name: count, dtype: int64

In [74]:
baseline_value = 'No' 

In [115]:
(telco.churn == baseline_value).mean()

0.7346301292063041

Baseline accuracy is .73

In [125]:
t_tel , v_tel , test_tel = train_val_test(telco,'churn',seed=55)

In [None]:
# Debugging get dummies

In [None]:
# t_tel_drop = t_tel
'''
'gender', 'partner','dependents','phone_service','multiple_lines',
                                       'online_security','online_backup','device_protection','tech_support',
                                       'streaming_tv','streaming_movies','paperless_billing','total_charges',
                                      'churn','contract_type','internet_service_type','payment_type'
            
'''

In [141]:
t_tel , v_tel , test_tel = train_val_test(telco,'churn',seed=55)
t_tel_drop = t_tel
t_tel = pd.get_dummies(t_tel, columns=['gender',  'partner', 'dependents', 'phone_service', 
                                       'multiple_lines','online_security','online_backup',
                                       'device_protection', 'tech_support','streaming_tv',
                                      'streaming_movies', 'paperless_billing', 'total_charges',
                                      'churn', 'contract_type','internet_service_type','payment_type'])
t_tel_drop = pd.get_dummies(t_tel_drop, columns=['gender',  'partner', 'dependents', 'phone_service', 
                                       'multiple_lines','online_security','online_backup',
                                       'device_protection', 'tech_support','streaming_tv',
                                      'streaming_movies', 'paperless_billing', 'total_charges',
                                      'churn', 'contract_type','internet_service_type','payment_type'])
v_tel = pd.get_dummies(v_tel, columns=['gender',  'partner', 'dependents', 'phone_service', 
                                       'multiple_lines','online_security','online_backup',
                                       'device_protection', 'tech_support','streaming_tv',
                                      'streaming_movies', 'paperless_billing', 'total_charges',
                                      'churn', 'contract_type','internet_service_type','payment_type'])

t_tel.head()

Unnamed: 0,senior_citizen,tenure,monthly_charges,gender_Female,gender_Male,partner_No,partner_Yes,dependents_No,dependents_Yes,phone_service_No,...,churn_Yes,contract_type_Month-to-month,contract_type_One year,contract_type_Two year,internet_service_type_DSL,internet_service_type_Fiber optic,payment_type_Bank transfer (automatic),payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
2198,0,69,24.95,True,False,False,True,True,False,False,...,False,False,False,True,False,False,False,True,False,False
304,0,48,45.3,True,False,False,True,True,False,True,...,True,True,False,False,True,False,True,False,False,False
3583,0,59,24.45,True,False,True,False,True,False,False,...,False,False,False,True,False,False,False,False,True,False
859,0,43,50.2,True,False,False,True,False,True,False,...,False,True,False,False,True,False,False,False,False,True
662,0,37,19.8,False,True,False,True,False,True,False,...,False,False,True,False,False,False,False,True,False,False


In [None]:
# Get dumies will not work
'''
t_tel = pd.get_dummies(t_tel, columns=['gender', 'partner','dependents','phone_service','multiple_lines',
                                       'online_security','online_backup','device_protection','tech_support',
                                       'streaming_tv','streaming_movies','paperless_billing','total_charges',
                                      'churn','contract_type','internet_service_type','payment_type'])
t_tel_drop = pd.get_dummies(t_tel_drop, columns=['gender', 'partner','dependents','phone_service','multiple_lines',
                                       'online_security','online_backup','device_protection','tech_support',
                                       'streaming_tv','streaming_movies','paperless_billing','total_charges',
                                      'churn','contract_type','internet_service_type','payment_type'],drop_first = True)
v_tel = pd.get_dummies(v_tel, columns=['gender', 'partner','dependents','phone_service','multiple_lines',
                                       'online_security','online_backup','device_protection','tech_support',
                                       'streaming_tv','streaming_movies','paperless_billing','total_charges',
                                      'churn','contract_type','internet_service_type','payment_type'])
'''


In [112]:
x_train = t_tel.drop(columns = 'churn')
y_train = t_tel.churn
x_val = v_tel.drop(columns = 'churn')
y_val = v_tel.churn

In [113]:
x_train.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,contract_type,internet_service_type,payment_type
2198,Female,0,Yes,No,69,Yes,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Yes,24.95,1718.35,Two year,,Credit card (automatic)
304,Female,0,Yes,No,48,No,No phone service,Yes,No,Yes,No,No,Yes,Yes,45.3,2145.0,Month-to-month,DSL,Bank transfer (automatic)
3583,Female,0,No,No,59,Yes,Yes,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,24.45,1493.1,Two year,,Electronic check
859,Female,0,Yes,Yes,43,Yes,No,No,Yes,No,No,No,No,No,50.2,2169.4,Month-to-month,DSL,Mailed check
662,Male,0,Yes,Yes,37,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,No,19.8,677.05,One year,,Credit card (automatic)


In [110]:
clf = dtc(random_state = seed, max_depth=5)