In [1]:
# General DS Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Decision Tree and Model Evaluation Imports
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix

In [3]:
# my acquire and prepare file
import acquire
import prepare

## Exercises
Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:

In [4]:
### Acquire
titanic_df = acquire.get_titanic_data()
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [5]:
#Prepare titanic data
titanic_df = prepare.prep_titanic(titanic_df)
titanic_df.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,embarked,alone,is_female,embark_Cherbourg,embark_Queenstown,embark_Southampton
0,0,3,22.0,1,0,7.25,S,0,0,0,0,1
1,1,1,38.0,1,0,71.2833,C,0,1,1,0,0
2,1,3,26.0,0,0,7.925,S,1,1,0,0,1
3,1,1,35.0,1,0,53.1,S,0,1,0,0,1
4,0,3,35.0,0,0,8.05,S,1,0,0,0,1


In [6]:
#Drop Columns not needed for modeling
titanic_df.drop(['embarked'], axis=1, inplace=True)

In [7]:
titanic_df.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,is_female,embark_Cherbourg,embark_Queenstown,embark_Southampton
0,0,3,22.0,1,0,7.25,0,0,0,0,1
1,1,1,38.0,1,0,71.2833,0,1,1,0,0
2,1,3,26.0,0,0,7.925,1,1,0,0,1
3,1,1,35.0,1,0,53.1,0,1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,0,0,1


### 1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [8]:
#Train validate test split
def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [9]:
# split into train, validate, test
train, validate, test = train_validate_test_split(titanic_df, target='survived', seed=123)

In [None]:
#check shape

In [10]:
# create X & y version of train, where y is a series with just the target variable and X are all the features.
X_train = train.drop(columns=['survived'])
y_train = train.survived
train.survived

450    0
543    1
157    0
462    0
397    0
      ..
820    1
673    1
310    1
72     0
749    0
Name: survived, Length: 398, dtype: int64

In [11]:
#Validate and Test 
X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [12]:
#Find Baseline
y_train.value_counts()
#Baseline is 0, did not survive

0    237
1    161
Name: survived, dtype: int64

In [13]:
#Establish Baseline
y_train.info()

<class 'pandas.core.series.Series'>
Int64Index: 398 entries, 450 to 749
Series name: survived
Non-Null Count  Dtype
--------------  -----
398 non-null    int64
dtypes: int64(1)
memory usage: 6.2 KB


####  baseline prediction? 

In [14]:
#Baseline Accuracy
(y_train == 0).mean()

0.5954773869346733

### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [15]:
tree = DecisionTreeClassifier(max_depth=3, random_state=123)

In [16]:
# model.fit(X, y)

tree.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3, random_state=123)

In [17]:
print(export_text(tree, feature_names=X_train.columns.tolist()))

|--- is_female <= 0.50
|   |--- age <= 6.50
|   |   |--- sibsp <= 3.00
|   |   |   |--- class: 1
|   |   |--- sibsp >  3.00
|   |   |   |--- class: 0
|   |--- age >  6.50
|   |   |--- pclass <= 1.50
|   |   |   |--- class: 0
|   |   |--- pclass >  1.50
|   |   |   |--- class: 0
|--- is_female >  0.50
|   |--- pclass <= 2.50
|   |   |--- age <= 2.50
|   |   |   |--- class: 0
|   |   |--- age >  2.50
|   |   |   |--- class: 1
|   |--- pclass >  2.50
|   |   |--- age <= 5.50
|   |   |   |--- class: 1
|   |   |--- age >  5.50
|   |   |   |--- class: 0



In [None]:
# Visualize the tree NOT WORKING
plt.figure(figsize=(12, 7))
plot_tree(tree, feature_names=X_train.columns, class_names=y_train.unique())
plt.show()

In [19]:
#Use your model to make predictions on the in-sample data
tree.predict(X_train)

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,

In [20]:
predictions = tree.predict(X_train)
actual = y_train

In [50]:
#Baseline accuracy
(y_train == 0).mean()

0.5954773869346733

### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [21]:
confusion_matrix(actual, predictions)

array([[230,   7],
       [ 56, 105]])

In [22]:
pd.crosstab(actual, predictions)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,230,7
1,56,105


In [23]:
print(classification_report(actual, predictions))

              precision    recall  f1-score   support

           0       0.80      0.97      0.88       237
           1       0.94      0.65      0.77       161

    accuracy                           0.84       398
   macro avg       0.87      0.81      0.82       398
weighted avg       0.86      0.84      0.83       398



### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [24]:
#On validate data
predictions = tree.predict(X_validate)
actual = y_validate

print(classification_report(actual, predictions))

              precision    recall  f1-score   support

           0       0.75      0.95      0.84       102
           1       0.88      0.52      0.65        69

    accuracy                           0.78       171
   macro avg       0.81      0.74      0.75       171
weighted avg       0.80      0.78      0.76       171



In [26]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(tree.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.84


In [27]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
      .format(tree.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.78


In [32]:
acc = tree.score(X_train, y_train)
acc

0.8417085427135679

In [None]:
#Could calculate this way, need to define y predictions
#TN, FP, FN, TP = confusion_matrix(y_train, y_predictions).ravel()
#TP, FP, FN, TP

### 5. Run through steps 2-4 using a different max_depth value.

#### Using Tree 2, 4 levels of depth

In [34]:
tree2 = DecisionTreeClassifier(max_depth=4, random_state=123)

In [35]:
# model.fit(X, y)

tree2.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=4, random_state=123)

In [36]:
print(export_text(tree, feature_names=X_train.columns.tolist()))

|--- is_female <= 0.50
|   |--- age <= 6.50
|   |   |--- sibsp <= 3.00
|   |   |   |--- class: 1
|   |   |--- sibsp >  3.00
|   |   |   |--- class: 0
|   |--- age >  6.50
|   |   |--- pclass <= 1.50
|   |   |   |--- class: 0
|   |   |--- pclass >  1.50
|   |   |   |--- class: 0
|--- is_female >  0.50
|   |--- pclass <= 2.50
|   |   |--- age <= 2.50
|   |   |   |--- class: 0
|   |   |--- age >  2.50
|   |   |   |--- class: 1
|   |--- pclass >  2.50
|   |   |--- age <= 5.50
|   |   |   |--- class: 1
|   |   |--- age >  5.50
|   |   |   |--- class: 0



In [38]:
#Use your model to make predictions on the in-sample data
tree2.predict(X_train)

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,

In [39]:
predictions = tree2.predict(X_train)
actual = y_train

#### Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [40]:
confusion_matrix(actual, predictions)

array([[231,   6],
       [ 55, 106]])

In [41]:
pd.crosstab(actual, predictions)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,231,6
1,55,106


In [42]:
print(classification_report(actual, predictions))

              precision    recall  f1-score   support

           0       0.81      0.97      0.88       237
           1       0.95      0.66      0.78       161

    accuracy                           0.85       398
   macro avg       0.88      0.82      0.83       398
weighted avg       0.86      0.85      0.84       398



In [43]:
#Calculating from validation data
predictions = tree2.predict(X_validate)
actual = y_validate

print(classification_report(actual, predictions))

              precision    recall  f1-score   support

           0       0.74      0.94      0.83       102
           1       0.86      0.52      0.65        69

    accuracy                           0.77       171
   macro avg       0.80      0.73      0.74       171
weighted avg       0.79      0.77      0.76       171



In [45]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(tree2.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.85


In [46]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
      .format(tree2.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.77


### 6. Which model performs better on your in-sample data?

In [48]:
#Tree2 (model 2) performs better on my sample (training) data

### 7. Which model performs best on your out-of-sample data, the validate set?


In [None]:
#Tree 1 (model 1) performs better on the validate data

## CONCLUSIONS: By changing max depth to four levels, we increased accuracy slightly, but also made a more precise model (model 2 aka 'Tree 2' had a much lower false positive rate than model 1).

--------------------###------------------

## TELCO DATA SET

In [67]:
### Acquire
telco_df = acquire.get_telco_data()
telco_df.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,1,2,1,0004-TLHLJ,Male,0,No,No,4,Yes,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,1,2,1,0011-IGKFF,Male,1,Yes,No,13,Yes,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,2,2,1,0013-EXCHZ,Female,1,Yes,No,3,Yes,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


In [68]:
#Prep telco data
def prep_telco_data(df):
    # Drop duplicate columns
    df.drop(columns=['payment_type_id', 'internet_service_type_id', 'contract_type_id', 'customer_id'], inplace=True)
       
    # Drop null values stored as whitespace    
    df['total_charges'] = df['total_charges'].str.strip()
    df = df[df.total_charges != '']
    
    # Convert to correct datatype
    df['total_charges'] = df.total_charges.astype(float)
    
    # Convert binary categorical variables to numeric
    df['gender_encoded'] = df.gender.map({'Female': 1, 'Male': 0})
    df['partner_encoded'] = df.partner.map({'Yes': 1, 'No': 0})
    df['dependents_encoded'] = df.dependents.map({'Yes': 1, 'No': 0})
    df['phone_service_encoded'] = df.phone_service.map({'Yes': 1, 'No': 0})
    df['paperless_billing_encoded'] = df.paperless_billing.map({'Yes': 1, 'No': 0})
    df['churn_encoded'] = df.churn.map({'Yes': 1, 'No': 0})
    
    # Get dummies for non-binary categorical variables
    dummy_df = pd.get_dummies(df[['multiple_lines', \
                              'online_security', \
                              'online_backup', \
                              'device_protection', \
                              'tech_support', \
                              'streaming_tv', \
                              'streaming_movies', \
                              'contract_type', \
                              'internet_service_type', \
                              'payment_type']], dummy_na=False, \
                              drop_first=True)
    
    # Concatenate dummy dataframe to original 
    df = pd.concat([df, dummy_df], axis=1)
    
    return df

In [69]:
telco_df = prep_telco_data(telco_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['total_charges'] = df.total_charges.astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['gender_encoded'] = df.gender.map({'Female': 1, 'Male': 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['partner_encoded'] = df.partner.map({'Yes': 1, 'No': 0})
A value is trying to be set o

In [70]:
telco_df.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,...,streaming_tv_Yes,streaming_movies_No internet service,streaming_movies_Yes,contract_type_One year,contract_type_Two year,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
0,Female,0,Yes,Yes,9,Yes,No,No,Yes,No,...,1,0,0,1,0,0,0,0,0,1
1,Male,0,No,No,9,Yes,Yes,No,No,No,...,0,0,1,0,0,0,0,0,0,1
2,Male,0,No,No,4,Yes,No,No,No,Yes,...,0,0,0,0,0,1,0,0,1,0
3,Male,1,Yes,No,13,Yes,No,No,Yes,Yes,...,1,0,1,0,0,1,0,0,1,0
4,Female,1,Yes,No,3,Yes,No,No,No,No,...,1,0,0,0,0,1,0,0,0,1


In [71]:
#Drop Columns not needed for modeling
telco_df.drop(['gender','partner','dependents','phone_service','multiple_lines',\
                              'online_security', \
                              'online_backup', \
                              'device_protection', \
                              'tech_support', \
                              'streaming_tv', \
                              'streaming_movies', \
                              'contract_type', \
                              'internet_service_type', \
                              'payment_type', \
                              'paperless_billing', \
                              'churn'], axis=1, inplace=True)

In [73]:
telco_df.head().T

Unnamed: 0,0,1,2,3,4
senior_citizen,0.0,0.0,0.0,1.0,1.0
tenure,9.0,9.0,4.0,13.0,3.0
monthly_charges,65.6,59.9,73.9,98.0,83.9
total_charges,593.3,542.4,280.85,1237.85,267.4
gender_encoded,1.0,0.0,0.0,0.0,1.0
partner_encoded,1.0,0.0,0.0,1.0,1.0
dependents_encoded,1.0,0.0,0.0,0.0,0.0
phone_service_encoded,1.0,1.0,1.0,1.0,1.0
paperless_billing_encoded,1.0,0.0,1.0,1.0,1.0
churn_encoded,0.0,0.0,1.0,1.0,1.0


### 1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [74]:
# split into train, validate, test
train, validate, test = train_validate_test_split(telco_df, target='churn_encoded', seed=123)

In [78]:
# create X & y version of train, where y is a series with just the target variable and X are all the features.
X_train = train.drop(columns=['churn_encoded'])
y_train = train.churn_encoded
train.churn_encoded

5919    0
1915    0
5054    0
2355    0
6279    0
       ..
394     0
3763    1
3444    0
1002    0
4192    0
Name: churn_encoded, Length: 3937, dtype: int64

In [77]:
#check shape
train.shape
validate.shape
test.shape

(1407, 31)

In [79]:
#Create validate and test data
#Validate and Test 
X_validate = validate.drop(columns=['churn_encoded'])
y_validate = validate.churn_encoded

X_test = test.drop(columns=['churn_encoded'])
y_test = test.churn_encoded

In [80]:
#Find Baseline
y_train.value_counts()
#Baseline is 0, customer did not churn

0    2891
1    1046
Name: churn_encoded, dtype: int64

In [81]:
#Baseline Accuracy
(y_train == 0).mean()

0.7343154686309372

### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [83]:
tree1 = DecisionTreeClassifier(max_depth=3, random_state=123)

In [84]:
# model.fit(X, y)

tree1.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3, random_state=123)

In [85]:
print(export_text(tree1, feature_names=X_train.columns.tolist()))

|--- tenure <= 15.50
|   |--- internet_service_type_Fiber optic <= 0.50
|   |   |--- tenure <= 3.50
|   |   |   |--- class: 0
|   |   |--- tenure >  3.50
|   |   |   |--- class: 0
|   |--- internet_service_type_Fiber optic >  0.50
|   |   |--- total_charges <= 124.47
|   |   |   |--- class: 1
|   |   |--- total_charges >  124.47
|   |   |   |--- class: 1
|--- tenure >  15.50
|   |--- internet_service_type_Fiber optic <= 0.50
|   |   |--- contract_type_Two year <= 0.50
|   |   |   |--- class: 0
|   |   |--- contract_type_Two year >  0.50
|   |   |   |--- class: 0
|   |--- internet_service_type_Fiber optic >  0.50
|   |   |--- tenure <= 49.50
|   |   |   |--- class: 0
|   |   |--- tenure >  49.50
|   |   |   |--- class: 0

