# Logistic Regression Exercises

In this exercise, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from acquire import get_titanic_data
from prepare import prep_titanic_data

In [2]:
#loading the prepped titanic data and 
#splitting it into train, validate, and test 
train, validate, test = prep_titanic_data()
print("train: ", train.shape, ", validate: ", validate.shape, ", test: ", test.shape)

train:  (497, 10) , validate:  (214, 10) , test:  (178, 10)


In [3]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embarked_Q,embarked_S
583,0,1,36.0,0,0,40.125,1,1,0,0
337,1,1,41.0,0,0,134.5,1,0,0,0
50,0,3,7.0,4,1,39.6875,0,1,0,1
218,1,1,32.0,0,0,76.2917,1,0,0,0
31,1,1,29.916875,1,0,146.5208,0,0,0,0


## Exercise 1

Start by defining your baseline model.

#### Base Model

In [4]:
#making a baseline model
train.survived.value_counts(normalize=True)

0    0.617706
1    0.382294
Name: survived, dtype: float64

In [5]:
#taking a peak at the data
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embarked_Q,embarked_S
583,0,1,36.0,0,0,40.125,1,1,0,0
337,1,1,41.0,0,0,134.5,1,0,0,0
50,0,3,7.0,4,1,39.6875,0,1,0,1
218,1,1,32.0,0,0,76.2917,1,0,0,0
31,1,1,29.916875,1,0,146.5208,0,0,0,0


#### Baseline Model from Curriculum

In [6]:
#creating an example model based of the curriculum
X_train = train[['pclass', 'age', 'fare', 'sibsp', 'parch']]
y_train = train[['survived']]

In [7]:
#calling the Logistic Regression function and saving it 
#under the variable called logit for shorthand
logit = LogisticRegression()

In [8]:
#fitting the train dataframe into a logistic regression model
logit.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
#printing the coefficients of each category 
#along with the intercept of the function
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.98505432 -0.02975293  0.00233927 -0.17750706  0.32613578]]
Intercept: 
 [2.49738603]


In [10]:
# 'logit.predict' predicts class labels for samples in the parenthesis
y_pred = logit.predict(X_train)
# 'predict_prob' predicts probability estimates
y_pred_proba = logit.predict_proba(X_train)

In [11]:
# 'logit.score' returns the mean accuracy on the given test data and labels.
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.71


In [12]:
#creates a confusion matrix to see how accurate the model is
print(confusion_matrix(y_train, y_pred))

[[262  45]
 [100  90]]


In [13]:
#classification report to get all scores in an easy to read table
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.85      0.78       307
           1       0.67      0.47      0.55       190

    accuracy                           0.71       497
   macro avg       0.70      0.66      0.67       497
weighted avg       0.70      0.71      0.70       497



For all of the models you create, choose a threshold that optimizes for accuracy.

Do your work for these exercises in either a notebook or a python script named model within your classification-exercises repository. Add, commit, and push your work.

## Exercise 2
Create another model that includes age in addition to fare and pclass. 

### Model 1

In [14]:
#changing the parameters for another model
X1_train = train[['age', 'fare', 'pclass']]
y1_train = train[['survived']]

In [15]:
#fitting the data into a logisti regression model
logit.fit(X1_train, y1_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
#printing the coefficients and intercepts of the model
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.03051881  0.00266519 -0.97983178]]
Intercept: 
 [2.52970125]


In [17]:
# 'logit.predict' predicts class labels for samples in the parenthesis
y1_pred = logit.predict(X1_train)
# 'predict_prob' predicts probability estimates
y1_pred_proba = logit.predict_proba(X1_train)

In [18]:
# 'logit.score' returns the mean accuracy on the given test data and labels.
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X1_train, y1_train)))

Accuracy of Logistic Regression classifier on training set: 0.72


In [19]:
#creates a confusion matrix to see how accurate the model is
print(confusion_matrix(y1_train, y1_pred))

[[265  42]
 [ 99  91]]


In [20]:
#classification report to get all scores in an easy to read table
print(classification_report(y1_train, y1_pred))

              precision    recall  f1-score   support

           0       0.73      0.86      0.79       307
           1       0.68      0.48      0.56       190

    accuracy                           0.72       497
   macro avg       0.71      0.67      0.68       497
weighted avg       0.71      0.72      0.70       497



- Does this model perform better than your previous one?

It performed slightly better than the previous model. It seems that the coefficients of 'sibsp' and 'parch' did not have much of an effect on the model.

## Exercise 3
Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

### Model 2

In [21]:
#Rinse and repeat for the next few models to test different variables
X2_train = train[['age', 'fare', 'pclass', 'sex_male']]
y2_train = train[['survived']]

In [22]:
logit.fit(X2_train, y2_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-2.66594879e-02  9.02716903e-04 -1.11402368e+00 -2.45878213e+00]]
Intercept: 
 [4.30664987]


In [24]:
y2_pred = logit.predict(X2_train)
y2_pred_proba = logit.predict_proba(X2_train)

In [25]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X2_train, y2_train)))

Accuracy of Logistic Regression classifier on training set: 0.80


In [26]:
print(confusion_matrix(y2_train, y2_pred))

[[263  44]
 [ 56 134]]


In [27]:
print(classification_report(y2_train, y2_pred))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.75      0.71      0.73       190

    accuracy                           0.80       497
   macro avg       0.79      0.78      0.78       497
weighted avg       0.80      0.80      0.80       497



## Exercise 4
Try out other combinations of features and models.

### Model 3

In [28]:
X3_train = train[['age', 'fare', 'pclass', 'embarked_Q', 'embarked_S']]
y3_train = train[['survived']]

In [29]:
logit.fit(X3_train, y3_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.03100806  0.00234461 -1.02568769  0.54782748 -0.15070561]]
Intercept: 
 [2.72357442]


In [31]:
y3_pred = logit.predict(X3_train)
y3_pred_proba = logit.predict_proba(X3_train)

In [32]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X3_train, y3_train)))

Accuracy of Logistic Regression classifier on training set: 0.71


In [33]:
print(confusion_matrix(y3_train, y3_pred))

[[266  41]
 [101  89]]


In [34]:
print(classification_report(y3_train, y3_pred))

              precision    recall  f1-score   support

           0       0.72      0.87      0.79       307
           1       0.68      0.47      0.56       190

    accuracy                           0.71       497
   macro avg       0.70      0.67      0.67       497
weighted avg       0.71      0.71      0.70       497



### Model 4

In [35]:
X4_train = train[['age', 'fare', 'pclass', 'alone']]
y4_train = train[['survived']]

In [36]:
logit.fit(X4_train, y4_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-2.41114976e-02  7.86427878e-04 -9.59730818e-01 -7.85463207e-01]]
Intercept: 
 [2.81126777]


In [38]:
y4_pred = logit.predict(X4_train)
y4_pred_proba = logit.predict_proba(X4_train)

In [39]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X4_train, y4_train)))

Accuracy of Logistic Regression classifier on training set: 0.72


In [40]:
print(confusion_matrix(y4_train, y4_pred))

[[260  47]
 [ 94  96]]


In [41]:
print(classification_report(y4_train, y4_pred))

              precision    recall  f1-score   support

           0       0.73      0.85      0.79       307
           1       0.67      0.51      0.58       190

    accuracy                           0.72       497
   macro avg       0.70      0.68      0.68       497
weighted avg       0.71      0.72      0.71       497



### Model 5

In [42]:
X5_train = train[['age', 'fare', 'sex_male', 'alone']]
y5_train = train[['survived']]

In [43]:
logit.fit(X5_train, y5_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [44]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-2.10571953e-03  9.81503198e-03 -2.25962521e+00 -1.90950778e-01]]
Intercept: 
 [0.78850085]


In [45]:
y5_pred = logit.predict(X5_train)
y5_pred_proba = logit.predict_proba(X5_train)

In [46]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X5_train, y5_train)))

Accuracy of Logistic Regression classifier on training set: 0.78


In [47]:
print(confusion_matrix(y5_train, y5_pred))

[[261  46]
 [ 63 127]]


In [48]:
print(classification_report(y5_train, y5_pred))

              precision    recall  f1-score   support

           0       0.81      0.85      0.83       307
           1       0.73      0.67      0.70       190

    accuracy                           0.78       497
   macro avg       0.77      0.76      0.76       497
weighted avg       0.78      0.78      0.78       497



### Model 6

In [49]:
X6_train = train[['age', 'pclass', 'sex_male', 'alone']]
y6_train = train[['survived']]

In [50]:
logit.fit(X6_train, y6_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [51]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.02570129 -1.12720398 -2.41479961 -0.17176794]]
Intercept: 
 [4.4084933]


In [52]:
y6_pred = logit.predict(X6_train)
y6_pred_proba = logit.predict_proba(X6_train)

In [53]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X6_train, y6_train)))

Accuracy of Logistic Regression classifier on training set: 0.80


In [54]:
print(confusion_matrix(y6_train, y6_pred))

[[264  43]
 [ 58 132]]


In [55]:
print(classification_report(y6_train, y6_pred))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.75      0.69      0.72       190

    accuracy                           0.80       497
   macro avg       0.79      0.78      0.78       497
weighted avg       0.79      0.80      0.80       497



Displaying the accuracy of all six models from above
- Model 1: .72
- Model 2: .80
- Model 3: .71
- Model 4: .72
- Model 5: .78
- Model 6: .80

## Exercise 5
Use you best 3 models to predict and evaluate on your validate sample.

The 3 best models are model 2, 5 and 6.

In [56]:
#recreating the 3 best training variables from the train models
#under the validate datasets to retest with new data
X2_validate = validate[['age', 'fare', 'pclass', 'sex_male']]
y2_validate = validate[['survived']]

X5_validate = validate[['age', 'fare', 'sex_male', 'alone']]
y5_validate = validate[['survived']]

X6_validate = validate[['age', 'pclass', 'sex_male', 'alone']]
y6_validate = validate[['survived']]

In [57]:
logit.fit(X2_validate, y2_validate)
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.04587951  0.00467339 -1.12012323 -1.88320059]]
Intercept: 
 [4.39626524]


In [58]:
logit.fit(X5_validate, y5_validate)
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.02817128  0.01992692 -1.85331673  0.38786936]]
Intercept: 
 [0.6207659]


In [59]:
logit.fit(X6_validate, y6_validate)
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.04655122 -1.23266078 -1.95035015  0.11929282]]
Intercept: 
 [4.78000967]


In [60]:
y_pred2 = logit.predict(X2_validate)
y2_pred_proba = logit.predict_proba(X2_validate)
y_pred5 = logit.predict(X5_validate)
y5_pred_proba = logit.predict_proba(X5_validate)
y_pred6 = logit.predict(X6_validate)
y6_pred_proba = logit.predict_proba(X6_validate)

In [61]:
# printing out the mean accuracy on the given test data and labels
print("model 2\n", logit.score(X2_validate, y2_validate))
print("model 5\n", logit.score(X5_validate, y5_validate))
print("model 6\n", logit.score(X6_validate, y6_validate))

model 2
 0.6074766355140186
model 5
 0.5981308411214953
model 6
 0.7850467289719626


In [62]:
#printing out a confusion matrix for all models
print("model 2\n", confusion_matrix(y2_validate, y_pred2))
print("model 5\n", confusion_matrix(y5_validate, y_pred5))
print("model 6\n", confusion_matrix(y6_validate, y_pred6))

model 2
 [[130   2]
 [ 82   0]]
model 5
 [[128   4]
 [ 82   0]]
model 6
 [[115  17]
 [ 29  53]]


In [63]:
#printing out a classification report for all models
print("model 2\n", classification_report(y2_validate, y_pred2))
print("model 5\n", classification_report(y5_validate, y_pred5))
print("model 6\n", classification_report(y6_validate, y_pred6))

model 2
               precision    recall  f1-score   support

           0       0.61      0.98      0.76       132
           1       0.00      0.00      0.00        82

    accuracy                           0.61       214
   macro avg       0.31      0.49      0.38       214
weighted avg       0.38      0.61      0.47       214

model 5
               precision    recall  f1-score   support

           0       0.61      0.97      0.75       132
           1       0.00      0.00      0.00        82

    accuracy                           0.60       214
   macro avg       0.30      0.48      0.37       214
weighted avg       0.38      0.60      0.46       214

model 6
               precision    recall  f1-score   support

           0       0.80      0.87      0.83       132
           1       0.76      0.65      0.70        82

    accuracy                           0.79       214
   macro avg       0.78      0.76      0.77       214
weighted avg       0.78      0.79      0.78    

## Exercise 6
Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [64]:
#recreating the best train and validate model variables
#under the test datasets to run a final test
X6_test = test[['age', 'pclass', 'sex_male', 'alone']]
y6_test = test[['survived']]

In [65]:
logit.fit(X6_test, y6_test)
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.03826895 -0.83611527 -2.63666645  0.30714706]]
Intercept: 
 [3.92034512]


In [66]:
# 'logit.predict' predicts class labels for validate samples in the parenthesis
y_pred = logit.predict(X6_test)
# 'predict_proba' creates probability estimates
y_pred_proba = logit.predict_proba(X6_test)

# print the mean accuracy on the given test data and labels.
accuracy = logit.score(X6_test, y6_test)
print(accuracy)

#print the confusion matrix and classification report for final analysis
print(confusion_matrix(y6_test, y_pred))
print(classification_report(y6_test, y_pred))

0.8258426966292135
[[96 14]
 [17 51]]
              precision    recall  f1-score   support

           0       0.85      0.87      0.86       110
           1       0.78      0.75      0.77        68

    accuracy                           0.83       178
   macro avg       0.82      0.81      0.81       178
weighted avg       0.82      0.83      0.83       178



The performance metrics stayed relaively the same throughout the train, validate and test stage. This would be a great model to predict the survival rate since it is roughly 20 percentage points above the baseline. 

## Review example

#### Train

In [67]:
X_train_ex = train.drop(columns=['age', 'fare', 'survived'])
y_train_ex = train[['survived']]

In [68]:
#fitting the data into a logisti regression model
logit.fit(X_train_ex, y_train_ex)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [69]:
#printing the coefficients of each category 
#along with the intercept of the function
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.89320109 -0.47113497 -0.14990043 -1.01531958 -2.41796105  0.37728737
  -0.0155336 ]]
Intercept: 
 [3.8948907]


In [70]:
# 'logit.predict' predicts class labels for samples in the parenthesis
y_pred_ex = logit.predict(X_train_ex)
# 'predict_prob' predicts probability estimates
y_pred_proba_ex = logit.predict_proba(X_train_ex)

In [71]:
# 'logit.score' returns the mean accuracy on the given test data and labels.
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train_ex, y_train_ex)))

Accuracy of Logistic Regression classifier on training set: 0.79


In [72]:
#creates a confusion matrix to see how accurate the model is
print(confusion_matrix(y_train_ex, y_pred_ex))

[[262  45]
 [ 57 133]]


In [73]:
#classification report to get all scores in an easy to read table
print(classification_report(y_train_ex, y_pred_ex))

              precision    recall  f1-score   support

           0       0.82      0.85      0.84       307
           1       0.75      0.70      0.72       190

    accuracy                           0.79       497
   macro avg       0.78      0.78      0.78       497
weighted avg       0.79      0.79      0.79       497



#### Validate

In [74]:
X_validate_ex = validate.drop(columns=['age', 'fare', 'survived'])
y_validate_ex = validate[['survived']]

In [75]:
#fitting the data into a logisti regression model
logit.fit(X_validate_ex, y_validate_ex)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [76]:
#printing the coefficients of each category 
#along with the intercept of the function
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.87303369 -0.39371458 -0.17907146 -0.66087797 -1.98658939  0.09322755
  -0.35107385]]
Intercept: 
 [3.6238827]


In [77]:
# 'logit.predict' predicts class labels for samples in the parenthesis
y_pred_ex = logit.predict(X_validate_ex)
# 'predict_prob' predicts probability estimates
y_pred_proba_ex = logit.predict_proba(X_validate_ex)

In [78]:
# 'logit.score' returns the mean accuracy on the given test data and labels.
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_validate_ex, y_validate_ex)))

Accuracy of Logistic Regression classifier on training set: 0.80


In [79]:
#creates a confusion matrix to see how accurate the model is
print(confusion_matrix(y_validate_ex, y_pred_ex))

[[117  15]
 [ 27  55]]


In [80]:
#classification report to get all scores in an easy to read table
print(classification_report(y_validate_ex, y_pred_ex))

              precision    recall  f1-score   support

           0       0.81      0.89      0.85       132
           1       0.79      0.67      0.72        82

    accuracy                           0.80       214
   macro avg       0.80      0.78      0.79       214
weighted avg       0.80      0.80      0.80       214



#### Test: Use only one model for test

In [81]:
X_test_ex = test.drop(columns=['age', 'fare', 'survived'])
y_test_ex = test[['survived']]

In [82]:
#fitting the data into a logisti regression model
logit.fit(X_test_ex, y_test_ex)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [83]:
#printing the coefficients of each category 
#along with the intercept of the function
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.57492301 -0.15517538 -0.13565495 -0.04940023 -2.57086937 -0.72527695
  -1.10042851]]
Intercept: 
 [3.3083142]


In [84]:
# 'logit.predict' predicts class labels for samples in the parenthesis
y_pred_ex = logit.predict(X_test_ex)
# 'predict_prob' predicts probability estimates
y_pred_proba_ex = logit.predict_proba(X_test_ex)

In [85]:
# 'logit.score' returns the mean accuracy on the given test data and labels.
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_test_ex, y_test_ex)))

Accuracy of Logistic Regression classifier on training set: 0.83


In [86]:
#creates a confusion matrix to see how accurate the model is
print(confusion_matrix(y_test_ex, y_pred_ex))

[[94 16]
 [15 53]]


In [87]:
#classification report to get all scores in an easy to read table
print(classification_report(y_test_ex, y_pred_ex))

              precision    recall  f1-score   support

           0       0.86      0.85      0.86       110
           1       0.77      0.78      0.77        68

    accuracy                           0.83       178
   macro avg       0.82      0.82      0.82       178
weighted avg       0.83      0.83      0.83       178



# Decision Tree Exercises

In this exercise, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

In [88]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

## Exercise 1

Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [89]:
#Prepping data for a decision tree
X_train = train.drop(columns=['survived'])
y_train = train[['survived']]

In [90]:
#setting variable to decision tree classifier
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [91]:
#fitting the data to the model
clf.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [92]:
# 'logit.predict' predicts class labels for samples in the parenthesis
y_pred = clf.predict(X_train)
# 'predict_proba' predicts porbability estimates
y_pred_proba = clf.predict_proba(X_train)

## Exercise 2

Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [93]:
#printing accuracy of the model
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.82


In [94]:
#creating a confusion matrix
confusion_matrix(y_train, y_pred)

array([[279,  28],
       [ 62, 128]])

In [95]:
#creating labels for the confusion matrix
labels = ['did not survive', 'survive']

#creating a confusion matrix and saving it as a dataframe
cm = pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)
cm

Unnamed: 0,did not survive,survive
did not survive,279,28
survive,62,128


In [96]:
#creating a classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86       307
           1       0.82      0.67      0.74       190

    accuracy                           0.82       497
   macro avg       0.82      0.79      0.80       497
weighted avg       0.82      0.82      0.81       497



## Exercise 3

Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [97]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [98]:
#identifying and saving the confusion matrix variables 
TP = cm.iloc[0,0]
FN = cm.iloc[0,1]
FP = cm.iloc[1,0]
TN = cm.iloc[1,1]
TP, FN, FP, TN

(279, 28, 62, 128)

In [99]:
#creating labels for the classification report
target_names = ['did not survive', 'survive']

#creating the classification report
x = classification_report(y_train, y_pred, target_names=target_names, output_dict=True)

#saving the report as a dataframe
class_report = pd.DataFrame(x)
class_report

Unnamed: 0,did not survive,survive,accuracy,macro avg,weighted avg
precision,0.818182,0.820513,0.818913,0.819347,0.819073
recall,0.908795,0.673684,0.818913,0.791239,0.818913
f1-score,0.861111,0.739884,0.818913,0.800498,0.814767
support,307.0,190.0,0.818913,497.0,497.0


In [100]:
#True pos rate
TP_rate = round(TP / (TP + FN),3)
#False pos rate
FP_rate = round(FP / (FP + TN),3)
#True neg rate
TN_rate = round(TN / (TN + FP),3)
#False neg rate
FN_rate = round(FN / (FN + TP),3)

accuracy = round(accuracy_score(y_true = y_train, y_pred = y_pred),3)
precision = round(precision_score(y_true = y_train, y_pred = y_pred),3)
recall = round(recall_score(y_true = y_train, y_pred = y_pred),3)
f1score = round(f1_score(y_true = y_train, y_pred = y_pred),3)

In [101]:
print(f'True Pos Rate: {TP_rate}')
print(f'False Pos Rate: {FP_rate}')
print(f'True Neg Rate: {TN_rate}')
print(f'False Pos Rate: {FP_rate}')

print('\n')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1score}')

True Pos Rate: 0.909
False Pos Rate: 0.326
True Neg Rate: 0.674
False Pos Rate: 0.326


Accuracy: 0.819
Precision: 0.821
Recall: 0.674
F1-score: 0.74


# Exercise 4

Run through steps 2-4 using a different max_depth value.

In [102]:
clf2 = DecisionTreeClassifier(max_depth=9, random_state=123)

In [103]:
clf2.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=9, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [104]:
y_pred2 = clf2.predict(X_train)
y_pred_proba2 = clf2.predict_proba(X_train)

In [105]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf2.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.91


In [106]:
cm2 = pd.DataFrame(confusion_matrix(y_train, y_pred2), index=labels, columns=labels)
cm2

Unnamed: 0,did not survive,survive
did not survive,294,13
survive,30,160


In [107]:
TP = cm2.iloc[0,0]
FN = cm2.iloc[0,1]
FP = cm2.iloc[1,0]
TN = cm2.iloc[1,1]
TP, FP, FN, TN

(294, 30, 13, 160)

In [108]:
x2 = classification_report(y_train, y_pred2, target_names=target_names, output_dict=True)
class_report2 = pd.DataFrame(x2)
class_report2

Unnamed: 0,did not survive,survive,accuracy,macro avg,weighted avg
precision,0.907407,0.924855,0.913481,0.916131,0.914078
recall,0.957655,0.842105,0.913481,0.89988,0.913481
f1-score,0.931854,0.881543,0.913481,0.906698,0.91262
support,307.0,190.0,0.913481,497.0,497.0


In [109]:
TP_rate = round(TP / (TP + FN),3)
FP_rate = round(FP / (FP + TN),3)
TN_rate = round(TN / (TN + FP),3)
FN_rate = round(FN / (FN + TP),3)
accuracy = round(accuracy_score(y_true = y_train, y_pred = y_pred2),3)
precision = round(precision_score(y_true = y_train, y_pred = y_pred2),3)
recall = round(recall_score(y_true = y_train, y_pred = y_pred2),3)
f1score = round(f1_score(y_true = y_train, y_pred = y_pred2),3)

In [110]:
print(f'True Pos Rate: {TP_rate}')
print(f'False Pos Rate: {FP_rate}')
print(f'True Neg Rate: {TN_rate}')
print(f'False Pos Rate: {FP_rate}')

print('\n')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1score}')

True Pos Rate: 0.958
False Pos Rate: 0.158
True Neg Rate: 0.842
False Pos Rate: 0.158


Accuracy: 0.913
Precision: 0.925
Recall: 0.842
F1-score: 0.882


## Exercise 5

Which performs better on your in-sample data?

The decision tree with a higher max depth value performed better on my in_sample data but it may not be the best data for other sets. 

In [164]:
## creating a function that sets max_depth number quickly
def max_depth_number(depth_number):
    #impporting libraries needed
    from sklearn.tree import DecisionTreeClassifier
    #calling function
    clf = DecisionTreeClassifier(max_depth= depth_number, random_state=123)
    return clf

In [187]:
## creating a function to perform decision trees quicker
def decision_tree(X_train, y_train, depth_number):
    #import libraries needed
    import pandas as pd
    from sklearn.metrics import classification_report
    from sklearn.metrics import confusion_matrix
    
    #setting max depth number for DecisionTreeClassifier
    clf = max_depth_number(depth_number)
    
    #fitting the data to the model
    clf.fit(X_train, y_train)
    # 'logit.predict' predicts class labels for samples in the parenthesis
    y_pred = clf.predict(X_train)
    # 'predict_proba' predicts porbability estimates
    y_pred_proba = clf.predict_proba(X_train)
    #creating a confusion matrix and storing it in a DataFrame
    cm = pd.DataFrame(confusion_matrix(y_train, y_pred))
    #creating a copy of y_train
    label = y_train
    #renaming column in copy of y_train
    label = label.rename(columns={label.columns[0]:'label'})
    #creating labels out of unique values for 
    labels = sorted(label.label.unique())
    #creating a classification report and saving it as a DataFrame
    class_report = pd.DataFrame(classification_report(y_train, y_pred, target_names=labels, output_dict=True))
    
    return cm, class_report

In [113]:
def confusion_matrix_rates(cm):
    TP = cm[0][0]
    FN = cm[0][1]
    FP = cm[1][0]
    TN = cm[1][1]
    TPrate = round(TP / (TP + FN),3)
    FPrate = round(FP / (FP + TN),3)
    TNrate = round(TN / (TN + FP),3)
    FNrate = round(FN / (FN + TP),3)
    return TPrate, FPrate, FNrate, TNrate

In [194]:
cm50, report50 = decision_tree(X_train, y_train, 10)

In [195]:
report50

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.933962,0.944134,0.937626,0.939048,0.937851
recall,0.967427,0.889474,0.937626,0.92845,0.937626
f1-score,0.9504,0.915989,0.937626,0.933195,0.937245
support,307.0,190.0,0.937626,497.0,497.0


In [196]:
cm50

Unnamed: 0,0,1
0,297,10
1,21,169


In [None]:
# dot_data = export_graphviz(model, feature_names= X.columns, class_names= {0:'not survived', 1:'survived'}, rounded=True, filled=True, out_file=None)