# Logistic Regression Exercises

In [25]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import acquire
import prepare
import env

In these exercises, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

For all of the models you create, choose a threshold that optimizes for accuracy.

Create a new notebook, logistic_regression, use it to answer the following questions:

In [2]:
df = acquire.get_titanic_data()

In [3]:
df = prepare.prep_titanic(df)

In [4]:
train, validate, test = prepare.split_data(df)

In [5]:
train.shape, validate.shape, test.shape

((426, 13), (143, 13), (143, 13))

In [6]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
398,398,0,2,male,23.0,0,0,10.5,Southampton,1,1,0,1
150,150,0,2,male,51.0,0,0,12.525,Southampton,1,1,0,1
744,744,1,3,male,31.0,0,0,7.925,Southampton,1,1,0,1
58,58,1,2,female,5.0,1,2,27.75,Southampton,0,0,0,1
365,365,0,3,male,30.0,0,0,7.25,Southampton,1,1,0,1


In [7]:
baseline_accuracy = (train.survived == 0).mean()
baseline_accuracy

0.6150234741784038

In [32]:
x_train = train.drop(columns=['passenger_id','survived','sex','sibsp','parch','embark_town','alone','sex_male','embark_town_Queenstown','embark_town_Southampton'])
y_train = train.survived

x_val = validate.drop(columns=['passenger_id','survived','sex','sibsp','parch','embark_town','alone','sex_male','embark_town_Queenstown','embark_town_Southampton'])
y_val = validate.survived

x_test = test.drop(columns=['passenger_id','survived','sex','sibsp','parch','embark_town','alone','sex_male','embark_town_Queenstown','embark_town_Southampton'])
y_test = test.survived

## 1. Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?


In [33]:
logit = LogisticRegression(C=1, class_weight={0:1, 1:99}, random_state=311, intercept_scaling=1, solver='lbfgs')
logit.fit(x_train, y_train)

In [10]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-1.07342003 -0.03208813  0.00851156]]
Intercept: 
 [7.13950962]


In [11]:
y_pred = logit.predict(x_train)
y_predict_proba = logit.predict_proba(x_train)

In [16]:
prepare.evaluate_clf(logit, x_train, y_train, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(0.38497652582159625,
           Pred 0  Pred 1
 Actual 0       0     262
 Actual 1       0     164,
                0           1  accuracy   macro avg  weighted avg
 precision    0.0    0.384977  0.384977    0.192488      0.148207
 recall       0.0    1.000000  0.384977    0.500000      0.384977
 f1-score     0.0    0.555932  0.384977    0.277966      0.214021
 support    262.0  164.000000  0.384977  426.000000    426.000000,
                 metric       score
 0             accuracy    0.384977
 1   true_positive_rate    1.000000
 2  false_positive_rate    1.000000
 3   true_negative_rate    0.000000
 4  false_negative_rate    0.000000
 5            precision    0.384977
 6               recall    1.000000
 7             f1_score    0.555932
 8          support_pos  164.000000
 9          support_neg  262.000000)

## 2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [39]:
x_train = train.drop(columns=['passenger_id','survived','sex','sibsp','parch','embark_town','alone','embark_town_Queenstown','embark_town_Southampton'])
y_train = train.survived

x_val = validate.drop(columns=['passenger_id','survived','sex','sibsp','parch','embark_town','alone','embark_town_Queenstown','embark_town_Southampton'])
y_val = validate.survived

x_test = test.drop(columns=['passenger_id','survived','sex','sibsp','parch','embark_town','alone','embark_town_Queenstown','embark_town_Southampton'])
y_test = test.survived

In [40]:
logit2 = LogisticRegression(C=1, class_weight={0:1, 1:99}, random_state=311, intercept_scaling=1, solver='lbfgs')
logit2.fit(x_train, y_train)

In [21]:
y_pred = logit2.predict(x_train)
y_predict_proba = logit2.predict_proba(x_train)

In [22]:
prepare.evaluate_clf(logit2, x_train, y_train, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(0.38497652582159625,
           Pred 0  Pred 1
 Actual 0       0     262
 Actual 1       0     164,
                0           1  accuracy   macro avg  weighted avg
 precision    0.0    0.384977  0.384977    0.192488      0.148207
 recall       0.0    1.000000  0.384977    0.500000      0.384977
 f1-score     0.0    0.555932  0.384977    0.277966      0.214021
 support    262.0  164.000000  0.384977  426.000000    426.000000,
                 metric       score
 0             accuracy    0.384977
 1   true_positive_rate    1.000000
 2  false_positive_rate    1.000000
 3   true_negative_rate    0.000000
 4  false_negative_rate    0.000000
 5            precision    0.384977
 6               recall    1.000000
 7             f1_score    0.555932
 8          support_pos  164.000000
 9          support_neg  262.000000)

## 3. Try out other combinations of features and models.

This model includes sex, age, fare, and alone features. 

In [42]:
x_train = train.drop(columns=['passenger_id','survived','sex','sibsp','parch','pclass','alone','embark_town'])
y_train = train.survived

x_val = validate.drop(columns=['passenger_id','survived','sex','sibsp','parch','pclass','alone','embark_town'])
y_val = validate.survived

x_test = test.drop(columns=['passenger_id','survived','sex','sibsp','parch','pclass','alone','embark_town'])
y_test = test.survived

In [43]:
logit3 = LogisticRegression(C=1, class_weight={0:1, 1:99}, random_state=311, intercept_scaling=1, solver='lbfgs')
logit3.fit(x_train, y_train)

In [28]:
y_pred = logit3.predict(x_train)
y_predict_proba = logit3.predict_proba(x_train)

In [29]:
prepare.evaluate_clf(logit3, x_train, y_train, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(0.38497652582159625,
           Pred 0  Pred 1
 Actual 0       0     262
 Actual 1       0     164,
                0           1  accuracy   macro avg  weighted avg
 precision    0.0    0.384977  0.384977    0.192488      0.148207
 recall       0.0    1.000000  0.384977    0.500000      0.384977
 f1-score     0.0    0.555932  0.384977    0.277966      0.214021
 support    262.0  164.000000  0.384977  426.000000    426.000000,
                 metric       score
 0             accuracy    0.384977
 1   true_positive_rate    1.000000
 2  false_positive_rate    1.000000
 3   true_negative_rate    0.000000
 4  false_negative_rate    0.000000
 5            precision    0.384977
 6               recall    1.000000
 7             f1_score    0.555932
 8          support_pos  164.000000
 9          support_neg  262.000000)

## 4. Use you best 3 models to predict and evaluate on your validate sample

In [38]:
y_pred1 = logit.predict(x_val)
y_predict_proba1 = logit.predict_proba(x_val)

In [41]:
y_pred2 = logit2.predict(x_val)
y_predict_proba2 = logit2.predict_proba(x_val)

In [44]:
y_pred3 = logit3.predict(x_val)
y_predict_proba3 = logit3.predict_proba(x_val)

In [45]:
print(classification_report(y_val, y_pred1))

print(classification_report(y_val, y_pred2))

print(classification_report(y_val, y_pred2))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        76
           1       0.47      1.00      0.64        67

    accuracy                           0.47       143
   macro avg       0.23      0.50      0.32       143
weighted avg       0.22      0.47      0.30       143

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        76
           1       0.47      1.00      0.64        67

    accuracy                           0.47       143
   macro avg       0.23      0.50      0.32       143
weighted avg       0.22      0.47      0.30       143

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        76
           1       0.47      1.00      0.64        67

    accuracy                           0.47       143
   macro avg       0.23      0.50      0.32       143
weighted avg       0.22      0.47      0.30       143



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
