In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

import os
from env import get_connection

import acquire
import prepare

#### Create a new notebook, logistic_regression, use it to answer the following questions:

1. Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

3. Try out other combinations of features and models.

4. Use your best 3 models to predict and evaluate on your validate sample.

5. Choose your best model based on validation performance, and evaluate it on the test dataset. How do the performance metrics compare to validate? To train?

In [2]:
# Load the raw Titanic data through the project's acquire module and preview
# the first rows to see which columns are available.
titanic = acquire.get_titanic_data()
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
def prep_titanic(df):
    """Return a cleaned copy of ``df`` for the first model.

    The columns listed in ``unused_columns`` are removed (any other column is
    kept) and every remaining missing value is replaced with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw Titanic data. It must contain every column in ``unused_columns``;
        a missing one raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        A new frame. The frame passed in is not modified.
    """
    # Each label appears once; the earlier list repeated 'embarked' and 'deck'.
    unused_columns = [
        'passenger_id', 'sex', 'sibsp', 'parch',
        'embarked', 'class', 'deck', 'embark_town', 'alone',
    ]
    # drop() and fillna() both return new frames, so the caller's data is untouched.
    # NOTE(review): filling missing values with 0 also applies to `age`; an age of 0
    # is not a neutral placeholder for a model - consider a median/mean imputation.
    return df.drop(columns=unused_columns).fillna(value=0)

In [4]:
# Reduce the raw frame to the columns used by the first model.
# NOTE: `titanic` is rebound to the cleaned frame, so the raw data is no longer
# reachable under this name; re-running this cell without re-running the acquire
# cell raises KeyError because the dropped columns are already gone.
titanic = prep_titanic(titanic)
titanic.head()

Unnamed: 0,survived,pclass,age,fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05


In [5]:
# Check dtypes and non-null counts after cleaning.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   age       891 non-null    float64
 3   fare      891 non-null    float64
dtypes: float64(2), int64(2)
memory usage: 28.0 KB


In [6]:
# Split the cleaned frame into three parts, passing 'survived' as the target
# column (presumably used for stratification - confirm in prepare.py).
# NOTE(review): the recorded shapes are train=133, val=134 and test=624 rows, i.e.
# the frame bound to `train` is the smallest and `test` holds about 70% of the 891
# rows. Presumably the unpacking order here does not match the order returned by
# prepare.train_validate_test_split - verify before trusting the metrics below.
train, val, test = prepare.train_validate_test_split(titanic, 'survived')
train.shape, val.shape, test.shape

((133, 4), (134, 4), (624, 4))

In [7]:
# Model 1 inputs: for each split, X_* holds every column except the target and
# y_* holds the `survived` target itself.
X_train_1, y_train_1 = train.drop(columns='survived'), train['survived']
X_val_1, y_val_1 = val.drop(columns='survived'), val['survived']
X_test_1, y_test_1 = test.drop(columns='survived'), test['survived']

In [8]:
# Model 1: logistic regression with scikit-learn's default hyper-parameters;
# the seed is fixed so the run is repeatable.
seed = 42
logit = LogisticRegression(random_state=seed)

In [9]:
# Fit model 1 on the training split.
logit.fit(X_train_1, y_train_1)

In [10]:
# Fitted coefficients, one per column of X_train_1 (pclass, age, fare in this run).
logit.coef_

array([[-0.54766447, -0.00728056,  0.00937862]])

In [11]:
# Fitted intercept (bias term) of model 1.
logit.intercept_

array([0.74969737])

In [48]:
# Mean accuracy of model 1 on the training split; compare against the baseline
# accuracy computed further down.
train_acc_1 = logit.score(X_train_1, y_train_1)
train_acc_1

0.6466165413533834

In [49]:
# Class predictions of model 1 on the training split (input to the report below).
train_preds_1 = logit.predict(X_train_1)

In [50]:
# Class predictions of model 1 on the validate split; evaluated in the
# validation section later in the notebook.
val_preds_1 = logit.predict(X_val_1)

In [51]:
# Per-class precision / recall / f1 for model 1 on the training split,
# rendered as a DataFrame for readability.
report = classification_report(
    y_true=y_train_1,
    y_pred=train_preds_1,
    output_dict=True,
)
pd.DataFrame(report)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.66055,0.583333,0.646617,0.621942,0.630941
recall,0.878049,0.27451,0.646617,0.576279,0.646617
f1-score,0.753927,0.373333,0.646617,0.56363,0.607985
support,82.0,51.0,0.646617,133.0,133.0


In [52]:
# Class counts in the training split; the most frequent class defines the baseline.
train.survived.value_counts()

0    82
1    51
Name: survived, dtype: int64

In [53]:
# mode() returns a Series, so `baseline` holds the most frequent class at position 0.
baseline = y_train_1.mode()
baseline

0    0
Name: survived, dtype: int64

In [54]:

# Baseline: always predict the most frequent class of the training target.
# The class is read from mode() instead of being hard-coded as 0, so the
# baseline stays correct if the majority class of the split ever changes.
baseline_class = y_train_1.mode().iloc[0]
matches_baseline_prediction = (y_train_1 == baseline_class)
# Share of training rows the constant prediction gets right.
baseline_accuracy = matches_baseline_prediction.mean()
baseline_accuracy

0.6165413533834586

2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model

In [55]:
# Fetch the data again: `titanic` no longer has the `sex` column after prep_titanic.
titanic2 = acquire.get_titanic_data()

In [56]:
# Same clean-up as for model 1, except `sex` is kept so it can be encoded.
# Every label is listed once (the earlier list repeated 'embarked' and 'deck'),
# and the result is assigned instead of mutating the frame in place.
# NOTE(review): missing values, including `age`, are replaced with 0.
titanic2 = (
    titanic2
    .drop(columns=[
        'passenger_id', 'sibsp', 'parch',
        'embarked', 'class', 'deck', 'embark_town', 'alone',
    ])
    .fillna(value=0)
)

In [57]:
# Preview: `sex` should still be present next to survived, pclass, age and fare.
titanic2.head()

Unnamed: 0,survived,pclass,sex,age,fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [58]:
# One-hot encode `sex`; drop_first=True leaves a single `male` indicator column
# (female rows are encoded as 0, as the preview shows).
dummies = pd.get_dummies(titanic2['sex'], drop_first=True)
dummies.head()

Unnamed: 0,male
0,1
1,0
2,0
3,0
4,1


In [59]:
# Append the `male` indicator as a new column, aligned on the row index.
# NOTE: re-running this cell on its own adds a second `male` column.
titanic2 = pd.concat(objs=[titanic2, dummies], axis='columns')
titanic2.head()

Unnamed: 0,survived,pclass,sex,age,fare,male
0,0,3,male,22.0,7.25,1
1,1,1,female,38.0,71.2833,0
2,1,3,female,26.0,7.925,0
3,1,1,female,35.0,53.1,0
4,0,3,male,35.0,8.05,1


In [60]:
# The text column is redundant now that `male` encodes it, so remove it.
titanic2 = titanic2.drop(columns='sex')
titanic2.head()

Unnamed: 0,survived,pclass,age,fare,male
0,0,3,22.0,7.25,1
1,1,1,38.0,71.2833,0
2,1,3,26.0,7.925,0
3,1,1,35.0,53.1,0
4,0,3,35.0,8.05,1


In [61]:
# Split the frame that includes `male`. This rebinds `train`, `val` and `test`;
# the model-1 inputs live on in the X_*_1 / y_*_1 variables.
# NOTE(review): the recorded shapes are again train=133, val=134 and test=624 rows,
# so the name `train` is bound to the smallest part - presumably the unpacking
# order does not match what prepare.train_validate_test_split returns; verify.
train, val, test = prepare.train_validate_test_split(titanic2, 'survived')
train.shape, val.shape, test.shape

((133, 5), (134, 5), (624, 5))

In [62]:
# Inputs shared by models 2 and 3: X_* holds every column except the target,
# y_* holds the `survived` target.
X_train, y_train = train.drop(columns='survived'), train['survived']
X_val, y_val = val.drop(columns='survived'), val['survived']
X_test, y_test = test.drop(columns='survived'), test['survived']

In [63]:
# Model 2: same estimator settings as model 1; it is fitted in the next cell on
# the feature set that includes `male`.
seed = 42
logit2 = LogisticRegression(random_state=seed)

In [64]:
# Fit model 2 on the training split.
logit2.fit(X_train, y_train)

In [65]:
# Fitted coefficients, one per column of X_train (pclass, age, fare, male in this run).
logit2.coef_

array([[-0.78047515, -0.00917387,  0.00461435, -2.19038021]])

In [66]:
# Fitted intercept (bias term) of model 2.
logit2.intercept_

array([2.80752691])

In [68]:
# Mean accuracy of model 2 on the training split.
train_acc_2 = logit2.score(X_train, y_train)
train_acc_2

0.7819548872180451

In [69]:
# Training-split predictions of model 2.
# NOTE: `train_preds` is reassigned by the model-3 cell further down.
train_preds = logit2.predict(X_train)

In [70]:
# Validate-split predictions of model 2; evaluated in the validation section.
val_preds_2 = logit2.predict(X_val)

In [71]:
# Per-class metrics on the training split for the predictions currently held
# in `train_preds`.
report = classification_report(
    y_true=y_train,
    y_pred=train_preds,
    output_dict=True,
)
pd.DataFrame(report)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.811765,0.729167,0.781955,0.770466,0.780092
recall,0.841463,0.686275,0.781955,0.763869,0.781955
f1-score,0.826347,0.707071,0.781955,0.766709,0.78061
support,82.0,51.0,0.781955,133.0,133.0


3. Try out other combinations of features and models.

In [72]:
# Model 3: same features as model 2 but with C=0.1. In scikit-learn C is the
# inverse of the regularisation strength, so this penalises large coefficients
# more than the default C=1.0.
seed = 42
logit3 = LogisticRegression(random_state=seed, C=0.1)

In [73]:
# Fit model 3 on the same training split as model 2.
logit3.fit(X_train, y_train)

In [74]:
# Fitted coefficients, one per column of X_train (pclass, age, fare, male in this run).
logit3.coef_

array([[-0.32782663, -0.00559145,  0.01240109, -0.93289979]])

In [75]:
# Fitted intercept (bias term) of model 3.
logit3.intercept_

array([0.70820107])

In [78]:
# Mean accuracy of model 3 on the training split; reused in the final comparison.
train_acc_3 = logit3.score(X_train, y_train)
train_acc_3

0.7368421052631579

In [79]:
# Training-split predictions of model 3 (replaces the model-2 values previously
# held in `train_preds`).
train_preds = logit3.predict(X_train)

In [80]:
# Validate-split predictions of model 3; evaluated in the validation section.
val_preds_3 = logit3.predict(X_val)

In [81]:
# Per-class metrics on the training split for the predictions currently held
# in `train_preds`.
report = classification_report(
    y_true=y_train,
    y_pred=train_preds,
    output_dict=True,
)
pd.DataFrame(report)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.728155,0.766667,0.736842,0.747411,0.742923
recall,0.914634,0.45098,0.736842,0.682807,0.736842
f1-score,0.810811,0.567901,0.736842,0.689356,0.717665
support,82.0,51.0,0.736842,133.0,133.0


4. Use your best 3 models to predict and evaluate on your validate sample.


In [82]:
# Validate-split evaluation of model 1 (pclass, age, fare).
acc_val_1 = logit.score(X_val_1, y_val_1)
print('Model 1')
# The label now names model 1; it previously read "Accuracy 2".
print('val Accuracy 1', acc_val_1)
# val_preds_1 was computed right after model 1 was fitted.
report = classification_report(y_val_1, val_preds_1, output_dict=True)
pd.DataFrame(report)


Model 1
val Accuracy 2 0.6940298507462687


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.71875,0.631579,0.69403,0.675164,0.685573
recall,0.831325,0.470588,0.69403,0.650957,0.69403
f1-score,0.77095,0.539326,0.69403,0.655138,0.682794
support,83.0,51.0,0.69403,134.0,134.0


In [83]:
# Validate-split evaluation of model 2: compute accuracy and per-class metrics
# first, then print the headline figures and display the full report.
val_acc_2 = logit2.score(X=X_val, y=y_val)
report = classification_report(y_true=y_val, y_pred=val_preds_2, output_dict=True)

print('Model 2')
print('val Accuracy 2', val_acc_2)
pd.DataFrame(report)

Model 2
val Accuracy 2 0.7089552238805971


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.768293,0.615385,0.708955,0.691839,0.710096
recall,0.759036,0.627451,0.708955,0.693244,0.708955
f1-score,0.763636,0.621359,0.708955,0.692498,0.709486
support,83.0,51.0,0.708955,134.0,134.0


In [84]:
# Validate-split evaluation of model 3: compute accuracy and per-class metrics
# first, then print the headline figures and display the full report.
val_acc_3 = logit3.score(X=X_val, y=y_val)
report = classification_report(y_true=y_val, y_pred=val_preds_3, output_dict=True)

print('Model 3')
print('Val Accuracy 3', val_acc_3)
pd.DataFrame(report)

Model 3
Val Accuracy 3 0.746268656716418


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.728972,0.814815,0.746269,0.771893,0.761643
recall,0.939759,0.431373,0.746269,0.685566,0.746269
f1-score,0.821053,0.564103,0.746269,0.692578,0.723258
support,83.0,51.0,0.746269,134.0,134.0


5. Choose your best model based on validation performance, and evaluate it on the test dataset. How do the performance metrics compare to validate? To train?

In [85]:
# Final check: model 3 had the highest validate accuracy, so it alone is
# evaluated on the held-out test split.
test_acc_3 = logit3.score(X=X_test, y=y_test)
test_preds_3 = logit3.predict(X=X_test)
report = classification_report(y_true=y_test, y_pred=test_preds_3, output_dict=True)

print('Model 3')
print('Test Accuracy 3', test_acc_3)
pd.DataFrame(report)

Model 3
Test Accuracy 3 0.7628205128205128


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.757642,0.777108,0.762821,0.767375,0.765129
recall,0.903646,0.5375,0.762821,0.720573,0.762821
f1-score,0.824228,0.635468,0.762821,0.729848,0.751628
support,384.0,240.0,0.762821,624.0,624.0


In [90]:
# Side-by-side accuracy of model 3 on the three splits.
for label, accuracy in (
    ('Train Accuracy model 3 : ', train_acc_3),
    ('Val Accuracy model 3   : ', val_acc_3),
    ('Test Accuracy model 3  : ', test_acc_3),
):
    print(label, accuracy)

Train Accuracy model 3 :  0.7368421052631579
Val Accuracy model 3   :  0.746268656716418
Test Accuracy model 3  :  0.7628205128205128
