In [1]:
import pandas as pd
import numpy as np

import prepare

from sklearn.linear_model import LogisticRegression
from prepare import prep_titanic
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score

seed = 55

### Exercises
In these exercises, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

For all of the models you create, choose a threshold that optimizes for accuracy.

Create a new notebook, logistic_regression, use it to answer the following questions:



In [2]:
titan = prep_titanic()
titan['age'] = titan['age'].fillna(28)

In [3]:
train , val, test = prepare.train_val_test(titan , 'survived')

In [4]:
baseline = 1 - train.survived.mean()

In [5]:
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
519,0,3,male,32.0,0,0,7.8958,Southampton,1
441,0,3,male,20.0,0,0,9.5,Southampton,1
43,1,2,female,3.0,1,2,41.5792,Cherbourg,0
341,1,1,female,24.0,3,2,263.0,Southampton,0
664,1,3,male,20.0,1,0,7.925,Southampton,0


#### 1. Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

In [6]:
x_train = train[['age','fare', 'pclass']]
y_train = train.survived
x_val = val[['age','fare', 'pclass']]
y_val = val.survived

In [7]:
logreg1 = LogisticRegression(random_state = seed, max_iter = 400)

logreg1.fit(x_train, y_train)

logreg1.score(x_train, y_train), logreg1.score(x_val, y_val)

(0.7142857142857143, 0.664179104477612)

They both perform better than baseline but only slightly

#### 2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [8]:
titan = pd.get_dummies(titan, columns = ['sex'], drop_first = True)

In [9]:
train , val, test = prepare.train_val_test(titan , 'survived')

In [20]:
x_train = train[['age','fare', 'pclass','sex_male']]
y_train = train.survived
x_val = val[['age','fare', 'pclass','sex_male']]
y_val = val.survived

In [21]:
logreg2 = LogisticRegression(random_state = seed, max_iter = 400)

logreg2.fit(x_train, y_train)

logreg2.score(x_train, y_train), logreg2.score(x_val, y_val)

(0.7784911717495987, 0.7761194029850746)

In [22]:
p_train_2 = logreg2.predict(x_train)
p_val_2 = logreg2.predict(x_val)

#### 3. Try out other combinations of features and models.

In [12]:
x_train = train[['pclass', 'alone','sex_male']]
y_train = train.survived
x_val = val[['pclass', 'alone','sex_male']]
y_val = val.survived

In [13]:
logreg3 = LogisticRegression(random_state = seed, max_iter = 400)

logreg3.fit(x_train, y_train)

logreg3.score(x_train, y_train), logreg3.score(x_val, y_val)

(0.7768860353130016, 0.7985074626865671)

Basic with everything but target and port.

In [14]:
x_train = train.drop(columns = ['survived','embark_town'])
y_train = train.survived
x_val = val.drop(columns = ['survived','embark_town'])
y_val = val.survived

In [15]:
logreg4 = LogisticRegression(random_state = seed, max_iter = 400)

logreg4.fit(x_train, y_train)

logreg4.score(x_train, y_train), logreg4.score(x_val, y_val)

(0.8057784911717496, 0.8059701492537313)

Alone and pclass

In [16]:
x_train = train[['pclass', 'alone']]
y_train = train.survived
x_val = val[['pclass', 'alone']]
y_val = val.survived

In [17]:
logreg5 = LogisticRegression(random_state = seed, max_iter = 400)

logreg5.fit(x_train, y_train)

logreg5.score(x_train, y_train), logreg5.score(x_val, y_val)

(0.7030497592295345, 0.6791044776119403)

#### 4. Use you best 3 models to predict and evaluate on your validate sample.

In [None]:
# going to do this above because i don't want to input any more code above, and just call the values here.
# The 

In [19]:
p_train_2 = logreg2.predict(x_train)
p_val_2 = logreg2.predict(x_val)
p_train_3 = logreg3.predict(x_train)
p_val_3 = logreg3.predict(x_val)
p_train_4 = logreg4.predict(x_train)
p_val_4 = logreg4.predict(x_val)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- alone
Feature names seen at fit time, yet now missing:
- age
- fare
- sex_male


#### 6. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?