In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
import sklearn.impute
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import acquire
import prepare
import split_scale

## Classification Modeling Exercise

In this exercise, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

For all of the models you create, choose a threshold that optimizes for accuracy.

Do your work for these exercises in either a notebook or a python script named model within your classification directory.

1. Create another model that includes age in addition to fare and pclass. Does this model perform better than your previous one?
2. Include sex in your model as well. Note that you'll need to encode this feature before including it in a model.
3. Try out other combinations of features and models.
4. Choose you best model and evaluate it on the test dataset. Is it overfit?
5. **Bonus**: How do different strategies for handling the missing values in the age column affect model performance?
6. **Bonus**: How do different strategies for encoding sex affect model performance?
7. **Bonus**: scikit-learn's LogisticRegression classifier is actually applying a regularization penalty to the coefficients by default. This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. This value can be modified with the C hyper parameter. Small values of C correspond to a larger penalty, and large values of C correspond to a smaller penalty.

Try out the following values for C and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected.

$C=.01,.1,1,10,100,1000$

- 8. **Bonus Bonus**: how does scaling the data interact with your choice of C?

In [2]:
df = acquire.get_titanic_data()

In [3]:
df = df.drop(columns="deck")

In [4]:
# fix embarktown and embarked

df.embark_town = df.embark_town.fillna('Southampton')
df.embarked = df.embarked.fillna('S')

In [5]:
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1


### Initial Model

In [6]:
train, test = train_test_split(df, random_state=123, train_size=.8)
train, validate = train_test_split(train, train_size=.8)

X = train[['pclass', 'fare']]
y = train[['survived']]

logit1 = LogisticRegression(random_state=123).fit(X, y)

In [7]:
logit1.predict(X)

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,

In [8]:
logit1.predict_proba(X)

array([[0.59452322, 0.40547678],
       [0.74208195, 0.25791805],
       [0.41651013, 0.58348987],
       ...,
       [0.55831014, 0.44168986],
       [0.73450482, 0.26549518],
       [0.35209   , 0.64791   ]])

In [9]:
logit1.score(X, y)

0.6748681898066784

### 1. Create another model that includes age in addition to fare and pclass. Does this model perform better than your previous one?

In [10]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
626,626,0,2,male,57.0,0,0,12.35,Q,Second,Queenstown,1
107,107,1,3,male,,0,0,7.775,S,Third,Southampton,1
168,168,0,1,male,,0,0,25.925,S,First,Southampton,1
666,666,0,2,male,25.0,0,0,13.0,S,Second,Southampton,1
25,25,1,3,female,38.0,1,5,31.3875,S,Third,Southampton,0


In [11]:
imputer = sklearn.impute.SimpleImputer(strategy='mean')
imputer.fit(train[['age']])
train.age = imputer.transform(train[['age']])
validate.age = imputer.transform(validate[['age']])
test.age = imputer.transform(test[['age']])

In [12]:
train.isna().sum()

passenger_id    0
survived        0
pclass          0
sex             0
age             0
sibsp           0
parch           0
fare            0
embarked        0
class           0
embark_town     0
alone           0
dtype: int64

In [13]:
print('    test: %d rows x %d columns' % test.shape)
print('   train: %d rows x %d columns' % train.shape)
print('validate: %d rows x %d columns' % validate.shape)

    test: 179 rows x 12 columns
   train: 569 rows x 12 columns
validate: 143 rows x 12 columns


In [14]:
X = train[['pclass', 'fare', 'age']]
y = train[['survived']]

logit2 = LogisticRegression(random_state=123).fit(X, y)

In [15]:
logit2.predict(X)

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,

In [16]:
logit2.predict_proba(X)

array([[0.7650884 , 0.2349116 ],
       [0.7627991 , 0.2372009 ],
       [0.37880114, 0.62119886],
       ...,
       [0.49360192, 0.50639808],
       [0.7556265 , 0.2443735 ],
       [0.37980071, 0.62019929]])

In [17]:
logit2.score(X, y)

0.7047451669595782

In [18]:
X_validate = validate[['pclass', 'fare', 'age']]
y_validate = validate[['survived']]

In [19]:
logit2.score(X_validate, y_validate)

0.6713286713286714

### 2. Include sex in your model as well. Note that you'll need to encode this feature before including it in a model.

In [20]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
626,626,0,2,male,57.0,0,0,12.35,Q,Second,Queenstown,1
107,107,1,3,male,29.831314,0,0,7.775,S,Third,Southampton,1
168,168,0,1,male,29.831314,0,0,25.925,S,First,Southampton,1
666,666,0,2,male,25.0,0,0,13.0,S,Second,Southampton,1
25,25,1,3,female,38.0,1,5,31.3875,S,Third,Southampton,0


In [21]:
# encode sex for both train and test
le = LabelEncoder()
train['sex_encoder'] = le.fit_transform(train.sex)
validate['sex_encoder'] = le.fit_transform(validate.sex)
test['sex_encoder'] = le.transform(test.sex)

In [22]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,sex_encoder
626,626,0,2,male,57.0,0,0,12.35,Q,Second,Queenstown,1,1
107,107,1,3,male,29.831314,0,0,7.775,S,Third,Southampton,1,1
168,168,0,1,male,29.831314,0,0,25.925,S,First,Southampton,1,1
666,666,0,2,male,25.0,0,0,13.0,S,Second,Southampton,1,1
25,25,1,3,female,38.0,1,5,31.3875,S,Third,Southampton,0,0


In [23]:
print('    test: %d rows x %d columns' % test.shape)
print('   train: %d rows x %d columns' % train.shape)
print('validate: %d rows x %d columns' % validate.shape)

    test: 179 rows x 13 columns
   train: 569 rows x 13 columns
validate: 143 rows x 13 columns


In [24]:
X = train[['pclass', 'fare', 'age', 'sex_encoder']]
y = train[['survived']]

logit3 = LogisticRegression(random_state=123).fit(X, y)

In [25]:
logit3.predict(X)

array([0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1,

In [26]:
logit3.predict_proba(X)

array([[0.85080261, 0.14919739],
       [0.87827917, 0.12172083],
       [0.58733105, 0.41266895],
       ...,
       [0.2291716 , 0.7708284 ],
       [0.45217206, 0.54782794],
       [0.14782348, 0.85217652]])

In [27]:
logit3.score(X, y)

0.7820738137082601

In [28]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder']]
y_validate = validate[['survived']]

logit3.score(X_validate, y_validate)

0.8181818181818182

### 3. Try out other combinations of features and models.

In [29]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,sex_encoder
626,626,0,2,male,57.0,0,0,12.35,Q,Second,Queenstown,1,1
107,107,1,3,male,29.831314,0,0,7.775,S,Third,Southampton,1,1
168,168,0,1,male,29.831314,0,0,25.925,S,First,Southampton,1,1
666,666,0,2,male,25.0,0,0,13.0,S,Second,Southampton,1,1
25,25,1,3,female,38.0,1,5,31.3875,S,Third,Southampton,0,0


In [30]:
# encoding embarked

encoder = sklearn.preprocessing.OneHotEncoder()
encoder.fit(train[["embarked"]])

m1 = encoder.transform(train[["embarked"]]).todense()

train = pd.concat([train, pd.DataFrame(m1, columns=encoder.categories_[0], index=train.index)], axis=1)

m2 = encoder.transform(validate[["embarked"]]).todense()

validate = pd.concat([validate, pd.DataFrame(m2, columns=encoder.categories_[0], index=validate.index)], axis=1)

m3 = encoder.transform(test[["embarked"]]).todense()

test = pd.concat([test, pd.DataFrame(m3, columns=encoder.categories_[0], index=test.index)], axis=1)

In [31]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,sex_encoder,C,Q,S
626,626,0,2,male,57.0,0,0,12.35,Q,Second,Queenstown,1,1,0.0,1.0,0.0
107,107,1,3,male,29.831314,0,0,7.775,S,Third,Southampton,1,1,0.0,0.0,1.0
168,168,0,1,male,29.831314,0,0,25.925,S,First,Southampton,1,1,0.0,0.0,1.0
666,666,0,2,male,25.0,0,0,13.0,S,Second,Southampton,1,1,0.0,0.0,1.0
25,25,1,3,female,38.0,1,5,31.3875,S,Third,Southampton,0,0,0.0,0.0,1.0


In [32]:
print('    test: %d rows x %d columns' % test.shape)
print('   train: %d rows x %d columns' % train.shape)
print('validate: %d rows x %d columns' % validate.shape)

    test: 179 rows x 16 columns
   train: 569 rows x 16 columns
validate: 143 rows x 16 columns


In [33]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

logit4 = LogisticRegression(random_state=123).fit(X, y)

In [34]:
logit4.predict(X)

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1,

In [35]:
logit4.predict_proba(X)

array([[0.84154407, 0.15845593],
       [0.88921986, 0.11078014],
       [0.60106849, 0.39893151],
       ...,
       [0.17136664, 0.82863336],
       [0.42741093, 0.57258907],
       [0.163135  , 0.836865  ]])

In [36]:
logit4.score(X, y)

0.7803163444639719

In [37]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_validate = validate[['survived']]

logit4.score(X_validate, y_validate)

0.8181818181818182

### 4. Choose you best model and evaluate it on the test dataset. Is it overfit?

In [38]:
X_test = test[['pclass', 'fare']]
y_test = test[['survived']]

logit1.score(X_test, y_test)

0.7039106145251397

In [39]:
X_test = test[['pclass', 'fare', 'age']]
y_test = test[['survived']]

logit2.score(X_test, y_test)

0.7262569832402235

In [40]:
X_test = test[['pclass', 'fare', 'age', 'sex_encoder']]
y_test = test[['survived']]

logit3.score(X_test, y_test)

0.8044692737430168

In [41]:
X_test = test[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_test = test[['survived']]

logit4.score(X_test, y_test)

0.8100558659217877

#### Looks like model 4 is the best fit without overfitting for the train set.

### 5. **Bonus**: How do different strategies for handling the missing values in the age column affect model performance?

By choosing to drop all the NA's caused major issues within the models. By imputing values the models were able to run without issue.

### 6. **Bonus**: How do different strategies for encoding sex affect model performance?

Because the choice is binary, the difference between One-Hot and Label encoding will result in similar findings.

### 7. **Bonus**: scikit-learn's LogisticRegression classifier is actually applying a regularization penalty to the coefficients by default. This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. This value can be modified with the C hyper parameter. Small values of C correspond to a larger penalty, and large values of C correspond to a smaller penalty.

> Try out the following values for C and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected.

> $C=.01,.1,1,10,100,1000$

In [42]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

logit5 = LogisticRegression(random_state=123, C=.01).fit(X, y)

In [43]:
logit5.score(X, y)

0.7065026362038664

In [44]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_validate = validate[['survived']]

logit5.score(X_validate, y_validate)

0.6853146853146853

In [45]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

logit6 = LogisticRegression(random_state=123, C=.1).fit(X, y)
logit6.score(X, y)

0.7662565905096661

In [46]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_validate = validate[['survived']]

logit6.score(X_validate, y_validate)

0.7552447552447552

In [47]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

logit7 = LogisticRegression(random_state=123, C=1.0).fit(X, y)
logit7.score(X, y)

0.7803163444639719

In [48]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_validate = validate[['survived']]

logit7.score(X_validate, y_validate)

0.8181818181818182

In [49]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

logit8 = LogisticRegression(random_state=123, C=10.0).fit(X, y)
logit8.score(X, y)

0.7768014059753954

In [50]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_validate = validate[['survived']]

logit8.score(X_validate, y_validate)

0.8041958041958042

In [51]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

logit9 = LogisticRegression(random_state=123, C=100.0).fit(X, y)
logit9.score(X, y)

0.7768014059753954

In [52]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_validate = validate[['survived']]

logit9.score(X_validate, y_validate)

0.8041958041958042

In [53]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

logit10 = LogisticRegression(random_state=123, C=1000.0).fit(X, y)
logit10.score(X, y)

0.7768014059753954

In [54]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_validate = validate[['survived']]

logit10.score(X_validate, y_validate)

0.7972027972027972

### - 8. **Bonus Bonus**: how does scaling the data interact with your choice of C?

In [59]:
train_scaled = train[['age', 'fare']]
validate_scaled = validate[['age', 'fare']]
scaler, train_scaled, validate_scaled = split_scale.min_max_scaler(train_scaled, validate_scaled)
train_scaled.head()

Unnamed: 0,age,fare
626,0.768959,0.024106
107,0.399719,0.015176
168,0.399719,0.050602
666,0.334058,0.025374
25,0.510737,0.061264


In [63]:
X = train_scaled[['fare', 'age']]
y = train[['survived']]

logit11 = LogisticRegression(random_state=123, C=.01).fit(X, y)
logit11.score(X, y)

0.6115992970123023

In [64]:
X = train_scaled[['fare', 'age']]
y = train[['survived']]

logit11 = LogisticRegression(random_state=123, C=.10).fit(X, y)
logit11.score(X, y)

0.6151142355008787

In [65]:
X = train_scaled[['fare', 'age']]
y = train[['survived']]

logit11 = LogisticRegression(random_state=123, C=1.0).fit(X, y)
logit11.score(X, y)

0.6362038664323374

In [66]:
X = train_scaled[['fare', 'age']]
y = train[['survived']]

logit11 = LogisticRegression(random_state=123, C=10.0).fit(X, y)
logit11.score(X, y)

0.6502636203866432

In [68]:
X = train_scaled[['fare', 'age']]
y = train[['survived']]

logit11 = LogisticRegression(random_state=123, C=100.0).fit(X, y)
logit11.score(X, y)

0.655536028119508

In [70]:
X = train_scaled[['fare', 'age']]
y = train[['survived']]

logit11 = LogisticRegression(random_state=123, C=1000.0).fit(X, y)
logit11.score(X, y)

0.6572934973637962