In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
import sklearn.impute
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import acquire
import prepare
import split_scale

# Classification Modeling Exercise

In this exercise, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

For all of the models you create, choose a threshold that optimizes for accuracy.

Do your work for these exercises in either a notebook or a python script named model within your classification directory.

1. Create another model that includes age in addition to fare and pclass. Does this model perform better than your previous one?
2. Include sex in your model as well. Note that you'll need to encode this feature before including it in a model.
3. Try out other combinations of features and models.
4. Choose you best model and evaluate it on the test dataset. Is it overfit?
5. **Bonus**: How do different strategies for handling the missing values in the age column affect model performance?
6. **Bonus**: How do different strategies for encoding sex affect model performance?
7. **Bonus**: scikit-learn's LogisticRegression classifier is actually applying a regularization penalty to the coefficients by default. This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. This value can be modified with the C hyper parameter. Small values of C correspond to a larger penalty, and large values of C correspond to a smaller penalty.

Try out the following values for C and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected.

$C=.01,.1,1,10,100,1000$

- 8. **Bonus Bonus**: how does scaling the data interact with your choice of C?

In [2]:
df = acquire.get_titanic_data()

In [3]:
df = df.drop(columns="deck")

In [4]:
# fix embarktown and embarked

df.embark_town = df.embark_town.fillna('Southampton')
df.embarked = df.embarked.fillna('S')

In [5]:
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1


### Initial Model

In [6]:
train, test = train_test_split(df, random_state=123, train_size=.8)
train, validate = train_test_split(train, train_size=.8)

X = train[['pclass', 'fare']]
y = train[['survived']]

logit1 = LogisticRegression(random_state=123).fit(X, y)

In [7]:
logit1.predict(X)

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,

In [8]:
logit1.predict_proba(X)

array([[0.35329857, 0.64670143],
       [0.58564404, 0.41435596],
       [0.57378206, 0.42621794],
       ...,
       [0.74373341, 0.25626659],
       [0.29445323, 0.70554677],
       [0.57378206, 0.42621794]])

In [9]:
logit1.score(X, y)

0.6766256590509666

### 1. Create another model that includes age in addition to fare and pclass. Does this model perform better than your previous one?

In [10]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
262,262,0,1,male,52.0,1,1,79.65,S,First,Southampton,0
733,733,0,2,male,23.0,0,0,13.0,S,Second,Southampton,1
426,426,1,2,female,28.0,1,0,26.0,S,Second,Southampton,0
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0
248,248,1,1,male,37.0,1,1,52.5542,S,First,Southampton,0


In [11]:
imputer = sklearn.impute.SimpleImputer(strategy='mean')
imputer.fit(train[['age']])
train.age = imputer.transform(train[['age']])
validate.age = imputer.transform(validate[['age']])
test.age = imputer.transform(test[['age']])

In [12]:
train.isna().sum()

passenger_id    0
survived        0
pclass          0
sex             0
age             0
sibsp           0
parch           0
fare            0
embarked        0
class           0
embark_town     0
alone           0
dtype: int64

In [13]:
print('    test: %d rows x %d columns' % test.shape)
print('   train: %d rows x %d columns' % train.shape)
print('validate: %d rows x %d columns' % validate.shape)

    test: 179 rows x 12 columns
   train: 569 rows x 12 columns
validate: 143 rows x 12 columns


In [14]:
X = train[['pclass', 'fare', 'age']]
y = train[['survived']]

logit2 = LogisticRegression(random_state=123).fit(X, y)

In [15]:
logit2.predict(X)

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,

In [16]:
logit2.predict_proba(X)

array([[0.48074256, 0.51925744],
       [0.52846291, 0.47153709],
       [0.55543288, 0.44456712],
       ...,
       [0.8764336 , 0.1235664 ],
       [0.13987057, 0.86012943],
       [0.70799871, 0.29200129]])

In [17]:
logit2.score(X, y)

0.7012302284710018

In [18]:
X_validate = validate[['pclass', 'fare', 'age']]
y_validate = validate[['survived']]

In [19]:
logit2.score(X_validate, y_validate)

0.6643356643356644

### 2. Include sex in your model as well. Note that you'll need to encode this feature before including it in a model.

In [20]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
262,262,0,1,male,52.0,1,1,79.65,S,First,Southampton,0
733,733,0,2,male,23.0,0,0,13.0,S,Second,Southampton,1
426,426,1,2,female,28.0,1,0,26.0,S,Second,Southampton,0
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0
248,248,1,1,male,37.0,1,1,52.5542,S,First,Southampton,0


In [21]:
# encode sex for both train and test
le = LabelEncoder()
train['sex_encoder'] = le.fit_transform(train.sex)
validate['sex_encoder'] = le.fit_transform(validate.sex)
test['sex_encoder'] = le.transform(test.sex)

In [22]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,sex_encoder
262,262,0,1,male,52.0,1,1,79.65,S,First,Southampton,0,1
733,733,0,2,male,23.0,0,0,13.0,S,Second,Southampton,1,1
426,426,1,2,female,28.0,1,0,26.0,S,Second,Southampton,0,0
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,1
248,248,1,1,male,37.0,1,1,52.5542,S,First,Southampton,0,1


In [23]:
print('    test: %d rows x %d columns' % test.shape)
print('   train: %d rows x %d columns' % train.shape)
print('validate: %d rows x %d columns' % validate.shape)

    test: 179 rows x 13 columns
   train: 569 rows x 13 columns
validate: 143 rows x 13 columns


In [24]:
X = train[['pclass', 'fare', 'age', 'sex_encoder']]
y = train[['survived']]

logit3 = LogisticRegression(random_state=123).fit(X, y)

In [25]:
logit3.predict(X)

array([0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,

In [26]:
logit3.predict_proba(X)

array([[0.66207621, 0.33792379],
       [0.73217933, 0.26782067],
       [0.24377127, 0.75622873],
       ...,
       [0.92965552, 0.07034448],
       [0.06257723, 0.93742277],
       [0.34019546, 0.65980454]])

In [27]:
logit3.score(X, y)

0.7855887521968365

In [28]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder']]
y_validate = validate[['survived']]

logit3.score(X_validate, y_validate)

0.8251748251748252

### 3. Try out other combinations of features and models.

In [29]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,sex_encoder
262,262,0,1,male,52.0,1,1,79.65,S,First,Southampton,0,1
733,733,0,2,male,23.0,0,0,13.0,S,Second,Southampton,1,1
426,426,1,2,female,28.0,1,0,26.0,S,Second,Southampton,0,0
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,1
248,248,1,1,male,37.0,1,1,52.5542,S,First,Southampton,0,1


In [30]:
# encoding embarked

encoder = sklearn.preprocessing.OneHotEncoder()
encoder.fit(train[["embarked"]])

m1 = encoder.transform(train[["embarked"]]).todense()

train = pd.concat([train, pd.DataFrame(m1, columns=encoder.categories_[0], index=train.index)], axis=1)

m2 = encoder.transform(validate[["embarked"]]).todense()

validate = pd.concat([validate, pd.DataFrame(m2, columns=encoder.categories_[0], index=validate.index)], axis=1)

m3 = encoder.transform(test[["embarked"]]).todense()

test = pd.concat([test, pd.DataFrame(m3, columns=encoder.categories_[0], index=test.index)], axis=1)

In [31]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,sex_encoder,C,Q,S
262,262,0,1,male,52.0,1,1,79.65,S,First,Southampton,0,1,0.0,0.0,1.0
733,733,0,2,male,23.0,0,0,13.0,S,Second,Southampton,1,1,0.0,0.0,1.0
426,426,1,2,female,28.0,1,0,26.0,S,Second,Southampton,0,0,0.0,0.0,1.0
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,1,0.0,0.0,1.0
248,248,1,1,male,37.0,1,1,52.5542,S,First,Southampton,0,1,0.0,0.0,1.0


In [32]:
print('    test: %d rows x %d columns' % test.shape)
print('   train: %d rows x %d columns' % train.shape)
print('validate: %d rows x %d columns' % validate.shape)

    test: 179 rows x 16 columns
   train: 569 rows x 16 columns
validate: 143 rows x 16 columns


In [33]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

logit4 = LogisticRegression(random_state=123).fit(X, y)

In [34]:
logit4.predict(X)

array([0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [35]:
logit4.predict_proba(X)

array([[0.68201839, 0.31798161],
       [0.73344056, 0.26655944],
       [0.24277688, 0.75722312],
       ...,
       [0.9361803 , 0.0638197 ],
       [0.06322297, 0.93677703],
       [0.34952696, 0.65047304]])

In [36]:
logit4.score(X, y)

0.7855887521968365

In [37]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_validate = validate[['survived']]

logit4.score(X_validate, y_validate)

0.8181818181818182

### 4. Choose you best model and evaluate it on the test dataset. Is it overfit?

In [38]:
X_test = test[['pclass', 'fare']]
y_test = test[['survived']]

logit1.score(X_test, y_test)

0.7039106145251397

In [39]:
X_test = test[['pclass', 'fare', 'age']]
y_test = test[['survived']]

logit2.score(X_test, y_test)

0.7318435754189944

In [40]:
X_test = test[['pclass', 'fare', 'age', 'sex_encoder']]
y_test = test[['survived']]

logit3.score(X_test, y_test)

0.8044692737430168

In [41]:
X_test = test[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_test = test[['survived']]

logit4.score(X_test, y_test)

0.8100558659217877

#### Looks like model 4 is the best fit without overfitting for the train set.

### 5. **Bonus**: How do different strategies for handling the missing values in the age column affect model performance?

By choosing to drop all the NA's caused major issues within the models. By imputing values the models were able to run without issue.

### 6. **Bonus**: How do different strategies for encoding sex affect model performance?

Because the choice is binary, the difference between One-Hot and Label encoding will result in similar findings.

### 7. **Bonus**: scikit-learn's LogisticRegression classifier is actually applying a regularization penalty to the coefficients by default. This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. This value can be modified with the C hyper parameter. Small values of C correspond to a larger penalty, and large values of C correspond to a smaller penalty.

> Try out the following values for C and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected.

> $C=.01,.1,1,10,100,1000$

In [42]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

logit5 = LogisticRegression(random_state=123, C=.01).fit(X, y)

In [43]:
logit5.score(X, y)

0.6977152899824253

In [44]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_validate = validate[['survived']]

logit5.score(X_validate, y_validate)

0.7062937062937062

In [45]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

logit6 = LogisticRegression(random_state=123, C=.1).fit(X, y)
logit6.score(X, y)

0.7609841827768014

In [46]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_validate = validate[['survived']]

logit6.score(X_validate, y_validate)

0.7972027972027972

In [47]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

logit7 = LogisticRegression(random_state=123, C=1.0).fit(X, y)
logit7.score(X, y)

0.7855887521968365

In [48]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_validate = validate[['survived']]

logit7.score(X_validate, y_validate)

0.8181818181818182

In [49]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

logit8 = LogisticRegression(random_state=123, C=10.0).fit(X, y)
logit8.score(X, y)

0.7855887521968365

In [50]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_validate = validate[['survived']]

logit8.score(X_validate, y_validate)

0.8111888111888111

In [51]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

logit9 = LogisticRegression(random_state=123, C=100.0).fit(X, y)
logit9.score(X, y)

0.7873462214411248

In [52]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_validate = validate[['survived']]

logit9.score(X_validate, y_validate)

0.8111888111888111

In [53]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

logit10 = LogisticRegression(random_state=123, C=1000.0).fit(X, y)
logit10.score(X, y)

0.7855887521968365

In [54]:
X_validate = validate[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y_validate = validate[['survived']]

logit10.score(X_validate, y_validate)

0.8111888111888111

### - 8. **Bonus Bonus**: how does scaling the data interact with your choice of C?

In [55]:
train_scaled = train[['age', 'fare']]
validate_scaled = validate[['age', 'fare']]
scaler, train_scaled, validate_scaled = split_scale.min_max_scaler(train_scaled, validate_scaled)
train_scaled.head()

Unnamed: 0,age,fare
262,0.647044,0.155466
733,0.281482,0.025374
426,0.34451,0.050749
0,0.268877,0.014151
248,0.45796,0.102579


In [56]:
X = train_scaled[['fare', 'age']]
y = train[['survived']]

logit11 = LogisticRegression(random_state=123, C=.01).fit(X, y)
logit11.score(X, y)

0.6133567662565905

In [57]:
X = train_scaled[['fare', 'age']]
y = train[['survived']]

logit11 = LogisticRegression(random_state=123, C=.10).fit(X, y)
logit11.score(X, y)

0.616871704745167

In [58]:
X = train_scaled[['fare', 'age']]
y = train[['survived']]

logit11 = LogisticRegression(random_state=123, C=1.0).fit(X, y)
logit11.score(X, y)

0.6274165202108963

In [59]:
X = train_scaled[['fare', 'age']]
y = train[['survived']]

logit11 = LogisticRegression(random_state=123, C=10.0).fit(X, y)
logit11.score(X, y)

0.655536028119508

In [60]:
X = train_scaled[['fare', 'age']]
y = train[['survived']]

logit11 = LogisticRegression(random_state=123, C=100.0).fit(X, y)
logit11.score(X, y)

0.6449912126537786

In [61]:
X = train_scaled[['fare', 'age']]
y = train[['survived']]

logit11 = LogisticRegression(random_state=123, C=1000.0).fit(X, y)
logit11.score(X, y)

0.6449912126537786

# Class Logic Regression Breakdown Review Section

# Classification Decision Tree Exercises

1. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)
1. Evaluate your in-sample results using the model score, confusion matrix, and classification report.
1. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
1. Run through steps 2-4 using entropy as your measure of impurity.
1. Which performs better on your in-sample data?

In [62]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [63]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,sex_encoder,C,Q,S
262,262,0,1,male,52.0,1,1,79.65,S,First,Southampton,0,1,0.0,0.0,1.0
733,733,0,2,male,23.0,0,0,13.0,S,Second,Southampton,1,1,0.0,0.0,1.0
426,426,1,2,female,28.0,1,0,26.0,S,Second,Southampton,0,0,0.0,0.0,1.0
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,1,0.0,0.0,1.0
248,248,1,1,male,37.0,1,1,52.5542,S,First,Southampton,0,1,0.0,0.0,1.0


In [64]:
X = train[['pclass', 'fare', 'age', 'sex_encoder', 'C', 'Q', 'S']]
y = train[['survived']]

In [65]:
clf = DecisionTreeClassifier(max_depth=7, random_state=123)
clf

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

In [66]:
clf.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

In [67]:
y_pred = clf.predict(X)
y_pred_proba = clf.predict_proba(X)

In [68]:
y_pred[0:10]

array([0, 0, 1, 0, 0, 0, 1, 0, 1, 1])

In [69]:
y_pred_proba

array([[1.        , 0.        ],
       [0.85789474, 0.14210526],
       [0.        , 1.        ],
       ...,
       [0.85789474, 0.14210526],
       [1.        , 0.        ],
       [0.16666667, 0.83333333]])

In [70]:
#confusion_matrix = pd.DataFrame(confusion_matrix(y, y_pred))
#confusion_matrix.index.name = 'actual'
confusion_matrix

<function sklearn.metrics.classification.confusion_matrix(y_true, y_pred, labels=None, sample_weight=None)>

In [71]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(clf.score(X, y)))

Accuracy of Decision Tree classifier on training set: 0.88


In [72]:
pd.DataFrame(classification_report(y, y_pred, output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.871391,0.909574,0.884007,0.890483,0.886154
recall,0.951289,0.777273,0.884007,0.864281,0.884007
f1-score,0.909589,0.838235,0.884007,0.873912,0.882001
support,349.0,220.0,0.884007,569.0,569.0


In [73]:
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on test set: 0.83


In [74]:
import graphviz
from graphviz import Graph

clf = DecisionTreeClassifier()
clf = clf.fit(X, y)

dot_data = export_graphviz(clf, out_file=None) 
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

In [75]:
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on test set: 0.83


# Classification Random Forest Exercises

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 20.
2. Evaluate your results using the model score, confusion matrix, and classification report.
3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
4. Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.
5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [76]:
from sklearn.ensemble import RandomForestClassifier

In [77]:
titanic = acquire.get_titanic_data()

In [78]:
def encode_sex(df):
    '''
    Returns a new dataframe with the ``sex`` column encoded.
    '''
    return df.assign(
        sex=(df.sex == 'female').astype(int)
    )

In [79]:
titanic = encode_sex(titanic)
titanic.age = titanic.age.fillna(titanic.age.mean()).astype("int")
titanic = titanic.drop(columns=["passenger_id", "embarked", "class", "deck", "embark_town"])

In [80]:
train, test = train_test_split(titanic)

In [81]:
X_train = train.drop(columns="survived")
y_train = train["survived"]
X_test = test.drop(columns="survived")
y_test = test["survived"]

In [82]:
y_train.shape

(668,)

In [83]:
rf = RandomForestClassifier(random_state= 123, min_samples_leaf = 1, max_depth = 20)

rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=20, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [84]:
rf.feature_importances_

array([0.08903795, 0.24683774, 0.28296102, 0.0538293 , 0.03762438,
       0.27789037, 0.01181923])

## Evaluate the Model

In [85]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.97


In [86]:
print(confusion_matrix(y_train, rf.predict(X_train)))

[[414   7]
 [ 14 233]]


In [87]:
print(classification_report(y_train, rf.predict(X_train)))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98       421
           1       0.97      0.94      0.96       247

    accuracy                           0.97       668
   macro avg       0.97      0.96      0.97       668
weighted avg       0.97      0.97      0.97       668



## Test the Model

In [88]:
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_test, y_test)))

Accuracy of random forest classifier on test set: 0.80


### 2. Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.¶

In [89]:
rf = RandomForestClassifier(random_state= 123, min_samples_leaf = 5, max_depth = 3)

rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=3, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

## Evaluate the Model

In [91]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.81


In [92]:
print(confusion_matrix(y_train, rf.predict(X_train)))

[[406  15]
 [110 137]]


In [93]:
print(classification_report(y_train, rf.predict(X_train)))

              precision    recall  f1-score   support

           0       0.79      0.96      0.87       421
           1       0.90      0.55      0.69       247

    accuracy                           0.81       668
   macro avg       0.84      0.76      0.78       668
weighted avg       0.83      0.81      0.80       668



## Test the Model

In [94]:
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_test, y_test)))

Accuracy of random forest classifier on test set: 0.78
