In [1]:
# imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

import os
from env import get_connection

import acquire
import prepare

#### Create a new notebook, logistic_regression, use it to answer the following questions:

1. Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

3. Try out other combinations of features and models.

4. Use you best 3 models to predict and evaluate on your validate sample.

5. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [2]:
# acquire data using function from acquire
titanic = acquire.get_titanic_data()

# look in data
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
# create function to clean dataframe
def prep_titanic(df):
    
    # drop unwanted columns
    df.drop(columns = ['embarked','deck', 'passenger_id', 'sex', 'sibsp', 'parch', 'embarked', 'class', 'deck','embark_town', 'alone'], inplace=True )
    
    # fill null values with 0
    df.fillna(value =0, inplace=True)
    return df

In [4]:
titanic = prep_titanic(titanic)
titanic.head()

Unnamed: 0,survived,pclass,age,fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05


In [5]:
# peek into clean data
titanic.head(3)

Unnamed: 0,survived,pclass,age,fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925


In [6]:
# split data using funciton form prepare module 
train, val, test = prepare.train_validate_test_split(titanic,'survived')

# get shape of train, validate and test data
train.shape, val.shape, test.shape

((498, 4), (214, 4), (179, 4))

In [7]:
# check data in train
train.head()

Unnamed: 0,survived,pclass,age,fare
779,1,1,43.0,211.3375
159,0,3,0.0,69.55
738,0,3,0.0,7.8958
486,1,1,35.0,90.0
125,1,3,12.0,11.2417


In [8]:
# create labels
X_train_1 = train.drop(columns = ['survived'])
y_train_1 = train['survived']

X_val_1 =val.drop(columns = ['survived'])
y_val_1 = val['survived']

X_test_1 = test.drop(columns = ['survived'])
y_test_1 = test['survived']

In [9]:
# baseline accuracy
matches_baseline_prediction = (y_train_1 == 0)
matches_baseline_prediction
baseline_accuracy = matches_baseline_prediction.mean()
y_train_pred = pd.Series([0]*len(y_train_1))
baseline_accuracy

0.6164658634538153

In [10]:
# compute accuracy of baseline
cm = confusion_matrix(y_train_1, y_train_pred)
tn, fp, fn, tp = cm.ravel()

In [11]:
# set seed
seed = 42

# create model_1
logit = LogisticRegression(random_state = seed, )

# fit model
logit.fit(X_train_1, y_train_1)

# compute accuracy
train_acc_1 = logit.score(X_train_1, y_train_1)
print(baseline_accuracy, train_acc_1)

0.6164658634538153 0.7088353413654619


Model performs better than baseline accuracy

### Question 2

Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [12]:
titanic2 = acquire.get_titanic_data()

In [13]:
titanic2.drop(columns = ['embarked','deck', 'passenger_id', 'sibsp', 'parch', 'embarked', 'class', 'deck','embark_town', 'alone'], inplace=True )
titanic2.fillna(value =0, inplace=True)

In [14]:
titanic2.head()

Unnamed: 0,survived,pclass,sex,age,fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [15]:
dummies = pd.get_dummies(titanic2['sex'], drop_first=True)
dummies.head()

Unnamed: 0,male
0,1
1,0
2,0
3,0
4,1


In [16]:
titanic2 = pd.concat([titanic2, dummies], axis=1)
titanic2.head()

Unnamed: 0,survived,pclass,sex,age,fare,male
0,0,3,male,22.0,7.25,1
1,1,1,female,38.0,71.2833,0
2,1,3,female,26.0,7.925,0
3,1,1,female,35.0,53.1,0
4,0,3,male,35.0,8.05,1


In [17]:
titanic2.drop(columns='sex', inplace=True)
titanic2.head()

Unnamed: 0,survived,pclass,age,fare,male
0,0,3,22.0,7.25,1
1,1,1,38.0,71.2833,0
2,1,3,26.0,7.925,0
3,1,1,35.0,53.1,0
4,0,3,35.0,8.05,1


In [18]:
# split data using funciton form prepare module 
train_2, val_2, test_2 = prepare.train_validate_test_split(titanic2,'survived')

# get shape of train, validate and test data
train_2.shape, val_2.shape, test_2.shape

((498, 5), (214, 5), (179, 5))

In [19]:
X_train_2 = train_2.drop(columns = ['survived'])
y_train_2 = train_2['survived']

X_val_2 =val_2.drop(columns = ['survived'])
y_val_2 = val_2['survived']

X_test_2 = test_2.drop(columns = ['survived'])
y_test_2 = test_2['survived']

In [20]:
# set seed
seed = 42

# create model_2
logit2 = LogisticRegression(random_state = seed, )

# fit model
logit2.fit(X_train_2, y_train_2)

# compute accuracy
train_acc_2 = logit2.score(X_train_2, y_train_2)
print(baseline_accuracy, train_acc_2)

0.6164658634538153 0.7791164658634538


### 3. Try out other combinations of features and models.

In [21]:
# set seed
seed = 42

# create model_3
logit3 = LogisticRegression(C = 0.1, random_state = seed, )

# fit model
logit3.fit(X_train_1, y_train_1)

# compute accuracy
train_acc_3 = logit.score(X_train_1, y_train_1)
print(baseline_accuracy, train_acc_3)

0.6164658634538153 0.7088353413654619


#### 4. Use you best 3 models to predict and evaluate on your validate sample.


In [22]:
y_val_pred_1 = logit.predict(X_val_1)
val_acc_1 = logit.score(X_val_1, y_val_1)

model1 = [1, train_acc_1, val_acc_1]


y_val_pred_2 = logit2.predict(X_val_2)
val_acc_2 = logit2.score(X_val_2, y_val_2)

model2 = [2, train_acc_2, val_acc_2]

y_val_pred_3 = logit3.predict(X_val_1)
val_acc_3 = logit3.score(X_val_1, y_val_1)

model3 = [3, train_acc_3, val_acc_3]


pd.DataFrame([model1, model2, model1], columns=['model', 
                                                'in-sample accuracy',
                                                'out-of-sample accuracy'
                                               ])


Unnamed: 0,model,in-sample accuracy,out-of-sample accuracy
0,1,0.708835,0.700935
1,2,0.779116,0.785047
2,1,0.708835,0.700935


Model2 performs the best because it has the highest score on both trian and validate data.

#### 5. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [23]:
test_preds_3 = logit2.predict(X_test_2)
test_acc_3 = logit2.score(X_test_2, y_test_2)
print('Model 2')
print('Test Accuracy 3:  ', test_acc_3)
report = classification_report(y_test_2, test_preds_3, output_dict=True)
pd.DataFrame(report)

Model 2
Test Accuracy 3:   0.7821229050279329


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.814159,0.727273,0.782123,0.770716,0.780667
recall,0.836364,0.695652,0.782123,0.766008,0.782123
f1-score,0.825112,0.711111,0.782123,0.768112,0.781168
support,110.0,69.0,0.782123,179.0,179.0


Model 2 has about same accuracy score of 78% on vaildate and test data