### Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings as warn
import sys
import env
import numpy as np
import graphviz

sys.path.append(env.util_repo)

from acquire import get_titanic_data
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from graphviz import Graph
from sklearn.linear_model import LogisticRegression

warn.filterwarnings("ignore")

### 1. What is your baseline prediction? What is your baseline accuracy? Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [3]:
titanic_df = get_titanic_data()
titanic_df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [4]:
titanic_df.shape

(891, 14)

In [5]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    891 non-null    int64  
 1   passenger_id  891 non-null    int64  
 2   survived      891 non-null    int64  
 3   pclass        891 non-null    int64  
 4   sex           891 non-null    object 
 5   age           714 non-null    float64
 6   sibsp         891 non-null    int64  
 7   parch         891 non-null    int64  
 8   fare          891 non-null    float64
 9   embarked      889 non-null    object 
 10  class         891 non-null    object 
 11  deck          203 non-null    object 
 12  embark_town   889 non-null    object 
 13  alone         891 non-null    int64  
dtypes: float64(2), int64(7), object(5)
memory usage: 97.6+ KB


In [6]:
# Drop these columns due to redundancy, missing values, irrelevant
titanic_df.drop(columns=['Unnamed: 0', 'passenger_id', 'age', 'sibsp', 'parch', 'deck', 'embarked', 'class'], inplace=True)
titanic_df.head()

Unnamed: 0,survived,pclass,sex,fare,embark_town,alone
0,0,3,male,7.25,Southampton,0
1,1,1,female,71.2833,Cherbourg,0
2,1,3,female,7.925,Southampton,1
3,1,1,female,53.1,Southampton,0
4,0,3,male,8.05,Southampton,1


In [7]:
# Impute the missing embark_town values with "Southampton", the most common port.
# Only this column is filled: a frame-wide fillna would write the string
# "Southampton" into any other column that happened to contain NaN.
titanic_df['embark_town'] = titanic_df['embark_town'].fillna("Southampton")
titanic_df.embark_town.value_counts()

Southampton    646
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

In [8]:
sex_dummies = pd.get_dummies(titanic_df.sex, dummy_na=False, drop_first=False)
sex_dummies.head()

embark_dummies = pd.get_dummies(titanic_df.embark_town, dummy_na=False, drop_first=False)
embark_dummies.head()

titanic_df = pd.concat([titanic_df, sex_dummies, embark_dummies], axis=1)
titanic_df.head()

titanic_df.drop(columns=['sex', 'embark_town'], inplace=True)
titanic_df.head()

Unnamed: 0,survived,pclass,fare,alone,female,male,Cherbourg,Queenstown,Southampton
0,0,3,7.25,0,0,1,0,0,1
1,1,1,71.2833,0,1,0,1,0,0
2,1,3,7.925,1,1,0,0,0,1
3,1,1,53.1,0,1,0,0,0,1
4,0,3,8.05,1,0,1,0,0,1


In [9]:
# Split the data
train, test = train_test_split(titanic_df, test_size=.2, random_state=1414, stratify=titanic_df['survived'])
train, validate = train_test_split(train, test_size=.3, random_state=1414, stratify=train['survived'])

In [90]:
train.survived.mean()

0.38353413654618473

In [91]:
# Baseline prediction is that passenger did NOT survive (survived = 0)
train['baseline_survived'] = 0
train.head()

Unnamed: 0,survived,pclass,fare,alone,female,male,Cherbourg,Queenstown,Southampton,baseline_survived
824,0,3,39.6875,0,0,1,0,0,1,0
822,0,1,0.0,1,0,1,0,0,1,0
149,0,2,13.0,1,0,1,0,0,1,0
752,0,3,9.5,1,0,1,0,0,1,0
94,0,3,7.25,1,0,1,0,0,1,0


In [92]:
baseline_accuracy = (train.survived == train.baseline_survived).mean()
print(f"baseline accuracy:  {baseline_accuracy:.2%}")

# drop() returns a new frame, so the result has to be assigned back. Otherwise the
# constant baseline column stays in train and is carried into X_train as a feature
# that validate and test do not have.
train = train.drop(columns='baseline_survived')

baseline accuracy:  61.65%


### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [10]:
# Prepare inputs (X) and targets (y)
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns='survived')
y_validate = validate.survived

X_test = test.drop(columns='survived')
y_test = test.survived

In [113]:
# Construct classification tree object
clf = DecisionTreeClassifier(max_depth=3, random_state=1414)

In [114]:
# Fit the model to the training data
clf = clf.fit(X_train, y_train)

In [115]:
# Graphviz visualization
dot_data = export_graphviz(clf, feature_names=X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data)

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

In [116]:
# Predictions
y_pred = clf.predict(X_train)
y_pred[0:14]

array([0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0])

In [117]:
# Probability of outcome
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:14]

array([[0.88546256, 0.11453744],
       [1.        , 0.        ],
       [0.88546256, 0.11453744],
       [0.88546256, 0.11453744],
       [0.88546256, 0.11453744],
       [0.31818182, 0.68181818],
       [0.31818182, 0.68181818],
       [0.62686567, 0.37313433],
       [0.31818182, 0.68181818],
       [0.        , 1.        ],
       [0.73076923, 0.26923077],
       [0.31818182, 0.68181818],
       [0.88546256, 0.11453744],
       [0.62686567, 0.37313433]])

### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [118]:
# Accuracy
print(f"Accuracy for training sample:  {clf.score(X_train, y_train):.2%}")

Accuracy for training sample:  83.73%


In [119]:
# Confusion matrix
pd.DataFrame(confusion_matrix(y_train, y_pred, labels=(0, 1)), index=['acutal died', 'actual survived'], columns=['predicted died', 'predicted survived'])

Unnamed: 0,predicted died,predicted survived
acutal died,285,22
actual survived,59,132


In [120]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.93      0.88       307
           1       0.86      0.69      0.77       191

    accuracy                           0.84       498
   macro avg       0.84      0.81      0.82       498
weighted avg       0.84      0.84      0.83       498



### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [122]:
# Validate set accuracy
print(f"Accuracy for validate sample:  {clf.score(X_validate, y_validate):.2%}")

Accuracy for validate sample:  77.10%


In [123]:
# Predict outcomes of validate sample with model
y_validate_pred = clf.predict(X_validate)

In [127]:
# Confusion matrix
pd.DataFrame(confusion_matrix(y_validate, y_validate_pred, labels=(0, 1)), index=['acutal died', 'actual survived'], columns=['predicted died', 'predicted survived'])

Unnamed: 0,predicted died,predicted survived
acutal died,110,22
actual survived,27,55


In [124]:
print(classification_report(y_validate, y_validate_pred))

              precision    recall  f1-score   support

           0       0.80      0.83      0.82       132
           1       0.71      0.67      0.69        82

    accuracy                           0.77       214
   macro avg       0.76      0.75      0.75       214
weighted avg       0.77      0.77      0.77       214



### 5. Run through steps 2-4 using a different max_depth value.

In [128]:
# Construct classification tree object
clf = DecisionTreeClassifier(max_depth=7, random_state=1414)

In [129]:
# Fit the model to the training data
clf = clf.fit(X_train, y_train)

In [130]:
# Graphviz visualization
dot_data = export_graphviz(clf, feature_names=X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data)

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

In [131]:
# Predictions
y_pred = clf.predict(X_train)
y_pred[0:14]

array([0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0])

In [132]:
# Accuracy
print(f"Accuracy for training sample:  {clf.score(X_train, y_train):.2%}")

Accuracy for training sample:  89.16%


In [133]:
# Confusion matrix
pd.DataFrame(confusion_matrix(y_train, y_pred, labels=(0, 1)), index=['acutal died', 'actual survived'], columns=['predicted died', 'predicted survived'])

Unnamed: 0,predicted died,predicted survived
acutal died,284,23
actual survived,31,160


In [134]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.93      0.91       307
           1       0.87      0.84      0.86       191

    accuracy                           0.89       498
   macro avg       0.89      0.88      0.88       498
weighted avg       0.89      0.89      0.89       498



In [135]:
# Validate set accuracy
print(f"Accuracy for validate sample:  {clf.score(X_validate, y_validate):.2%}")

Accuracy for validate sample:  73.83%


In [136]:
# Predict outcomes of validate sample with model
y_validate_pred = clf.predict(X_validate)

In [139]:
# Confusion matrix
pd.DataFrame(confusion_matrix(y_validate, y_validate_pred, labels=(0, 1)),
             index=['acutal died', 'actual survived'], columns=['predicted died', 'predicted survived'])

Unnamed: 0,predicted died,predicted survived
acutal died,100,32
actual survived,24,58


In [138]:
print(classification_report(y_validate, y_validate_pred))

              precision    recall  f1-score   support

           0       0.81      0.76      0.78       132
           1       0.64      0.71      0.67        82

    accuracy                           0.74       214
   macro avg       0.73      0.73      0.73       214
weighted avg       0.74      0.74      0.74       214



### 6. Which model performs better on your in-sample data?

The model with the higher max_depth value (max_depth=7) performs better on the training data (89.16% vs. 83.73% accuracy).

### 7. Which model performs best on your out-of-sample data, the validate set?

The model with max_depth=3 performs better on the validate set (77.10% vs. 73.83% accuracy).

# Random Forest Exercises

### 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [142]:
rf = RandomForestClassifier(n_estimators=100, bootstrap=True, class_weight=None, criterion="gini",
                            min_samples_leaf=1, max_depth=10, random_state=1414)

In [143]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=1414)

### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [157]:
print(rf.feature_importances_)

[0.11009405 0.42861474 0.03640365 0.16164023 0.21105173 0.0233666
 0.00900006 0.01982894]


In [158]:
y_rf_pred = rf.predict(X_train)

In [159]:
y_rf_pred_proba = rf.predict_proba(X_train)

In [160]:
print(f"Random forest training sample accuracy:  {rf.score(X_train, y_train):.2%}")

Random forest training sample accuracy:  94.18%


In [161]:
pd.DataFrame(confusion_matrix(y_train, y_rf_pred, labels=(0, 1)),
             index=['actual died', 'actual survived'], columns=['predicted died', 'predicted survived'])

Unnamed: 0,predicted died,predicted survived
actual died,298,9
actual survived,20,171


In [162]:
print(classification_report(y_train, y_rf_pred))

              precision    recall  f1-score   support

           0       0.94      0.97      0.95       307
           1       0.95      0.90      0.92       191

    accuracy                           0.94       498
   macro avg       0.94      0.93      0.94       498
weighted avg       0.94      0.94      0.94       498



### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [163]:
y_rf_validate_pred = rf.predict(X_validate)

In [164]:
print(f"Random forest validate sample accuracy:  {rf.score(X_validate, y_validate):.2%}")

Random forest validate sample accuracy:  73.83%


In [165]:
pd.DataFrame(confusion_matrix(y_validate, y_rf_validate_pred, labels=(0, 1)),
             index=['actual died', 'actual survived'], columns=['predicted died', 'predicted survived'])

Unnamed: 0,predicted died,predicted survived
actual died,100,32
actual survived,24,58


In [166]:
print(classification_report(y_validate, y_rf_validate_pred))

              precision    recall  f1-score   support

           0       0.81      0.76      0.78       132
           1       0.64      0.71      0.67        82

    accuracy                           0.74       214
   macro avg       0.73      0.73      0.73       214
weighted avg       0.74      0.74      0.74       214



### 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [167]:
rf_modify = RandomForestClassifier(n_estimators=100, bootstrap=True, class_weight=None, criterion="gini",
                            min_samples_leaf=4, max_depth=4, random_state=1414)

In [168]:
rf_modify.fit(X_train, y_train)

RandomForestClassifier(max_depth=4, min_samples_leaf=4, random_state=1414)

In [169]:
print(f"Random forest modified training sample accuracy:  {rf_modify.score(X_train, y_train):.2%}")

Random forest modified training sample accuracy:  83.73%


In [170]:
y_rf_modify_pred = rf_modify.predict(X_train)

In [171]:
pd.DataFrame(confusion_matrix(y_train, y_rf_modify_pred, labels=(0, 1)),
             index=['actual died', 'actual survived'], columns=['predicted died', 'predicted survived'])

Unnamed: 0,predicted died,predicted survived
actual died,285,22
actual survived,59,132


In [172]:
print(classification_report(y_train, y_rf_modify_pred))

              precision    recall  f1-score   support

           0       0.83      0.93      0.88       307
           1       0.86      0.69      0.77       191

    accuracy                           0.84       498
   macro avg       0.84      0.81      0.82       498
weighted avg       0.84      0.84      0.83       498



In [177]:
print(f"Random forest modified validate sample accuracy:  {rf_modify.score(X_validate, y_validate):.2%}")

Random forest modified validate sample accuracy:  76.64%


In [174]:
y_rf_modify_validate_pred = rf_modify.predict(X_validate)

In [175]:
pd.DataFrame(confusion_matrix(y_validate, y_rf_modify_validate_pred, labels=(0, 1)),
             index=['actual died', 'actual survived'], columns=['predicted died', 'predicted survived'])

Unnamed: 0,predicted died,predicted survived
actual died,110,22
actual survived,28,54


In [176]:
print(classification_report(y_validate, y_rf_modify_validate_pred))

              precision    recall  f1-score   support

           0       0.80      0.83      0.81       132
           1       0.71      0.66      0.68        82

    accuracy                           0.77       214
   macro avg       0.75      0.75      0.75       214
weighted avg       0.76      0.77      0.76       214



### 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

> The in-sample accuracy went down by ~10 percentage points, but the out-of-sample accuracy increased by ~3 percentage points. Also, while the precision, recall, and f1 scores decrease on in-sample data, the weighted averages of these values increase for out-of-sample data. The random forest model with higher max_depth and lower min_samples_leaf values performs better on in-sample data because it is overfit to the training data.

In [183]:
# Testing read_clipboard
# NOTE(review): this reads whatever is on the system clipboard when the cell runs,
# so the output shown below cannot be reproduced from the notebook alone.
pd.read_clipboard(sep=",")

Unnamed: 0,a,b,c,d,target
0,5.7,2.6,3.5,1.0,versicolor
1,5.5,3.5,1.3,0.2,setosa
2,6.3,2.8,5.1,1.5,virginica
3,6.3,2.8,5.1,1.4,predict_viriginca
4,6.25,2.77,5.09,1.35,predict_virginica
5,5.5,3.5,1.29,0.3,setosa


# KNN Exercises

### 1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [2]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [11]:
knn.fit(X_train, y_train)

KNeighborsClassifier()

### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [12]:
y_knn_pred = knn.predict(X_train)

In [13]:
print(f"KNN training sample accuracy:  {knn.score(X_train, y_train):.2%}")

KNN training sample accuracy:  83.73%


In [14]:
pd.DataFrame(confusion_matrix(y_train, y_knn_pred, labels=(0, 1)),
             index=['actual died', 'actual survived'], columns=['predicted died', 'predicted survived'])

Unnamed: 0,predicted died,predicted survived
actual died,270,37
actual survived,44,147


In [15]:
print(classification_report(y_train, y_knn_pred))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87       307
           1       0.80      0.77      0.78       191

    accuracy                           0.84       498
   macro avg       0.83      0.82      0.83       498
weighted avg       0.84      0.84      0.84       498



### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support

In [29]:
y_knn_validate_pred = knn.predict(X_validate)

In [30]:
print(f"KNN validate sample accuracy:  {knn.score(X_validate, y_validate):.2%}")

KNN validate sample accuracy:  68.69%


In [31]:
pd.DataFrame(confusion_matrix(y_validate, y_knn_validate_pred, labels=(0, 1)),
             index=['actual died', 'actual survived'], columns=['predicted died', 'predicted survived'])

Unnamed: 0,predicted died,predicted survived
actual died,100,32
actual survived,35,47


In [19]:
print(classification_report(y_validate, y_knn_validate_pred))

              precision    recall  f1-score   support

           0       0.74      0.76      0.75       132
           1       0.59      0.57      0.58        82

    accuracy                           0.69       214
   macro avg       0.67      0.67      0.67       214
weighted avg       0.68      0.69      0.69       214



In [12]:
def get_metrics_bin(clf, X, y):
    '''
    Evaluate a fitted sklearn classifier on X / y for a binary target.

    Prints the accuracy plus the true/false positive and negative rates read
    from the confusion matrix, where row and column index 1 are treated as the
    positive class and index 0 as the negative class.

    return:  the classification report as a pandas DataFrame
             (one row per report entry, one column per metric)
    '''
    predictions = clf.predict(X)
    accuracy = clf.score(X, y)
    matrix = confusion_matrix(y, predictions)
    report_dict = classification_report(y, predictions, output_dict=True)
    class_report = pd.DataFrame(report_dict).transpose()

    # Each row of the confusion matrix holds the counts for one actual class.
    actual_negative = matrix[0]
    actual_positive = matrix[1]
    tpr = actual_positive[1] / actual_positive.sum()
    fpr = actual_negative[1] / actual_negative.sum()
    tnr = actual_negative[0] / actual_negative.sum()
    fnr = actual_positive[0] / actual_positive.sum()
    print(f'''
    The accuracy for our model is {accuracy:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
    return class_report

### 4. Run through steps 2-4 setting k to 10

In [21]:
knn_10 = KNeighborsClassifier(n_neighbors=10, weights='uniform')

In [22]:
knn_10.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=10)

In [23]:
get_metrics_bin(knn_10, X_train, y_train)


    The accuracy for our model is 0.8193
    The True Positive Rate is 0.723, The False Positive Rate is 0.121,
    The True Negative Rate is 0.879, and the False Negative Rate is 0.277
    


Unnamed: 0,precision,recall,f1-score,support
0,0.835913,0.879479,0.857143,307.0
1,0.788571,0.722513,0.754098,191.0
accuracy,0.819277,0.819277,0.819277,0.819277
macro avg,0.812242,0.800996,0.805621,498.0
weighted avg,0.817756,0.819277,0.817622,498.0


In [27]:
get_metrics_bin(knn_10, X_validate, y_validate)


    The accuracy for our model is 0.6729
    The True Positive Rate is 0.561, The False Positive Rate is 0.258,
    The True Negative Rate is 0.742, and the False Negative Rate is 0.439
    


Unnamed: 0,precision,recall,f1-score,support
0,0.731343,0.742424,0.736842,132.0
1,0.575,0.560976,0.567901,82.0
accuracy,0.672897,0.672897,0.672897,0.672897
macro avg,0.653172,0.6517,0.652372,214.0
weighted avg,0.671436,0.672897,0.672108,214.0


### 5. Run through steps 2-4 setting k to 20

In [24]:
knn_20 = KNeighborsClassifier(n_neighbors=20, weights='uniform')

In [25]:
knn_20.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=20)

In [26]:
get_metrics_bin(knn_20, X_train, y_train)


    The accuracy for our model is 0.7871
    The True Positive Rate is 0.644, The False Positive Rate is 0.124,
    The True Negative Rate is 0.876, and the False Negative Rate is 0.356
    


Unnamed: 0,precision,recall,f1-score,support
0,0.79822,0.876221,0.835404,307.0
1,0.763975,0.643979,0.698864,191.0
accuracy,0.787149,0.787149,0.787149,0.787149
macro avg,0.781097,0.7601,0.767134,498.0
weighted avg,0.785086,0.787149,0.783036,498.0


In [28]:
get_metrics_bin(knn_20, X_validate, y_validate)


    The accuracy for our model is 0.6869
    The True Positive Rate is 0.512, The False Positive Rate is 0.205,
    The True Negative Rate is 0.795, and the False Negative Rate is 0.488
    


Unnamed: 0,precision,recall,f1-score,support
0,0.724138,0.795455,0.758123,132.0
1,0.608696,0.512195,0.556291,82.0
accuracy,0.686916,0.686916,0.686916,0.686916
macro avg,0.666417,0.653825,0.657207,214.0
weighted avg,0.679903,0.686916,0.680785,214.0


### 6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

> As k increases, the in-sample accuracy decreases (83.7% at k=5, 81.9% at k=10, 78.7% at k=20), the true positive rate decreases, and the false negative rate increases. The lower-k model performs better on in-sample data since it isn't taking as many neighbors into account (i.e., a data point may belong to the same class as its closest 3 neighbors, but if k is set to 20, the other 17 closest neighbors may be of a different class and outvote them).

### 7. Which model performs best on our out-of-sample data from validate?

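> On validate, k=5 and k=20 tie on accuracy (68.7%) and k=10 is slightly lower (67.3%). k=5 has the highest true positive rate (0.57 vs. 0.56 and 0.51), so the lowest-k model still performs best on out-of-sample data overall.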

# Logistic Regression Exercises

### 1. Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [2]:
logistic_titanic_df = get_titanic_data()

In [3]:
logistic_titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    891 non-null    int64  
 1   passenger_id  891 non-null    int64  
 2   survived      891 non-null    int64  
 3   pclass        891 non-null    int64  
 4   sex           891 non-null    object 
 5   age           714 non-null    float64
 6   sibsp         891 non-null    int64  
 7   parch         891 non-null    int64  
 8   fare          891 non-null    float64
 9   embarked      889 non-null    object 
 10  class         891 non-null    object 
 11  deck          203 non-null    object 
 12  embark_town   889 non-null    object 
 13  alone         891 non-null    int64  
dtypes: float64(2), int64(7), object(5)
memory usage: 97.6+ KB


In [4]:
avg_age = round(logistic_titanic_df.age.mean(), 1)
avg_age

29.7

In [5]:
# Assign the filled column back rather than calling fillna(inplace=True) on the
# attribute-accessed Series; the chained in-place form can act on a temporary
# object and leave the DataFrame unchanged.
# NOTE(review): avg_age is computed from the full dataset, before the
# train/validate/test split is made.
logistic_titanic_df['age'] = logistic_titanic_df['age'].fillna(avg_age)

In [6]:
logistic_titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    891 non-null    int64  
 1   passenger_id  891 non-null    int64  
 2   survived      891 non-null    int64  
 3   pclass        891 non-null    int64  
 4   sex           891 non-null    object 
 5   age           891 non-null    float64
 6   sibsp         891 non-null    int64  
 7   parch         891 non-null    int64  
 8   fare          891 non-null    float64
 9   embarked      889 non-null    object 
 10  class         891 non-null    object 
 11  deck          203 non-null    object 
 12  embark_town   889 non-null    object 
 13  alone         891 non-null    int64  
dtypes: float64(2), int64(7), object(5)
memory usage: 97.6+ KB


In [7]:
baseline_df = logistic_titanic_df.copy()

In [8]:
baseline_df.survived.mean()

0.3838383838383838

In [9]:
baseline_df['baseline_survied'] = 0

In [10]:
(baseline_df.survived == baseline_df.baseline_survied).mean()

0.6161616161616161

In [13]:
def split(df, stratify_by=None, random_state=123):
    """
    Crude train, validate, test split.

    The data is first split 80/20 into (train + validate) and test, then the
    80% portion is split 70/30 into train and validate.

    df:            DataFrame to split
    stratify_by:   optional column name; when given, both splits are stratified
                   on that column. None (the default) means no stratification.
    random_state:  seed passed to both train_test_split calls (default 123)

    return:  (train, validate, test)
    """

    def _stratify_values(frame):
        # Identity check: `== None` would be evaluated element-wise if an
        # array-like were ever passed as stratify_by.
        if stratify_by is None:
            return None
        return frame[stratify_by]

    train, test = train_test_split(df, test_size=.2, random_state=random_state,
                                   stratify=_stratify_values(df))
    train, validate = train_test_split(train, test_size=.3, random_state=random_state,
                                       stratify=_stratify_values(train))
    return train, validate, test

In [14]:
train, validate, test = split(logistic_titanic_df, stratify_by='survived')

In [15]:
drop_columns = ['survived', 'Unnamed: 0', 'passenger_id', 'sex', 'sibsp', 'parch', 'embarked', 'class', 'deck',
                'embark_town', 'alone']
X_train = train.drop(columns=drop_columns)
y_train = train.survived

X_validate = validate.drop(columns=drop_columns)
y_validate = validate.survived

X_test = test.drop(columns=drop_columns)
y_test = test.survived

In [17]:
model_1 = LogisticRegression(C=1, random_state=123, intercept_scaling=1, solver='lbfgs')

In [18]:
model_1.fit(X_train, y_train)

LogisticRegression(C=1, random_state=123)

In [19]:
print(model_1.coef_, model_1.intercept_)

[[-0.94961473 -0.030634    0.00141005]] [2.52863672]


In [20]:
get_metrics_bin(model_1, X_train, y_train)


    The accuracy for our model is 0.7028
    The True Positive Rate is 0.44, The False Positive Rate is 0.134,
    The True Negative Rate is 0.866, and the False Negative Rate is 0.56
    


Unnamed: 0,precision,recall,f1-score,support
0,0.713137,0.86645,0.782353,307.0
1,0.672,0.439791,0.531646,191.0
accuracy,0.702811,0.702811,0.702811,0.702811
macro avg,0.692568,0.65312,0.656999,498.0
weighted avg,0.697359,0.702811,0.686198,498.0


> This model performs better than the baseline: its training accuracy is about 70.3%, compared with a baseline accuracy of about 61.6%.

### 2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [21]:
X_train = pd.concat([X_train, train.sex], axis=1)

In [22]:
X_train = pd.get_dummies(X_train, columns=['sex'], dummy_na=False, drop_first=True)

In [23]:
X_train.head()

Unnamed: 0,pclass,age,fare,sex_male
583,1,36.0,40.125,1
165,3,9.0,20.525,1
50,3,7.0,39.6875,1
259,2,50.0,26.0,0
306,1,29.7,110.8833,0


In [24]:
model_2 = LogisticRegression(C=1, random_state=123, intercept_scaling=1, solver='lbfgs')
model_2.fit(X_train, y_train)

LogisticRegression(C=1, random_state=123)

In [25]:
print(model_2.coef_, model_2.intercept_)

[[-1.21048605e+00 -2.97259690e-02 -2.02980289e-03 -2.71609274e+00]] [4.84166937]


In [26]:
get_metrics_bin(model_2, X_train, y_train)


    The accuracy for our model is 0.8133
    The True Positive Rate is 0.728, The False Positive Rate is 0.134,
    The True Negative Rate is 0.866, and the False Negative Rate is 0.272
    


Unnamed: 0,precision,recall,f1-score,support
0,0.836478,0.86645,0.8512,307.0
1,0.772222,0.727749,0.749326,191.0
accuracy,0.813253,0.813253,0.813253,0.813253
macro avg,0.80435,0.797099,0.800263,498.0
weighted avg,0.811834,0.813253,0.812128,498.0


### 3. Try out other combinations of features and models.

In [27]:
# Same features, different C value
model_3 = LogisticRegression(C=.1, random_state=123, intercept_scaling=1, solver='lbfgs')
model_3.fit(X_train, y_train)

LogisticRegression(C=0.1, random_state=123)

In [28]:
print(model_3.coef_, model_3.intercept_)

[[-8.45821898e-01 -2.38018637e-02  7.45500349e-04 -1.80667592e+00]] [3.2277423]


In [29]:
get_metrics_bin(model_3, X_train, y_train)


    The accuracy for our model is 0.7831
    The True Positive Rate is 0.618, The False Positive Rate is 0.114,
    The True Negative Rate is 0.886, and the False Negative Rate is 0.382
    


Unnamed: 0,precision,recall,f1-score,support
0,0.788406,0.885993,0.834356,307.0
1,0.771242,0.617801,0.686047,191.0
accuracy,0.783133,0.783133,0.783133,0.783133
macro avg,0.779824,0.751897,0.760201,498.0
weighted avg,0.781823,0.783133,0.777474,498.0


In [30]:
# Only pclass and sex_male
X_train.drop(columns=['age', 'fare'], inplace=True)

In [31]:
model_4 = LogisticRegression(C=1, random_state=123, intercept_scaling=1, solver='lbfgs')
model_4.fit(X_train, y_train)

LogisticRegression(C=1, random_state=123)

In [32]:
print(model_4.coef_, model_4.intercept_)

[[-0.97338767 -2.71286628]] [3.36220464]


In [33]:
get_metrics_bin(model_4, X_train, y_train)


    The accuracy for our model is 0.7992
    The True Positive Rate is 0.696, The False Positive Rate is 0.137,
    The True Negative Rate is 0.863, and the False Negative Rate is 0.304
    


Unnamed: 0,precision,recall,f1-score,support
0,0.820433,0.863192,0.84127,307.0
1,0.76,0.696335,0.726776,191.0
accuracy,0.799197,0.799197,0.799197,0.799197
macro avg,0.790217,0.779764,0.784023,498.0
weighted avg,0.797255,0.799197,0.797358,498.0


In [69]:
# logit_small_c with pclass and sex_male

In [34]:
model_5 = LogisticRegression(C=.1, random_state=123, intercept_scaling=1, solver='lbfgs')
model_5.fit(X_train, y_train)

LogisticRegression(C=0.1, random_state=123)

In [35]:
print(model_5.coef_, model_5.intercept_)

[[-0.74669135 -1.83695915]] [2.34063736]


In [36]:
get_metrics_bin(model_5, X_train, y_train)


    The accuracy for our model is 0.7992
    The True Positive Rate is 0.696, The False Positive Rate is 0.137,
    The True Negative Rate is 0.863, and the False Negative Rate is 0.304
    


Unnamed: 0,precision,recall,f1-score,support
0,0.820433,0.863192,0.84127,307.0
1,0.76,0.696335,0.726776,191.0
accuracy,0.799197,0.799197,0.799197,0.799197
macro avg,0.790217,0.779764,0.784023,498.0
weighted avg,0.797255,0.799197,0.797358,498.0


### 4. Use your best 3 models to predict and evaluate on your validate sample.

In [37]:
X_validate = pd.concat([X_validate, validate.sex], axis=1)
X_validate = pd.get_dummies(X_validate, columns=['sex'], dummy_na=False, drop_first=True)
X_validate

In [39]:
# model_2, C=1, pclass, age, fare, sex_male, ~81%
get_metrics_bin(model_2, X_validate, y_validate)


    The accuracy for our model is 0.7757
    The True Positive Rate is 0.671, The False Positive Rate is 0.159,
    The True Negative Rate is 0.841, and the False Negative Rate is 0.329
    


Unnamed: 0,precision,recall,f1-score,support
0,0.804348,0.840909,0.822222,132.0
1,0.723684,0.670732,0.696203,82.0
accuracy,0.775701,0.775701,0.775701,0.775701
macro avg,0.764016,0.75582,0.759212,214.0
weighted avg,0.773439,0.775701,0.773934,214.0


In [40]:
# model_3, C=.1, pclass, age, fare, sex_male, ~78%
get_metrics_bin(model_3, X_validate, y_validate)


    The accuracy for our model is 0.7757
    The True Positive Rate is 0.61, The False Positive Rate is 0.121,
    The True Negative Rate is 0.879, and the False Negative Rate is 0.39
    


Unnamed: 0,precision,recall,f1-score,support
0,0.783784,0.878788,0.828571,132.0
1,0.757576,0.609756,0.675676,82.0
accuracy,0.775701,0.775701,0.775701,0.775701
macro avg,0.77068,0.744272,0.752124,214.0
weighted avg,0.773741,0.775701,0.769985,214.0


In [41]:
X_validate.drop(columns=['age', 'fare'], inplace=True)

In [42]:
# model_4, C=1, plcass, sex_male, ~80%
get_metrics_bin(model_4, X_validate, y_validate)


    The accuracy for our model is 0.7617
    The True Positive Rate is 0.659, The False Positive Rate is 0.174,
    The True Negative Rate is 0.826, and the False Negative Rate is 0.341
    


Unnamed: 0,precision,recall,f1-score,support
0,0.79562,0.825758,0.810409,132.0
1,0.701299,0.658537,0.679245,82.0
accuracy,0.761682,0.761682,0.761682,0.761682
macro avg,0.74846,0.742147,0.744827,214.0
weighted avg,0.759478,0.761682,0.76015,214.0


### 5. Choose your best model from the validation performance, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [43]:
X_test = pd.concat([X_test, test.sex], axis=1)
X_test = pd.get_dummies(X_test, columns=['sex'], dummy_na=False, drop_first=True)
X_test

Unnamed: 0,pclass,age,fare,sex_male
561,3,40.0,7.8958,1
641,1,24.0,69.3000,0
400,3,39.0,7.9250,1
498,1,25.0,151.5500,0
875,3,15.0,7.2250,0
...,...,...,...,...
339,1,45.0,35.5000,1
841,2,16.0,10.5000,1
442,3,25.0,7.7750,1
815,1,29.7,0.0000,1


In [44]:
get_metrics_bin(model_2, X_test, y_test)


    The accuracy for our model is 0.8045
    The True Positive Rate is 0.71, The False Positive Rate is 0.136,
    The True Negative Rate is 0.864, and the False Negative Rate is 0.29
    


Unnamed: 0,precision,recall,f1-score,support
0,0.826087,0.863636,0.844444,110.0
1,0.765625,0.710145,0.736842,69.0
accuracy,0.804469,0.804469,0.804469,0.804469
macro avg,0.795856,0.786891,0.790643,179.0
weighted avg,0.80278,0.804469,0.802966,179.0


> The model performs better on test (80.4% accuracy) than on validate (77.6%), and slightly below its training accuracy (81.3%).