In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

import acquire as acq
import prepare as prep
import my_model as m

np.random.seed(42)

# Exercises

In these exercises, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

For all of the models you create, choose a threshold that optimizes for accuracy.

Create a new notebook, logistic_regression, use it to answer the following questions:

1. Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

3. Try out other combinations of features and models.

4. Use you best 3 models to predict and evaluate on your validate sample.

5. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

- Bonus1 How do different strategies for handling the missing values in the age column affect model performance?

- Bonus2: How do different strategies for encoding sex affect model performance?

- Bonus3: scikit-learn's LogisticRegression classifier is actually applying a regularization penalty to the coefficients by default. This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. This value can be modified with the C hyper parameter. Small values of C correspond to a larger penalty, and large values of C correspond to a smaller penalty.

    - Try out the following values for C and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected. C=.01,.1,1,10,100,1000

In [2]:
df = acq.get_titanic_data()

csv file found and read


In [3]:
df.age.info()
# there are 177 null values in age. This time around, I will keep age and use SimpleImputer to impute values
891-714

<class 'pandas.core.series.Series'>
RangeIndex: 891 entries, 0 to 890
Series name: age
Non-Null Count  Dtype  
--------------  -----  
714 non-null    float64
dtypes: float64(1)
memory usage: 7.1 KB


177

In [None]:
# defining a second prepare function to keep age
def prep_titanic_2(df):
    """
    This function will
    - take in the titanic dataframe
    - clean it up 
        -- remove useless columns passenger_id, class, embark_town, deck
        -- NOTE: keeping the age column
        -- tack on dummies columns for 'pclass', 'sex', 'embarked'
    - returns cleaned up dataframe
    """
    df = df.drop(columns=['passenger_id', 'class', 'embark_town', 'deck'])
    df.embarked = df.embarked.fillna('S')
    dummies_df = pd.get_dummies(df[['sex','embarked']], drop_first=True)
    new_df = pd.concat([df, dummies_df], axis=1)
    return new_df

In [4]:
df = prep.prep_titanic_2(df) # _2 keeps age
df = prep.prep_titanic_for_model(df)

In [5]:
df.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embarked_Q,embarked_S
0,0,3,22.0,1,0,7.25,0,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0,0
2,1,3,26.0,0,0,7.925,1,0,0,1
3,1,1,35.0,1,0,53.1,0,0,0,1
4,0,3,35.0,0,0,8.05,1,1,0,1


In [6]:
target = 'survived'
train, validate, test = prep.split_function(df, target)

Prepared df: (891, 10)

Train: (534, 10)
Validate: (178, 10)
Test: (179, 10)


In [7]:
# I'll use median as the strategy to fill nan values in age
# this function now in prepare
def impute_feature(train, validate, test, feature='age', strat='median'):
    """
    This function will
    - take in train, validate, test dfs
    - take in a string which is the column name that has nan values
        -- default is 'age' (built off titanic df)
    - take in a string which is the strategy to impute values
        -- default is 'median'
    - impute nan values in the feature(age) column and fill with new values
    - return train, validate, test with imputed values
    """
    imputer = SimpleImputer(missing_values=np.nan, strategy=strat)
    imputer = imputer.fit(train[[feature]])
    train[[feature]] = imputer.transform(train[[feature]])
    validate[[feature]] = imputer.transform(validate[[feature]])
    test[[feature]] = imputer.transform(test[[feature]])
    
    return train, validate, test

In [8]:
train, validate, test = prep.impute_feature(train, validate, test)

In [9]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embarked_Q,embarked_S
577,1,1,39.0,1,0,55.9,0,0,0,1
63,0,3,4.0,3,2,27.9,0,1,0,1
424,0,3,18.0,1,1,20.2125,0,1,0,1
513,1,1,54.0,1,0,59.4,0,0,0,0
610,0,3,39.0,1,5,31.275,0,0,0,1


In [10]:
X_train, X_validate, X_test, y_train, y_validate, y_test, baseline_accuracy = (
    m.get_X_y_baseline(train, validate, test, target)
    )

In [11]:
baseline_accuracy

0.6161048689138576

In [12]:
# first send in everything. - make/fit/use the thing
logit1 = LogisticRegression()
logit1.fit(X_train, y_train)
y_pred1 = logit1.predict(X_train)
m.get_tree_metrics(y_train, y_pred1)

CONFUSION MATRIX
          0_predicted  1_predicted
0_actual          282           47
1_actual           60          145

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       329
           1       0.76      0.71      0.73       205

    accuracy                           0.80       534
   macro avg       0.79      0.78      0.79       534
weighted avg       0.80      0.80      0.80       534

Accuracy: 0.799625468164794

True Positive Rate/Sensitivity/Recall/Power: 0.7073170731707317
False Positive Rate/False Alarm Ratio/Fall-out: 0.14285714285714285
True Negative Rate/Specificity/Selectivity: 0.8571428571428571
False Negative Rate/Miss Rate: 0.2926829268292683

Precision/PPV: 0.7552083333333334
F1 Score: 0.730478589420655

Support (0): 205
Support (1): 329


(282, 47, 60, 145)

In [13]:
# check vs validate
y_pred_v1 = logit1.predict(X_validate)
m.get_tree_metrics(y_validate, y_pred_v1)

CONFUSION MATRIX
          0_predicted  1_predicted
0_actual           94           16
1_actual           21           47

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.85      0.84       110
           1       0.75      0.69      0.72        68

    accuracy                           0.79       178
   macro avg       0.78      0.77      0.78       178
weighted avg       0.79      0.79      0.79       178

Accuracy: 0.7921348314606742

True Positive Rate/Sensitivity/Recall/Power: 0.6911764705882353
False Positive Rate/False Alarm Ratio/Fall-out: 0.14545454545454545
True Negative Rate/Specificity/Selectivity: 0.8545454545454545
False Negative Rate/Miss Rate: 0.3088235294117647

Precision/PPV: 0.746031746031746
F1 Score: 0.717557251908397

Support (0): 68
Support (1): 110


(94, 16, 21, 47)

In [14]:
# now try only sending in survived, age, far, and pclass

In [15]:
# remove all but survived, age, fare, and pclass
train, validate, test = prep.split_function(df[['survived','pclass','age','fare']], target)
train, validate, test = prep.impute_feature(train, validate, test)

Prepared df: (891, 4)

Train: (534, 4)
Validate: (178, 4)
Test: (179, 4)


In [16]:
X_train, X_validate, X_test, y_train, y_validate, y_test, baseline_accuracy = (
    m.get_X_y_baseline(train, validate, test, target)
    )
# From review: another way is to set features1 = ['pclass','age','fare'], then
# make X_train with [features1] (or features2, etc.)
# rather than make new X_trains every time

In [17]:
# make/fit/use the thing (LogisticRegression model)
logit2 = LogisticRegression()
logit2.fit(X_train, y_train)

In [18]:
y_pred2 = logit2.predict(X_train)

In [19]:
m.get_tree_metrics(y_train, y_pred2)

CONFUSION MATRIX
          0_predicted  1_predicted
0_actual          290           39
1_actual          115           90

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.88      0.79       329
           1       0.70      0.44      0.54       205

    accuracy                           0.71       534
   macro avg       0.71      0.66      0.66       534
weighted avg       0.71      0.71      0.69       534

Accuracy: 0.7116104868913857

True Positive Rate/Sensitivity/Recall/Power: 0.43902439024390244
False Positive Rate/False Alarm Ratio/Fall-out: 0.11854103343465046
True Negative Rate/Specificity/Selectivity: 0.8814589665653495
False Negative Rate/Miss Rate: 0.5609756097560976

Precision/PPV: 0.6976744186046512
F1 Score: 0.5389221556886228

Support (0): 205
Support (1): 329


(290, 39, 115, 90)

In [20]:
# check vs validate
y_pred_v2 = logit2.predict(X_validate)
m.get_tree_metrics(y_validate, y_pred_v2)

CONFUSION MATRIX
          0_predicted  1_predicted
0_actual           93           17
1_actual           37           31

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.85      0.78       110
           1       0.65      0.46      0.53        68

    accuracy                           0.70       178
   macro avg       0.68      0.65      0.65       178
weighted avg       0.69      0.70      0.68       178

Accuracy: 0.6966292134831461

True Positive Rate/Sensitivity/Recall/Power: 0.45588235294117646
False Positive Rate/False Alarm Ratio/Fall-out: 0.15454545454545454
True Negative Rate/Specificity/Selectivity: 0.8454545454545455
False Negative Rate/Miss Rate: 0.5441176470588235

Precision/PPV: 0.6458333333333334
F1 Score: 0.5344827586206896

Support (0): 68
Support (1): 110


(93, 17, 37, 31)

In [21]:
baseline_accuracy

0.6161048689138576

In [22]:
# so, sending in everything had an accuracy of .80/.80 (train/validate) and only sending in age, fare, and pclass 
# yields accuracy of .71/.70 which is still better than baseline, but not as good as sending in everything.

In [23]:
# now we will include sex
# remove all but survived, age, fare, sex, and pclass
train, validate, test = prep.split_function(df[['survived','pclass','age','fare', 'sex_male']], target)
train, validate, test = prep.impute_feature(train, validate, test)

Prepared df: (891, 5)

Train: (534, 5)
Validate: (178, 5)
Test: (179, 5)


In [24]:
X_train, X_validate, X_test, y_train, y_validate, y_test, baseline_accuracy = (
    m.get_X_y_baseline(train, validate, test, target)
    )

In [25]:
# make/fit/use the thing (LogisticRegression model)
logit3 = LogisticRegression()
logit3.fit(X_train, y_train)
y_pred3 = logit3.predict(X_train)
m.get_tree_metrics(y_train, y_pred3)

CONFUSION MATRIX
          0_predicted  1_predicted
0_actual          279           50
1_actual           58          147

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       329
           1       0.75      0.72      0.73       205

    accuracy                           0.80       534
   macro avg       0.79      0.78      0.78       534
weighted avg       0.80      0.80      0.80       534

Accuracy: 0.797752808988764

True Positive Rate/Sensitivity/Recall/Power: 0.7170731707317073
False Positive Rate/False Alarm Ratio/Fall-out: 0.1519756838905775
True Negative Rate/Specificity/Selectivity: 0.8480243161094225
False Negative Rate/Miss Rate: 0.28292682926829266

Precision/PPV: 0.7461928934010152
F1 Score: 0.7313432835820894

Support (0): 205
Support (1): 329


(279, 50, 58, 147)

In [26]:
# check vs validate
y_pred_v3 = logit3.predict(X_validate)
m.get_tree_metrics(y_validate, y_pred_v3)

CONFUSION MATRIX
          0_predicted  1_predicted
0_actual           92           18
1_actual           20           48

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.84      0.83       110
           1       0.73      0.71      0.72        68

    accuracy                           0.79       178
   macro avg       0.77      0.77      0.77       178
weighted avg       0.79      0.79      0.79       178

Accuracy: 0.7865168539325843

True Positive Rate/Sensitivity/Recall/Power: 0.7058823529411765
False Positive Rate/False Alarm Ratio/Fall-out: 0.16363636363636364
True Negative Rate/Specificity/Selectivity: 0.8363636363636363
False Negative Rate/Miss Rate: 0.29411764705882354

Precision/PPV: 0.7272727272727273
F1 Score: 0.7164179104477613

Support (0): 68
Support (1): 110


(92, 18, 20, 48)

In [27]:
# including sex gets the accuracy back up to .8/.79

In [28]:
# Next question: try out other combinations of features and models
# I'm going to try one more: just sending in sex and alone
train, validate, test = prep.split_function(df[['survived','alone', 'sex_male']], target)
# train, validate, test = prep.impute_feature(train, validate, test) -- unneccessary
X_train, X_validate, X_test, y_train, y_validate, y_test, baseline_accuracy = (
    m.get_X_y_baseline(train, validate, test, target)
    )
# make/fit/use the thing (LogisticRegression model)
logit4 = LogisticRegression()
logit4.fit(X_train, y_train)
y_pred4 = logit4.predict(X_train)
m.get_tree_metrics(y_train, y_pred4)

Prepared df: (891, 3)

Train: (534, 3)
Validate: (178, 3)
Test: (179, 3)
CONFUSION MATRIX
          0_predicted  1_predicted
0_actual          280           49
1_actual           65          140

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       329
           1       0.74      0.68      0.71       205

    accuracy                           0.79       534
   macro avg       0.78      0.77      0.77       534
weighted avg       0.78      0.79      0.78       534

Accuracy: 0.7865168539325843

True Positive Rate/Sensitivity/Recall/Power: 0.6829268292682927
False Positive Rate/False Alarm Ratio/Fall-out: 0.14893617021276595
True Negative Rate/Specificity/Selectivity: 0.851063829787234
False Negative Rate/Miss Rate: 0.3170731707317073

Precision/PPV: 0.7407407407407407
F1 Score: 0.7106598984771575

Support (0): 205
Support (1): 329


(280, 49, 65, 140)

In [29]:
# check vs validate
y_pred_v4 = logit4.predict(X_validate)
m.get_tree_metrics(y_validate, y_pred_v4)

CONFUSION MATRIX
          0_predicted  1_predicted
0_actual           94           16
1_actual           20           48

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.85      0.84       110
           1       0.75      0.71      0.73        68

    accuracy                           0.80       178
   macro avg       0.79      0.78      0.78       178
weighted avg       0.80      0.80      0.80       178

Accuracy: 0.797752808988764

True Positive Rate/Sensitivity/Recall/Power: 0.7058823529411765
False Positive Rate/False Alarm Ratio/Fall-out: 0.14545454545454545
True Negative Rate/Specificity/Selectivity: 0.8545454545454545
False Negative Rate/Miss Rate: 0.29411764705882354

Precision/PPV: 0.75
F1 Score: 0.7272727272727272

Support (0): 68
Support (1): 110


(94, 16, 20, 48)

In [30]:
# accuracy is now .79/.80 (train/validate). Now I'm curious what the acc would be if I only send in sex.

In [31]:
# I'm going to try one more: just sending in sex 
train, validate, test = prep.split_function(df[['survived', 'sex_male']], target)
# train, validate, test = prep.impute_feature(train, validate, test) -- unneccessary
X_train, X_validate, X_test, y_train, y_validate, y_test, baseline_accuracy = (
    m.get_X_y_baseline(train, validate, test, target)
    )
# make/fit/use the thing (LogisticRegression model)
logit5 = LogisticRegression()
logit5.fit(X_train, y_train)
y_pred5 = logit5.predict(X_train)
m.get_tree_metrics(y_train, y_pred5)

Prepared df: (891, 2)

Train: (534, 2)
Validate: (178, 2)
Test: (179, 2)
CONFUSION MATRIX
          0_predicted  1_predicted
0_actual          280           49
1_actual           65          140

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       329
           1       0.74      0.68      0.71       205

    accuracy                           0.79       534
   macro avg       0.78      0.77      0.77       534
weighted avg       0.78      0.79      0.78       534

Accuracy: 0.7865168539325843

True Positive Rate/Sensitivity/Recall/Power: 0.6829268292682927
False Positive Rate/False Alarm Ratio/Fall-out: 0.14893617021276595
True Negative Rate/Specificity/Selectivity: 0.851063829787234
False Negative Rate/Miss Rate: 0.3170731707317073

Precision/PPV: 0.7407407407407407
F1 Score: 0.7106598984771575

Support (0): 205
Support (1): 329


(280, 49, 65, 140)

In [32]:
# check vs validate
y_pred_v5 = logit5.predict(X_validate)
m.get_tree_metrics(y_validate, y_pred_v5)

CONFUSION MATRIX
          0_predicted  1_predicted
0_actual           94           16
1_actual           20           48

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.85      0.84       110
           1       0.75      0.71      0.73        68

    accuracy                           0.80       178
   macro avg       0.79      0.78      0.78       178
weighted avg       0.80      0.80      0.80       178

Accuracy: 0.797752808988764

True Positive Rate/Sensitivity/Recall/Power: 0.7058823529411765
False Positive Rate/False Alarm Ratio/Fall-out: 0.14545454545454545
True Negative Rate/Specificity/Selectivity: 0.8545454545454545
False Negative Rate/Miss Rate: 0.29411764705882354

Precision/PPV: 0.75
F1 Score: 0.7272727272727272

Support (0): 68
Support (1): 110


(94, 16, 20, 48)

In [33]:
# HA! it's .79/.80 (train/validate); justing using sex is almost as good as sending in everything.

In [34]:
# So now, I'll check these models vs validate
# I went back and added get_tree_metrics to the code above but called it with validate
#... looks like the best model is the one where I sent in everything, i.e. #1  
# Now I'll run that model on the test data set.

In [35]:
train, validate, test = prep.split_function(df, target)
train, validate, test = prep.impute_feature(train, validate, test)
X_train, X_validate, X_test, y_train, y_validate, y_test, baseline_accuracy = (
    m.get_X_y_baseline(train, validate, test, target)
    )
logit1 = LogisticRegression()
logit1.fit(X_train, y_train)
y_pred1 = logit1.predict(X_test)
m.get_tree_metrics(y_test, y_pred1)

Prepared df: (891, 10)

Train: (534, 10)
Validate: (178, 10)
Test: (179, 10)
CONFUSION MATRIX
          0_predicted  1_predicted
0_actual           96           14
1_actual           22           47

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.87      0.84       110
           1       0.77      0.68      0.72        69

    accuracy                           0.80       179
   macro avg       0.79      0.78      0.78       179
weighted avg       0.80      0.80      0.80       179

Accuracy: 0.7988826815642458

True Positive Rate/Sensitivity/Recall/Power: 0.6811594202898551
False Positive Rate/False Alarm Ratio/Fall-out: 0.12727272727272726
True Negative Rate/Specificity/Selectivity: 0.8727272727272727
False Negative Rate/Miss Rate: 0.3188405797101449

Precision/PPV: 0.7704918032786885
F1 Score: 0.7230769230769231

Support (0): 69
Support (1): 110


(96, 14, 22, 47)

### running best model on test data yields accuracy of .8

## Bonus1 How do different strategies for handling the missing values in the age column affect model performance?

In [36]:
# I'll try mean first. mean and median were very close, so I don't expect much difference.
train, validate, test = prep.split_function(df, target)
train, validate, test = prep.impute_feature(train, validate, test, strat='mean')
X_train, X_validate, X_test, y_train, y_validate, y_test, baseline_accuracy = (
    m.get_X_y_baseline(train, validate, test, target)
    )
logit1 = LogisticRegression()
logit1.fit(X_train, y_train)
y_pred1 = logit1.predict(X_test)
m.get_tree_metrics(y_test, y_pred1)

Prepared df: (891, 10)

Train: (534, 10)
Validate: (178, 10)
Test: (179, 10)
CONFUSION MATRIX
          0_predicted  1_predicted
0_actual           94           16
1_actual           23           46

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.85      0.83       110
           1       0.74      0.67      0.70        69

    accuracy                           0.78       179
   macro avg       0.77      0.76      0.77       179
weighted avg       0.78      0.78      0.78       179

Accuracy: 0.7821229050279329

True Positive Rate/Sensitivity/Recall/Power: 0.6666666666666666
False Positive Rate/False Alarm Ratio/Fall-out: 0.14545454545454545
True Negative Rate/Specificity/Selectivity: 0.8545454545454545
False Negative Rate/Miss Rate: 0.3333333333333333

Precision/PPV: 0.7419354838709677
F1 Score: 0.7022900763358778

Support (0): 69
Support (1): 110


(94, 16, 23, 46)

In [37]:
# mean brought the accuracy down to .78

In [45]:
# next I'll try mode
train, validate, test = prep.split_function(df, target)
train, validate, test = prep.impute_feature(train, validate, test, strat="most_frequent")
X_train, X_validate, X_test, y_train, y_validate, y_test, baseline_accuracy = (
    m.get_X_y_baseline(train, validate, test, target)
    )
logit1 = LogisticRegression()
logit1.fit(X_train, y_train)
y_pred1 = logit1.predict(X_test)
m.get_tree_metrics(y_test, y_pred1)

Prepared df: (891, 10)

Train: (534, 10)
Validate: (178, 10)
Test: (179, 10)
CONFUSION MATRIX
          0_predicted  1_predicted
0_actual           93           17
1_actual           20           49

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.85      0.83       110
           1       0.74      0.71      0.73        69

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179

Accuracy: 0.7932960893854749

True Positive Rate/Sensitivity/Recall/Power: 0.7101449275362319
False Positive Rate/False Alarm Ratio/Fall-out: 0.15454545454545454
True Negative Rate/Specificity/Selectivity: 0.8454545454545455
False Negative Rate/Miss Rate: 0.2898550724637681

Precision/PPV: 0.7424242424242424
F1 Score: 0.725925925925926

Support (0): 69
Support (1): 110


(93, 17, 20, 49)

In [39]:
# most_frequent dropped the accuracy to .79

## Bonus2: How do different strategies for encoding sex affect model performance?

In [40]:
# I don't understand what other strategies there might be for encoding a category with a 0 or a 1 for values.
# I suppose you could make it "sex_female" instead of "sex_male", but that should have no effect. I'll try it
# if I get time.

## Bonus3: 

scikit-learn's LogisticRegression classifier is actually applying a regularization penalty to the coefficients by default. This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. This value can be modified with the C hyper parameter. Small values of C correspond to a larger penalty, and large values of C correspond to a smaller penalty.

    Try out the following values for C and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected. 
    
        C=.01,.1,1,10,100,1000



In [46]:
df = acq.get_titanic_data()
df = prep.prep_titanic_2(df) #keeping age
df = prep.prep_titanic_for_model(df)

csv file found and read


In [47]:
target = 'survived'
train, test, validate = prep.split_function(df, target)

Prepared df: (891, 10)

Train: (534, 10)
Validate: (178, 10)
Test: (179, 10)


In [48]:
train, test, validate = prep.impute_feature(train, test, validate, feature='age', strat='median')

In [49]:
X_train, X_validate, X_test, y_train, y_validate, y_test, baseline_accuracy = (
    m.get_X_y_baseline(train, validate, test, target)
    )

In [108]:
# check the LogisticRegression model with different c values 
# initialize range of C values and a dictionary to store things in
C_values = [.01, .1, 1, 10, 100, 1000]
results = []
cols = ['C','train_acc','val_acc']
coef_cols = ['coef_' + c for c in X_train.columns]
results_df = pd.DataFrame(cols + coef_cols).T

for x in C_values:
    logit = LogisticRegression(C=x)
    logit.fit(X_train, y_train)
#    y_pred = logit.predict(X_train)
#    m.get_tree_metrics(y_test, y_pred)
    train_acc = logit.score(X_train, y_train)
    val_acc = logit.score(X_validate, y_validate)

    test = np.array([x, train_acc, val_acc])
    test_coef = logit.coef_
    combo_array = np.concatenate((test, test_coef[0]))
    new_df = pd.DataFrame(combo_array)
    results_df = pd.concat((results_df, new_df.T), axis=0)

#     print(f'For C = {x}, the train/validate accuracy is {train_acc} / {val_acc}.')
#     print(f'AND, the model coefficients are: {logit.coef_}.')
    
# print out dataframe with C values, train and val accuracies, and the coefficients for each feature    
results_df    
    


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,C,train_acc,val_acc,coef_pclass,coef_age,coef_sibsp,coef_parch,coef_fare,coef_alone,coef_sex_male,coef_embarked_Q,coef_embarked_S
0,0.01,0.737828,0.703911,-0.247513,-0.026858,-0.132036,0.019531,0.015939,-0.096574,-0.441744,0.052507,-0.090043
0,0.1,0.808989,0.793296,-0.786977,-0.033065,-0.226057,-0.052152,0.006609,-0.325945,-1.653963,0.328944,-0.211307
0,1.0,0.799625,0.798883,-1.089572,-0.039985,-0.268472,-0.142915,0.004208,-0.328858,-2.441633,1.008116,-0.084958
0,10.0,0.801498,0.798883,-1.178705,-0.038797,-0.271719,-0.130528,0.003651,-0.336175,-2.552949,1.185038,-0.074542
0,100.0,0.801498,0.804469,-1.154457,-0.042974,-0.356735,-0.190279,0.004649,-0.442668,-2.628522,1.09627,0.011479
0,1000.0,0.807116,0.798883,-1.200483,-0.041705,-0.315424,-0.204639,0.003467,-0.547029,-2.614819,1.110982,-0.081801
