### Model evaluation for logistic regression
Model metrics with sklearn for logistic regression on the Titanic dataset

***
#### Environment
`conda activate sklearn-env`


***
#### Goals
- Build a pipeline
- Use the pipeline to transform data
- Use the pipeline to predict
- Compute various metrics specific to classification models 

***
#### References

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

https://scikit-learn.org/stable/modules/model_evaluation.html



In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import random


# Make numpy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)

#### Dataset load from OpenML

https://www.openml.org/d/40945  
The Titanic dataset is downloaded with scikit-learn's `fetch_openml`, which needs network access to reach OpenML.

In [None]:
from sklearn.datasets import fetch_openml

# Titanic passenger data, OpenML dataset 40945: https://www.openml.org/d/40945
titanic_bunch = fetch_openml("titanic", version=1, as_frame=True)
raw_dataset = titanic_bunch.frame

# Work on a copy so the downloaded frame stays available unchanged.
dataset = raw_dataset.copy()
dataset.head(10)

In [None]:
# Columns that are not used by the model.
dropped_columns = ['boat', 'body', 'home.dest', 'fare', 'cabin']

# Derive `dataset` from the untouched `raw_dataset` instead of dropping in place:
# the cell is then idempotent, whereas an in-place drop raises KeyError when the
# cell is run a second time because the columns are already gone.
dataset = raw_dataset.drop(columns=dropped_columns)

### Dataset split
- row-wise into train and test datasets
- column-wise into features and labels

In [None]:
# Fixed seed so the train/test split, and every metric computed from it, is
# reproducible after a kernel restart. Drawing the seed from random.randint
# produced a different split on every run.
SPLIT_SEED = 42

# Row-wise split: 80% of the rows for training, the remaining rows for testing.
train_dataset = dataset.sample(frac=0.8, random_state=SPLIT_SEED)
test_dataset = dataset.drop(train_dataset.index)

# Column-wise split: pop the 'survived' label out of a copy of each subset.
train_features = train_dataset.copy()
test_features = test_dataset.copy()

train_labels = train_features.pop('survived')
# The test labels get a fresh 0..n-1 index, independent of the original row ids.
test_labels = test_features.pop('survived').reset_index(drop=True)

#### Build the pipeline

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder,
)

# Numeric columns: fill missing values with the median, then apply MinMaxScaler.
numeric_features = ['age', 'sibsp', 'parch']
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Passenger class: squared element-wise through np.square.
custom_features = ['pclass']
custom_transformer = FunctionTransformer(np.square, validate=True)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform time.
categorical_features = ['embarked', 'sex']
ohe_transformer = OneHotEncoder(handle_unknown='ignore')
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', ohe_transformer),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer.
column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('ohe', categorical_transformer, categorical_features),
    ('cust', custom_transformer, custom_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Preprocessing-only pipeline, used to inspect the transformed features.
data_prep_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

#### Use the pipeline to transform data

In [None]:
# Fit the preprocessing-only pipeline on the training features and transform them.
data_prep_data = data_prep_pipeline.fit_transform(train_features, train_labels)

# Wrap the transformed output in a DataFrame to preview the first rows.
transformed_df = pd.DataFrame(data = data_prep_data)
transformed_df.head(10)

#### Build the full pipeline

In [None]:
from sklearn.linear_model import LogisticRegression

# Chain the preprocessor with a logistic regression estimator.
model_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LogisticRegression()),
]
pipeline = Pipeline(steps=model_steps)

# Fit preprocessing and model together on the training split.
pipeline_model = pipeline.fit(train_features, train_labels)
print(pipeline_model)

In [None]:
from sklearn import set_config

# Switch estimator display to the diagram representation, then show the pipeline
# as the cell output.
set_config(display='diagram')
pipeline

#### Use pipeline to predict values

In [None]:
# Predict the class label of every row in the held-out test features.
scored_data = pipeline.predict(test_features)

#### Show predicted values

In [None]:
# Side-by-side view of predictions and ground truth.
# The columns are named explicitly, since an unnamed prediction column shows up
# as `0`. The true labels are inserted by position, so the frame does not depend
# on how `test_labels` happens to be indexed.
scored_df = pd.DataFrame({
    'predicted': scored_data,
    'survived': test_labels.to_numpy(),
})
scored_df.head(10)

### Model evaluation

#### Confusion matrix
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html

In [None]:
from sklearn.metrics import confusion_matrix
# Flatten the confusion matrix of true vs. predicted test labels into the four
# counts (unpacked as TN, FP, FN, TP for a binary problem).
tn, fp, fn, tp = confusion_matrix(test_labels, scored_data).ravel()

#### Plot Confusion Matrix
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

def plotConfusionMatrix(test_features, test_labels, pipeline):
    """Plot and print the confusion matrix of a fitted classifier.

    Parameters
    ----------
    test_features : features passed to ``pipeline.predict``.
    test_labels : true labels matching ``test_features`` row for row.
    pipeline : fitted estimator or pipeline exposing ``predict``.

    Returns
    -------
    None. The matrix is printed and the figure is shown with ``plt.show()``.
    """
    title = "Confusion matrix"

    # `plot_confusion_matrix` was removed in scikit-learn 1.2. Building the
    # display from an explicitly computed matrix works on old and new releases.
    matrix = confusion_matrix(test_labels, pipeline.predict(test_features))

    # Rows and columns follow the sorted label order: '0' (not survived) comes
    # before '1' (survived), so the display names must be listed in that order.
    disp = ConfusionMatrixDisplay(confusion_matrix=matrix,
                                  display_labels=['Not survived', 'Survived'])
    disp.plot(cmap=plt.cm.Blues)
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

    plt.show()

In [None]:
# Confusion matrix of the full pipeline on the held-out test split.
plotConfusionMatrix(test_features, test_labels, pipeline)

#### Metrics 
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html  
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html  
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html  
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html  
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html

In [None]:
def showMetrics(test_features, test_labels, scored_data, pipeline):
    """Print classification metrics for a fitted binary classifier.

    Parameters
    ----------
    test_features : features passed to ``pipeline.predict_proba``.
    test_labels : true labels; the positive class is the string ``'1'``.
    scored_data : predicted labels for ``test_features``.
    pipeline : fitted estimator or pipeline exposing ``predict_proba``.

    Returns
    -------
    None. Accuracy, precision, sensitivity (recall), specificity, F1 score,
    area under the ROC curve and cross-entropy loss are printed.
    """
    from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,
                                 log_loss, precision_score, recall_score,
                                 roc_auc_score)

    positive = '1'

    # Second predict_proba column, scored against the positive label.
    # Computed once and shared by the two probability-based metrics.
    positive_proba = pipeline.predict_proba(test_features)[:, 1]

    # Derive the counts from the arguments rather than from notebook-level
    # globals, so the specificity always belongs to the model being scored.
    tn, fp, fn, tp = confusion_matrix(test_labels, scored_data).ravel()
    negatives = tn + fp
    specificity = tn / negatives if negatives else float('nan')

    print("Accuracy:", accuracy_score(test_labels, scored_data))
    print("Precision:", precision_score(test_labels, scored_data,
                                        average='binary', pos_label=positive))
    print("Sensitivity:", recall_score(test_labels, scored_data,
                                       average='binary', pos_label=positive))
    print("Specificity:", specificity)
    print("F1 score:", f1_score(test_labels, scored_data,
                                average='binary', pos_label=positive))
    print("AuROC:", roc_auc_score(test_labels, positive_proba))
    print("Cross-entropy loss:", log_loss(test_labels, positive_proba))
    

In [None]:
# Print the metric set for the full pipeline on the held-out test split.
showMetrics(test_features, test_labels, scored_data, pipeline)

#### Roc Curve - plot

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.RocCurveDisplay.html
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.plot_roc_curve.html

In [None]:
import matplotlib.pyplot as plt  
import numpy as np
from sklearn import metrics

def plotAuROC(test_features, test_labels, pipeline):
    """Draw the ROC curve of a fitted classifier and show it.

    ``test_features`` are scored with ``pipeline.predict_proba``; the second
    probability column is compared against ``test_labels`` with ``'1'`` as the
    positive label. The area under the curve is passed to the display.
    """
    positive_scores = pipeline.predict_proba(test_features)[:, 1]

    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(
        test_labels, positive_scores, pos_label='1')
    area = metrics.auc(false_pos_rate, true_pos_rate)

    roc_display = metrics.RocCurveDisplay(
        fpr=false_pos_rate,
        tpr=true_pos_rate,
        roc_auc=area,
        estimator_name='Logistic regression estimator',
    )
    roc_display.plot()
    plt.show()

In [None]:
# ROC curve of the full pipeline on the held-out test split.
plotAuROC(test_features, test_labels, pipeline)

### Build basic model without feature engineering and compare metrics

In [None]:
# The baseline model only sees these raw columns; 'survived' is the label.
basic_columns = ['pclass', 'sibsp', 'parch', 'survived']

train_dataset_new = train_dataset[basic_columns].copy()
test_dataset_new = test_dataset[basic_columns].copy()

# Separate the label column from a copy of each subset.
train_features_new = train_dataset_new.copy()
train_labels_new = train_features_new.pop('survived')

test_features_new = test_dataset_new.copy()
test_labels_new = test_features_new.pop('survived')

In [None]:
from sklearn.linear_model import LogisticRegression

# Plain logistic regression on the raw columns, with no preprocessing step.
logistic_regressor_new = LogisticRegression()
logistic_regressor_new.fit(train_features_new, train_labels_new)

# Predicted labels for the baseline test features.
scored_data_new = logistic_regressor_new.predict(test_features_new)

In [None]:
# Recompute the notebook-level TN/FP/FN/TP counts from the baseline model's
# predictions; this overwrites the values computed for the full pipeline.
tn, fp, fn, tp = confusion_matrix(test_labels_new, scored_data_new).ravel()

In [None]:
# Confusion matrix of the baseline model on its test split.
plotConfusionMatrix(test_features_new, test_labels_new, logistic_regressor_new)

In [None]:
# Print the same metric set for the baseline model, for comparison.
showMetrics(test_features_new, test_labels_new, scored_data_new, logistic_regressor_new)

In [None]:
# ROC curve of the baseline model on its test split.
plotAuROC(test_features_new, test_labels_new, logistic_regressor_new)