# Classification Workflow

In [None]:
import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import MissingIndicator, SimpleImputer

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectFromModel

from sklearn.metrics import classification_report, ConfusionMatrixDisplay, RocCurveDisplay, confusion_matrix, roc_curve

# Objectives

> The goal here is to illustrate a possible workflow for classification modeling with `sklearn`'s `LogisticRegression` model.


- Formulate and implement an iterative modeling workflow
- Implement pipeline workflow to improve efficiency

# Modeling Walkthrough

## Modeling Steps

1. Build a model based on the [Titanic dataset](https://www.kaggle.com/c/titanic/data) that predicts whether a given person survived or not
2. Evaluate the performance of the model
3. Make changes in an attempt to improve the model
4. Demonstrate whether an improvement was made

## The Data

This dataset has the following columns:

| Variable | Definition | Key |
| -------- | ---------- | --- |
| survival | Survival | 0 = No, 1 = Yes |
| pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd |
| sex | Sex | |
| Age | Age in years | |
| sibsp | # of siblings / spouses aboard the Titanic | |
| parch | # of parents / children aboard the Titanic | |
| ticket | Ticket number | |
| fare | Passenger fare | |
| cabin | Cabin number | |
| embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton |

## Initial Data Understanding and Preparation

Open up the file, get everything into `X` features and `y` target variables, divided into train and test.

In [None]:
df = pd.read_csv("data/titanic.csv")

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.isna().sum()

Age data is missing for about 1 in 5 rows in our dataset.  For now, let's just exclude it, plus the non-numeric columns, and `PassengerId`, which doesn't seem like a real feature, but rather just an artifact of the dataset.

In [None]:
df = df.drop("PassengerId", axis=1)

In [None]:
df.dtypes

In [None]:
# Pclass are numbers but it's not clear that the difference between 1st and 2nd is the
# same as the difference between 2nd and 3rd
numeric_columns = ["Survived", "SibSp", "Parch", "Fare"]

In [None]:
sns.pairplot(df[numeric_columns]);

In [None]:
numeric_df = df[numeric_columns]
X = numeric_df.drop("Survived", axis=1)
y = numeric_df["Survived"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2023)

# 1st Model - "Dummy" Model

Let's start with a completely "dummy" model, that will always choose the majority class.

In [None]:
y.value_counts(normalize=True)

In [None]:
dummy_model = DummyClassifier(strategy="most_frequent")

Fit the model on our data

In [None]:
dummy_model.fit(X_train, y_train)

We should expect all predictions to be the same

In [None]:
# just grabbing the first 50 to save space
dummy_model.predict(X_train)[:50]

In [None]:
dummy_model.score(X_train, y_train)

## Model Evaluation

Let's do some cross-validation to see how the model would do in generalizing to new data it's never seen.

In [None]:
cv_results = cross_val_score(dummy_model, X_train, y_train, cv=5)
cv_results.mean()

So, the mean accuracy is a little under 62% if we always guess the majority class.

To show the spread, let's make a convenient class that can help us organize the model and the cross-validation:

In [None]:
class ModelWithCV():
    '''Structure to save the model and more easily see its crossvalidation

    Attributes:
      model: estimator that is handed to `cross_val_score`
      name: label used in the printed summary and in the plot title
      X, y: default data used when `cross_validate` gets no overrides
      cv_results, cv_mean, cv_median, cv_std:
        None until `cross_validate` has run
    '''

    def __init__(self, model, model_name, X, y, cv_now=True):
        '''
        Store the model and its data, optionally cross-validating right away.

        Args:
          model:
            Estimator to cross-validate
          model_name:
            Label used in summaries and plot titles
          X:
            Default features for cross-validation
          y:
            Default target for cross-validation
          cv_now:
            Optional; run `cross_validate` immediately (default is True)
        '''
        self.model = model
        self.name = model_name
        self.X = X
        self.y = y
        # For CV results
        self.cv_results = None
        self.cv_mean = None
        self.cv_median = None
        self.cv_std = None
        #
        if cv_now:
            self.cross_validate()

    def cross_validate(self, X=None, y=None, kfolds=10):
        '''
        Perform cross-validation and store the results on the object
        (`cv_results`, `cv_mean`, `cv_median`, `cv_std`).

        Args:
          X:
            Optional; Training data to perform CV on. Otherwise use X from object
          y:
            Optional; Training data to perform CV on. Otherwise use y from object
          kfolds:
            Optional; Number of folds for CV (default is 10)
        '''
        # Compare against None explicitly: DataFrames and arrays raise a
        # ValueError when they are evaluated for truthiness.
        cv_X = self.X if X is None else X
        cv_y = self.y if y is None else y

        self.cv_results = cross_val_score(self.model, cv_X, cv_y, cv=kfolds)
        self.cv_mean = np.mean(self.cv_results)
        self.cv_median = np.median(self.cv_results)
        self.cv_std = np.std(self.cv_results)

    def print_cv_summary(self):
        '''Print the mean and standard deviation of the stored CV scores.'''
        cv_summary = (
        f'''CV Results for `{self.name}` model:
            {self.cv_mean:.5f} ± {self.cv_std:.5f} accuracy
        ''')
        print(cv_summary)

    def plot_cv(self, ax):
        '''
        Plot the cross-validation values using the array of results and given 
        Axis for plotting.

        Args:
          ax:
            Axis to draw on

        Returns:
          The same Axis that was passed in
        '''
        ax.set_title(f'CV Results for `{self.name}` Model')
        # Thinner violinplot with higher bw
        sns.violinplot(y=self.cv_results, ax=ax, bw=.4)
        # Overlay the individual fold scores on top of the violin
        sns.swarmplot(
                y=self.cv_results,
                color='orange',
                size=10,
                alpha= 0.8,
                ax=ax
        )

        return ax

In [None]:
dummy_model_results = ModelWithCV(
                        model=dummy_model,
                        model_name='dummy',
                        X=X_train, 
                        y=y_train
)

In [None]:
fig, ax = plt.subplots()

ax = dummy_model_results.plot_cv(ax)
plt.tight_layout();

dummy_model_results.print_cv_summary()

In [None]:
fig, ax = plt.subplots()

fig.suptitle("Dummy Model")

ConfusionMatrixDisplay(confusion_matrix(y_train, dummy_model.predict(X_train))).plot(ax=ax);

A pretty lopsided confusion matrix!

In [None]:
fpr, tpr, thresholds = roc_curve(y_train, dummy_model.predict_proba(X_train)[:,1])
RocCurveDisplay(fpr=fpr, tpr=tpr).plot();

# 2nd Model - Logistic Regression

Let's use a logistic regression and compare its performance.

By default, `LogisticRegression` applies L2 regularization. We're going to turn regularization off entirely to see how the model does with as little tuning as possible, so we'll pass `'none'` to the `penalty` parameter.

In [None]:
# NOTE(review): scikit-learn 1.2 deprecated the string 'none' in favor of penalty=None - confirm against the installed version
simple_logreg_model = LogisticRegression(random_state=2023, penalty='none')

In [None]:
simple_logreg_model.fit(X_train, y_train)

Look at the predictions:

In [None]:
simple_logreg_model.predict(X_train)[:50]

Mixture of 1s and 0s this time

## Model Evaluation, Part 2

In [None]:
simple_logreg_results = ModelWithCV(
                        model=simple_logreg_model,
                        model_name='simple_logreg',
                        X=X_train, 
                        y=y_train
)

In [None]:
# Saving variable for convenience
model_results = simple_logreg_results

# Plot CV results
fig, ax = plt.subplots()
ax = model_results.plot_cv(ax)
plt.tight_layout();
# Print CV results
model_results.print_cv_summary()

In [None]:
simple_logreg_model.score(X_train, y_train)

So the mean accuracy is better when the model is actually taking in information from the features instead of always guessing the majority class.

In [None]:
confusion_matrix(y_train, simple_logreg_model.predict(X_train))

In [None]:
fig, ax = plt.subplots()

fig.suptitle("Logistic Regression with Numeric Features Only")

ConfusionMatrixDisplay(confusion_matrix(y_train, simple_logreg_model.predict(X_train))).plot(ax=ax);

So, in general we are not labeling many of the "not survived" passengers as "survived", but for "survived" passengers we're getting it wrong most of the time.

In [None]:
fpr, tpr, thresholds = roc_curve(y_train, simple_logreg_model.predict_proba(X_train)[:,1])
RocCurveDisplay(fpr=fpr, tpr=tpr).plot();

This model is doing better than just choosing the most frequent class every time, but it probably could do better. 

We can say this model is likely underfitting, which means we need _more complexity_. We can add more complexity a few different ways. We'll try doing some feature engineering/data preparation.

# Back to Data Preparation

Maybe there is some useful information in the features we are not using yet.  Let's go wild and add all of them!

> Note: you can and should add features incrementally in a "real" modeling context.  The engineering effort of encoding the variables can be non-trivial!  But here let's assume that it's not too much work to encode all of them.

Start with a new train-test split that contains all of the features

In [None]:
X = df.drop("Survived", axis=1)
y = df["Survived"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2023)

In [None]:
X_train.columns

In [None]:
X_train.isna().sum()

# Better Process: Pipelines

> **Pipelines** can keep our code neat and clean all the way from gathering & cleaning our data, to creating models & fine-tuning them!

![](https://imgs.xkcd.com/comics/data_pipeline.png)

The `Pipeline` class from [Scikit-Learn's API](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html) is especially convenient since it allows us to use our other Estimators that we know and love!

## Advantages of `Pipeline`

### Reduces Complexity

> You can focus on particular parts of the pipeline one at a time and debug or adjust parts as needed.

### Convenient

> The pipeline summarizes your fine-detail steps. That way you can focus on the big-picture aspects.

### Flexible

> You can use pipelines with different models and with GridSearch.

### Prevent Mistakes

> We can focus on one section at a time.
>
> We can also ensure that data leakage doesn't occur between our training dataset and our validation/testing datasets!

### Introducing Column Transformer

> Allows one to run different transformers on separate columns, seamlessly in step with each other. Let's try to add in imputation and one-hot encoding. We are creating 'mini' pipelines for each step.

## Handling Missing Values

Let's be extra cautious and make a separate column to indicate whether there originally was a missing value.

In our training data there are only missing values for a couple of the columns, but we can't be sure about where the test set will be missing data.

The `MissingIndicator` from `sklearn` will mark the missing values in an input array.

In [None]:
indicator_demo = MissingIndicator()

indicator_demo.fit(X_train)

indicator_demo.features_

In [None]:
indicator_demo.transform(X_train)[:5, :]

In [None]:
X_train.iloc[:5, [3, 8, 9]]

In [None]:
df.isna().sum()

In [None]:
df['Cabin'].value_counts()

Now that we've specified which values were originally missing, let's fill in those missing values.  This takes two separate imputers because we want to use the mean for numeric data and the majority class for categorical data.

The `SimpleImputer` class fills in the mean value by default, so we'll have to override that for the categorical columns.

In [None]:
# Imputers
num_imput = None
cat_imput = None

# Pipelines
num_pipeline = None

## One-Hot Encoding

In [None]:
# if you wanted to drop
def drop_func(X=None, columns=None):
    """Return `X` without the given columns.

    `FunctionTransformer` calls its function with the data as the first
    positional argument, so the function has to accept `X`. Pass the
    columns to remove through `FunctionTransformer(..., kw_args={"columns": [...]})`.

    Args:
        X: Optional; DataFrame to transform. With no data, None is returned.
        columns: Optional; column labels to drop. When omitted, `X` is
            returned unchanged.

    Returns:
        `X` itself when there is nothing to drop, otherwise a new DataFrame
        without `columns` (the input is not modified).
    """
    if X is None or columns is None:
        return X
    return X.drop(columns=columns)
    
from sklearn.preprocessing import FunctionTransformer    

drop_sklearn = FunctionTransformer(drop_func)

Now that there are no missing values, convert all of the categorical features into numbers.

In [None]:
cat_pipeline = None

### Bring them back together

In [None]:
num_feats = ['Age', 'SibSp', 'Parch', 'Fare']
cat_feats = ['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [None]:
col_tr = ColumnTransformer(transformers=[('num', num_pipeline, num_feats),
                                        ('cat', cat_pipeline, cat_feats)],
                          remainder='passthrough', sparse_threshold=0)

In [None]:
col_tr

In [None]:
# Train
col_tr.fit_transform(X_train).shape

In [None]:
# Test
col_tr.transform(X_test).shape

In [None]:
df['Name'].value_counts()

In [None]:
df['Ticket'].value_counts()

This is...a ridiculous number of columns.  How did we end up with more columns than rows?

# 3rd Model - After More Data Preparation

Let's run a logistic regression:
> We can incorporate both our Column Transformer and a model into a final top level Pipeline

In [None]:
# Bring in column transformer and a last step estimator/model
log_pipe = Pipeline(steps=[('ct', col_tr),
                           ('logreg', LogisticRegression(random_state=42, penalty='none'))])

log_pipe.fit(X_train, y_train)

What happened there?  This solver had no problem before.

## Hyperparameter Adjustments to the Model

Let's try a couple of stopgap measures to get the model to run.

### More Iterations

Allows for more iterations to find a solution

In [None]:
# Same steps as the previous pipeline, but with a larger iteration budget for the solver.
# Only `log_pipe2` is bound here so that the earlier `log_pipe` is not overwritten.
log_pipe2 = Pipeline(steps=[('ct', col_tr), 
                           ('logreg', LogisticRegression(random_state=42, penalty='none',
                                                         max_iter = 1000))])

log_pipe2.fit(X_train, y_train)

### More Regularization

Remember that the `C` parameter is the inverse of the regularization strength.

> Note: We could do regularization but we should first scale our features. We're actually going to skip this hyperparameter until we scale our data 

### Higher Tolerance

A higher tolerance means that the model will stop training earlier (when predictions and true values aren't as close as they could be).

In [None]:
log_pipe3 = Pipeline(steps=[('ct', col_tr), 
                           ('logreg', LogisticRegression(random_state=42, penalty='none', 
                                                         tol=25))])

log_pipe3.fit(X_train, y_train)

## Model Evaluation

In [None]:
# Side-by-side training confusion matrices for the two stopgap models
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))

axes[0].set_title("More Iterations")
axes[1].set_title("Higher Tolerance")

ConfusionMatrixDisplay(confusion_matrix(y_train, log_pipe2.predict(X_train))).plot(ax=axes[0])
ConfusionMatrixDisplay(confusion_matrix(y_train, log_pipe3.predict(X_train))).plot(ax=axes[1])

In [None]:
logreg_model_more_iterations_results = ModelWithCV(
                                        log_pipe2,
                                        'more_iterations',
                                        X_train,
                                        y_train
)
    
logreg_model_higher_tolerance_results = ModelWithCV(
                                        log_pipe3,
                                        'higher_tolerance',
                                        X_train,
                                        y_train
)

models = [
    logreg_model_more_iterations_results,
    logreg_model_higher_tolerance_results
]

In [None]:
log_pipe2.score(X_train, y_train)

In [None]:
log_pipe3.score(X_train, y_train)

In [None]:
f,axes = plt.subplots(ncols=2, sharey=True, figsize=(12, 6))

for ax, result in zip(axes, models):
    ax = result.plot_cv(ax)
    result.print_cv_summary()
plt.tight_layout();

In [None]:
fig, ax = plt.subplots()

fpr2, tpr2, thresh2 = roc_curve(y_train, log_pipe2.predict_proba(X_train)[:,1]) 
fpr3, tpr3, thresh3 = roc_curve(y_train, log_pipe3.predict_proba(X_train)[:,1])

RocCurveDisplay(fpr=fpr2, tpr=tpr2).plot(ax=ax)
RocCurveDisplay(fpr=fpr3, tpr=tpr3).plot(ax=ax);

> What can we observe from these two adjustments to our model with more features?

# Even More Data Preparation - Scaling

We saw that our last model is overfitting on so many features. A good strategy is to do regularization.

However, recall we should scale all of the features, so the model isn't overly penalizing age and fare.

In [None]:
num_pipeline2 = Pipeline(steps=[('im_num', SimpleImputer(add_indicator=True)), 
                               ('ss', StandardScaler())])

In [None]:
col_tr2 = ColumnTransformer(transformers=[('num', num_pipeline2, num_feats),
                                        ('cat', cat_pipeline, cat_feats)],
                          remainder='passthrough', sparse_threshold=0)

In [None]:
log_pipe4 = Pipeline([('ct', col_tr2), ('logreg', LogisticRegression(random_state=42))])

# 4th Model - After Scaling

Now that the data is scaled, let's see if we can fit the model without tweaking any hyperparameters.

In [None]:
log_pipe4.fit(X_train, y_train)

## Model Evaluation, Part 4

Now that we are able to run a logistic regression with default hyperparameters, let's see how that performs.

In [None]:
fig, ax = plt.subplots()

fig.suptitle("Logistic Regression with All Features, Scaled")

ConfusionMatrixDisplay(confusion_matrix(y_train, log_pipe4.predict(X_train))).plot(ax=ax)

In [None]:
log_pipe4.score(X_train, y_train)

In [None]:
model_results = ModelWithCV(
                            log_pipe4,
                            'all_features',
                            X_train,
                            y_train
)

In [None]:
# Plot CV results
fig, ax = plt.subplots()
ax = model_results.plot_cv(ax)
plt.tight_layout();
# Print CV results
model_results.print_cv_summary()

In [None]:
fpr4, tpr4, thresh4 = roc_curve(y_train, log_pipe4.predict_proba(X_train)[:,1])

RocCurveDisplay(fpr=fpr4, tpr=tpr4).plot()

Doing well on the training data, but only in the ~81% range on the held-out cross-validation folds ... this model is still overfitting.

# We should try hard feature elimination 

Recall what was happening with our name and ticket columns (OHE creating too many columns)

In [None]:
num_feats = ['Age', 'SibSp', 'Parch', 'Fare']
cat_feats = ['Pclass', 'Sex', 'Cabin', 'Embarked']

In [None]:
col_tr3 = ColumnTransformer(transformers=[('num', num_pipeline2, num_feats),
                                        ('cat', cat_pipeline, cat_feats)],
                          remainder='passthrough', sparse_threshold=0)

log_pipe5 = Pipeline([('ct', col_tr3), ('logreg', LogisticRegression(random_state=42))])
log_pipe5.fit(X_train, y_train)

In [None]:
fig, ax = plt.subplots()

fig.suptitle("Logistic Regression with Less Features, Scaled")

ConfusionMatrixDisplay(confusion_matrix(y_train, log_pipe5.predict(X_train))).plot(ax=ax)

In [None]:
log_pipe5.score(X_train, y_train)

In [None]:
model_results = ModelWithCV(
                            log_pipe5,
                            'less_features',
                            X_train,
                            y_train
)

In [None]:
# Plot CV results
fig, ax = plt.subplots()
ax = model_results.plot_cv(ax)
plt.tight_layout();
# Print CV results
model_results.print_cv_summary()

In [None]:
fpr4, tpr4, thresh4 = roc_curve(y_train, log_pipe5.predict_proba(X_train)[:,1])

RocCurveDisplay(fpr=fpr4, tpr=tpr4).plot()

## `SelectFromModel`

The all features model was probably overfitting. We might try thinning out the number of features by eliminating the ones with small modeling coefficients using [`SelectFromModel`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html)

In [None]:
selector = SelectFromModel(log_pipe4)

selector.fit(X_train, y_train)

We're using the default threshold here:

In [None]:
thresh = selector.threshold_

In [None]:
log_pipe4.coef_

In [None]:
log_pipe4.named_steps['logreg'].coef_

In [None]:
X_train_tr = col_tr2.fit_transform(X_train)

In [None]:
selector = SelectFromModel(LogisticRegression(random_state=42))
selector.fit(X_train_tr, y_train)

In [None]:
thresh = selector.threshold_
thresh

Let's get a sense of which features will be eliminated:

In [None]:
coefs = selector.estimator_.coef_
coefs

In [None]:
coefs.shape

In [None]:
coefs[abs(coefs) > thresh].shape

In [None]:
selector.get_support()

In [None]:
X_train_sel = selector.transform(X_train_tr)

In [None]:
X_train_tr.shape

In [None]:
X_train_sel.shape

In [None]:
log_reg_6 = LogisticRegression(random_state=42)

In [None]:
log_reg_6.fit(X_train_sel, y_train)

In [None]:
# Save for later comparison
select_results = ModelWithCV(
                    log_reg_6, 
                    'logreg_sel',
                    X_train_sel,
                    y_train
)

# Plot both all_features vs new model
f,axes = plt.subplots(ncols=2, sharey='all', figsize=(12, 6))

model_results.plot_cv(ax=axes[0])
select_results.plot_cv(ax=axes[1])

plt.tight_layout();

In [None]:
print("Old:", model_results.cv_results.mean())
print("New:", select_results.cv_results.mean())

Probably still overfitting, but let's call this our final model!

# Final Model Evaluation

Now that we have a final model, we would transform our X_test in accordance with what we did to X_train: Missing Indicators, Imputations, One Hot Encoding, Scaling, etc....

In [None]:
X_test

Create a model with the relevant hyperparameters, fit, and score

In [None]:
X_test_tr = col_tr2.transform(X_test)
X_test_tr.shape

In [None]:
X_test_sel = selector.transform(X_test_tr)
X_test_sel.shape

In [None]:
final_model = LogisticRegression(random_state=42)

final_model.fit(X_train_sel, y_train)

final_model.score(X_test_sel, y_test)

In [None]:
final_model2 = Pipeline(steps=[('ct', col_tr2),
                               ('selector', SelectFromModel(LogisticRegression(random_state=42))),
                               ('logreg', LogisticRegression(random_state=42))])

In [None]:
final_model2.fit(X_train, y_train)
final_model2.score(X_test, y_test)

In [None]:
fig, ax = plt.subplots()

fig.suptitle("Final Model")

ConfusionMatrixDisplay(confusion_matrix(y_test, final_model2.predict(X_test))).plot(ax=ax)

In [None]:
# The scores must come from X_test so that they line up row-for-row with y_test
fpr, tpr, thresh = roc_curve(y_test, final_model2.predict_proba(X_test)[:,1])

RocCurveDisplay(fpr=fpr, tpr=tpr).plot()

In [None]:
print(classification_report(y_test, final_model2.predict(X_test)))

# Exercise

Build and iterate on a logistic regression model of **color** for the diamonds dataset! Maximize accuracy.

In [None]:
diamonds = sns.load_dataset('diamonds')

In [None]:
diamonds.head()