In [None]:
# This notebook requires scikit-learn version 0.23.1 or later for some features.
# If you are using Google Colab, uncomment the line below, run it, and restart runtime.

# !pip install --upgrade scikit-learn

## Introduction to machine learning in Python with scikit-learn

### Instructor: Fred Feng (fredfeng@umich.edu)

***

### Prerequisites

- #### Some familiarity with Python and its common libraries 
    - #### numpy, pandas
    - #### [Introduction to Data Analysis in Python](https://youtu.be/7IsFmtvBOyc) workshop


- #### Basic understanding of regression & classification


### Goal of this tutorial

- #### Give you a taste and some hands-on experience of doing machine learning with scikit-learn.


### [Scikit-learn](https://scikit-learn.org/)

- #### A machine learning library in Python
- #### Open source and free
- #### Implements a large number of common machine learning models
- #### Clean, uniform, and streamlined API
- #### Widely used across industries and academia


### What scikit-learn is *not* for

- #### In-depth statistical analysis, hypothesis testing

  - #### [StatsModels](https://www.statsmodels.org/)

- #### Deep learning, reinforcement learning

  - #### Keras (TensorFlow), PyTorch


***

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Part 1. Read in and explore the data. 

### Bank customer data for a marketing campaign [(Data source)](https://archive.ics.uci.edu/ml/datasets/Bank+Marketing)

In [None]:
# Location of the bank marketing CSV used throughout this notebook.
url = 'http://umich.edu/~fredfeng/workshops/bank.csv'

# pandas reads the CSV straight from the URL; no separate download step is needed.
df = pd.read_csv(url)

# Peek at the first rows to check that the data loaded as expected.
df.head()

In [None]:
df.tail()

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
df.isnull().sum()

### Our goal is to develop a model that predicts whether or not a customer will subscribe to the service.

In [None]:
df['subscribed'].value_counts()

In [None]:
# Average of every numeric feature within each target class.
# numeric_only=True is needed on pandas >= 2.0: text columns such as 'marital'
# are no longer dropped silently there, and .mean() raises a TypeError instead.
df.groupby('subscribed').mean(numeric_only=True)

## Part 2. A logistic regression model with a few numerical features

### Step 1. Specifying what features to include by constructing a <font color="red">feature matrix</font>

In [None]:
X = df[['age', 'balance', 'duration']]

X.head()

### Step 2. Specify the target (i.e., the output of the model)

In [None]:
y = df['subscribed']

y

# np.sum(y)

###  A golden rule: <font color="red">Models should never be tested on the same data they were trained on.</font>

### Step 3. Split the data into a <font color="green">train set</font> and a <font color="DarkViolet">test set</font>

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set.
# random_state fixes the shuffle so the split is reproducible across runs;
# stratify=y keeps the class proportions of y the same in both subsets.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=.2, 
                                                    random_state=99, 
                                                    stratify=y
                                                   )

X_test.head()

In [None]:
y_test.head()

### Step 4. Instantiate the [logistic regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) classifier

In [None]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()

### Step 5. Train the model by applying the `fit()` method to the <font color="green">train set</font>

In [None]:
clf.fit(X_train, y_train)

In [None]:
print('Coefficients:', clf.coef_, '\nIntercept:', clf.intercept_)

### Step 6. Make predictions from the trained model for the <font color="DarkViolet">test set</font>

In [None]:
y_pred = clf.predict(X_test)

### Step 7. Evaluate the model by comparing the predictions with the target in the <font color="DarkViolet">test set</font>

In [None]:
np.column_stack((y_test, y_pred))[:10]

#### Confusion matrix

In [None]:
from sklearn import metrics

# Rows of the confusion matrix are the true classes, columns are the predicted classes.
cm = metrics.confusion_matrix(y_test, y_pred)

cm

# Optional: wrap the matrix in a labelled DataFrame to make it easier to read.
# pd.DataFrame(data=cm, 
#              columns=['predict: 0', 'predict: 1'], 
#              index=['true: 0', 'true: 1'])

In [None]:
metrics.accuracy_score(y_test, y_pred)

In [None]:
print(metrics.classification_report(y_test, y_pred))

#### ROC curve

In [None]:
from sklearn.metrics import roc_curve

# Predicted probability of the positive class (second column of predict_proba).
y_pred_proba = clf.predict_proba(X_test)[:, 1]

# roc_curve also returns the thresholds, which are not needed for the plot.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

fontsize = 18

fig, ax = plt.subplots(figsize=(6, 5))

ax.plot(fpr, tpr)
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line

ax.set_xlabel('False positive rate', fontsize=fontsize)
ax.set_ylabel('True positive rate', fontsize=fontsize)
ax.set_title('ROC curve', fontsize=fontsize)

# Enlarge the tick labels on both axes.
for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
    tick_label.set_fontsize(fontsize)

ax.set_xlim([0, 1])
ax.set_ylim([0, 1])

plt.show()

### Cross-validation

![k-fold cross-validation](https://miro.medium.com/max/3115/1*me-aJdjnt3ivwAurYkB7PA.png)

[Image source](https://medium.com/@sebastiannorena/some-model-tuning-methods-bfef3e6544f0)

In [None]:
from sklearn.model_selection import KFold

# Plain 5-fold splitter.
cv = KFold(n_splits=5)

# Variants to try instead:
# cv = KFold(n_splits=5, shuffle=True)
# cv = KFold(n_splits=5, shuffle=True, random_state=99)

# Print the (train indices, test indices) pair of every fold for the first 15 rows.
for train_idx, test_idx in cv.split(X[:15]):
    print((train_idx, test_idx))

In [None]:
# for small or unbalanced data it's better to use stratified cross-validation

from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=88)

In [None]:
from sklearn.model_selection import cross_val_score

cross_val_score(clf, X, y, cv=cv, scoring='roc_auc')

### Lastly, we can use the model to make predictions for out-of-sample data

In [None]:
X_new = pd.DataFrame(data=[[43, 3285, 1721], 
                           [58, 5920, 255]],
                     columns=X.columns, 
                     index=['Tom', 'Jerry'])

X_new

In [None]:
clf.predict(X_new)

## Part 3. Feature preprocessing

In [None]:
X = df[['marital', 'day', 'duration', 'campaign', 'previous']]

In [None]:
X['marital'].value_counts()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=.2, 
                                                    random_state=99, 
                                                    stratify=y)

X_test.head()

### Preprocessing: [Imputation of missing values](https://scikit-learn.org/stable/modules/impute.html)

- ### Almost all real world data sets contain missing values.

- ### If we discard the row if it contains any missing values, we may end up losing a lot of data that may be valuable.

- ### A better strategy is to impute the missing values.


### Missing <font color="red">numerical</font> values can be imputed with the mean (by default) or median of the column.

In [None]:
X.isnull().sum() 

In [None]:
X[['day', 'campaign']].head()

In [None]:
X[['day', 'campaign']].mean()

In [None]:
from sklearn.impute import SimpleImputer

SimpleImputer().fit_transform(X[['day', 'campaign']])[:5]

### Missing <font color="red">categorical</font> values can be imputed with the <font color="green">most frequent</font> of the column.

In [None]:
X['marital'].head()

In [None]:
X['marital'].value_counts()

In [None]:
marital_imputed = SimpleImputer(strategy='most_frequent').fit_transform(X[['marital']])

marital_imputed[:5]

### Preprocessing: Encode categorical features using [One Hot Encoding](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html).

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Ask for a dense NumPy array so the encoded columns can be read directly.
# The keyword is `sparse_output` since scikit-learn 1.2; the old spelling
# `sparse` was removed in 1.4 and raises a TypeError there.
enc = OneHotEncoder(sparse_output=False)

# One output column per category found in the (already imputed) 'marital' column.
enc.fit_transform(marital_imputed)

In [None]:
enc.categories_

### Preprocessing: Scale the features

### [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html): scale a feature to zero mean and unit variance

### [MinMaxScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html): scale a feature to a given range (from 0 to 1 by default)

In [None]:
X['duration'].head()

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

StandardScaler().fit(X[['duration']]).transform(X[['duration']])

In [None]:
StandardScaler().fit_transform(X[['duration']])

In [None]:
MinMaxScaler().fit_transform(X[['duration']])

### [FunctionTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html): Constructs a transformer from a custom function.

### Here we make a logarithm transformation based on the  numpy [`log1p`](https://numpy.org/doc/stable/reference/generated/numpy.log1p.html) function.

In [None]:
from sklearn.preprocessing import FunctionTransformer

log_transformer = FunctionTransformer(np.log1p)

log_transformer.fit_transform(df[['duration']])

## Part 4. Machine learning pipelines

<img src="https://cdn.pixabay.com/photo/2014/10/30/23/04/pressure-water-line-509871_1280.jpg" alt="pipeline" style="width: 600px;"/>

<br>

<img src="http://umich.edu/~fredfeng/workshops/images/pipeline.png" alt="pipeline" style="width: 600px;"/>

### Why pipelines?

- ### It simplifies and automates the machine learning workflow.

- ### Separation of concerns: it separates the workflow into modular and reusable parts.

- ### It makes it harder to make mistakes.

    - ### e.g., it ensures that the same preprocessing is applied to the train, test, and out-of-sample data

### "*If you are not using a pipeline, you are probably doing it wrong.*"

***

### Let's do the following preprocessing steps

- ### "marital": imputation --> one-hot encoding
- ### "duration": log transformation
- ### "day" & "campaign": imputation --> standardization

### A [column transformer](https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_transformer.html#sklearn.compose.make_column_transformer) allows different columns to be transformed separately in parallel. 

<img src="https://cdn.pixabay.com/photo/2020/05/08/16/37/pipes-5146458_1280.jpg" alt="pipeline" style="width: 600px;"/>

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array.
# `sparse_output=False` (scikit-learn >= 1.2) replaces `sparse=False`,
# which was removed in scikit-learn 1.4.
imp_ohe = make_pipeline(SimpleImputer(strategy='most_frequent'),
                        OneHotEncoder(sparse_output=False)
                       )

# Numerical branch: mean imputation (the SimpleImputer default), then standardization.
imp_std = make_pipeline(SimpleImputer(),
                        StandardScaler()
                       )

# Send each group of columns through its own transformer.
# Columns that are not listed (here 'previous') are passed through unchanged.
preprocessor = make_column_transformer((imp_ohe, ['marital']),
                                       (FunctionTransformer(np.log1p), ['duration']),
                                       (imp_std, ['day', 'campaign']),
                                       remainder='passthrough')

### A [column selector](https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_selector.html) allows selecting columns by data type or name pattern. 

### It can be useful when the data contains many features.

In [None]:
from sklearn.compose import make_column_selector

# Same preprocessor as before, except that the categorical columns are selected
# by dtype (every object-typed column) instead of being listed by name.
preprocessor = make_column_transformer((imp_ohe, make_column_selector(dtype_include=object)),
                                       (FunctionTransformer(np.log1p), ['duration']),
                                       (imp_std, ['day', 'campaign']), 
                                        remainder='passthrough')

In [None]:
X_train.head()

In [None]:
X_train.dtypes


In [None]:
np.set_printoptions(edgeitems=10, suppress=True)
preprocessor.fit_transform(X_train).round(3)

In [None]:
# Chain the preprocessing and the classifier into a single estimator.
# NOTE: `clf` is the LogisticRegression instance created earlier; the pipeline holds
# that same object, so fitting the pipeline refits `clf` on the preprocessed features.
pipe = make_pipeline(preprocessor, clf)

# A single call fits the preprocessing steps and then the classifier on the train set.
pipe.fit(X_train, y_train)

### Visualize a pipeline

In [None]:
from sklearn import set_config
set_config(display='diagram')

pipe

### Use the pipeline to make predictions for out-of-sample data

In [None]:
X_new = pd.DataFrame(data=[['single', 20, 16, 5, 4], 
                           ['married', 7, 352, 2, 0]],
                     columns=X.columns, 
                     index=['Tom', 'Jerry'])

X_new

In [None]:
pipe.predict(X_new)

### Cross-validation with a pipeline

In [None]:
cross_val_score(pipe, X, y, cv=5, scoring='roc_auc')

### Now let's try another classification model [k-nearest neighbors](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html).

In [None]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=3)

In [None]:
pipe = make_pipeline(preprocessor, clf) # we reuse the same preprocessor from earlier

pipe

In [None]:
pipe.fit(X_train, y_train)

In [None]:
cross_val_score(pipe, X, y, cv=5, scoring='roc_auc')

### [Grid search cross-validation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) for hyperparameter tuning and model selection

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
pipe.get_params()

In [None]:
# Keys follow the pattern <step>__<sub-step>__<parameter>, using the auto-generated
# step names listed by pipe.get_params().
# 'pipeline-2' is the second pipeline inside the column transformer, i.e. the
# imputer + scaler applied to 'day' and 'campaign'.
# A whole step can be swapped as well: the scaler is replaced by another scaler
# or skipped entirely with 'passthrough'.
param_grid = {
    'columntransformer__pipeline-2__standardscaler': [StandardScaler(), MinMaxScaler(), 'passthrough'],
    'columntransformer__pipeline-2__simpleimputer__strategy': ['mean', 'median'],
    'kneighborsclassifier__n_neighbors': [3, 5, 7], 
    'kneighborsclassifier__metric': ['euclidean', 'manhattan'], 
    'kneighborsclassifier__weights': ['uniform', 'distance']
}

In [None]:
# Evaluate every combination in param_grid with 5-fold cross-validation, scored by ROC AUC.
# n_jobs=-1 runs the fits in parallel on all available CPU cores.
gs = GridSearchCV(pipe, param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=-1)

# The search is run on the train set only.
gs.fit(X_train, y_train)

# Best-scoring combination of hyperparameters.
gs.best_params_

### In the above example, we set up 

- ### 3 options for the standardization
- ### 2 options for the imputation
- ### 3 options for the number of neighbors in knn
- ### 2 options for the distance metric in knn
- ### 2 options for the weights in knn

### That is a total of $3\times2\times3\times2\times2=72$ model configurations.

### With a 5-fold cross-validation for each, we did a total of $72\times5=360$ model fittings to the data.

***

### A summary of scikit-learn's uniform APIs 

- ### scikit-learn's main API is implemented around so-called estimators.

- ### An estimator is any object that learns from data (e.g., a regression or classification model, or a transformer such as a scaler).

<br>

- ### Fit estimator to data: `estimator.fit(X, [y])`
- ### Transform data using fitted estimator: `estimator.transform(X)`
    - ### e.g., preprocessing, dimensionality reduction
- ### Predict using fitted estimator: `estimator.predict(X)`
    - ### e.g., regression, classification, clustering

### Further resources

- ### [scikit-learn official examples page](https://scikit-learn.org/stable/auto_examples/index.html)

