> Objective:<br>
> 1) Illustrate how to use TPOT to automate the Machine Learning pipeline <br>
> 2) Predict whether a donor in Hsin-Chu City donated blood in March 2007 <br>
> Strategy: Use TPOT to select the best model; in this case it suggests LogisticRegression <br>
>           Note: TPOT may need to be installed first, e.g. with "conda install -c conda-forge tpot" <br>
> Data source: UCI Machine Learning Repository - https://archive.ics.uci.edu/ml/datasets/Blood+Transfusion+Service+Center <br>

In [72]:
import pandas as pd

# Load the blood-transfusion records; the file is expected in the working directory
data_path = 'transfusion.data'
df = pd.read_csv(data_path)

# Preview the first three rows
df.head(3)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1


In [73]:
# Summary of the dataframe: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
Recency (months)                              748 non-null int64
Frequency (times)                             748 non-null int64
Monetary (c.c. blood)                         748 non-null int64
Time (months)                                 748 non-null int64
whether he/she donated blood in March 2007    748 non-null int64
dtypes: int64(5)
memory usage: 29.3 KB


In [74]:
# Give the label column the short name 'target'; rename() returns a new
# frame, which is rebound to df
df = df.rename(columns={'whether he/she donated blood in March 2007': 'target'})

# Confirm the new column name
df.head(2)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),target
0,2,50,12500,98,1
1,0,13,3250,28,1


In [75]:
# Share of each class in the target column, rounded to 3 decimals
target_share = df['target'].value_counts(normalize=True)
target_share.round(3)

0    0.762
1    0.238
Name: target, dtype: float64

>  Generate training/testing data <br>

In [76]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the label column
features = df.drop(columns='target')
labels = df.target

# Hold out 25% of the rows for testing; stratifying on the labels keeps
# the class proportions the same in both splits
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.25, random_state=42, stratify=labels
)

> Selecting model using TPOT <br>

In [77]:
from tpot import TPOTClassifier

# Search settings: 5 generations with a population of 20 over the
# 'TPOT light' configuration, scoring candidates by ROC AUC
tpot_settings = dict(
    generations=5,
    population_size=20,
    verbosity=2,
    scoring='roc_auc',
    random_state=44,
    disable_update_check=True,
    config_dict='TPOT light',
)
tpot = TPOTClassifier(**tpot_settings)

# Run the search on the training split; as the cell's last expression,
# the result of fit() is what the cell displays
tpot.fit(X_train, y_train)

HBox(children=(IntProgress(value=0, description='Optimization Progress', max=120, style=ProgressStyle(descript…

Generation 1 - Current best internal CV score: 0.7424354492343548
Generation 2 - Current best internal CV score: 0.7424354492343548
Generation 3 - Current best internal CV score: 0.7424354492343548
Generation 4 - Current best internal CV score: 0.7433977184592779
Generation 5 - Current best internal CV score: 0.7433977184592779

Best pipeline: LogisticRegression(input_matrix, C=0.5, dual=False, penalty=l2)


TPOTClassifier(config_dict='TPOT light', crossover_rate=0.1, cv=5,
        disable_update_check=True, early_stop=None, generations=5,
        max_eval_time_mins=5, max_time_mins=None, memory=None,
        mutation_rate=0.9, n_jobs=1, offspring_size=None,
        periodic_checkpoint_folder=None, population_size=20,
        random_state=44, scoring='roc_auc', subsample=1.0,
        template='RandomTree', use_dask=False, verbosity=2,
        warm_start=False)

In [78]:
from sklearn.metrics import roc_auc_score

# Second column of predict_proba on the held-out set
# (presumably the probability of class 1 — confirm against tpot.classes_)
tpot_test_proba = tpot.predict_proba(X_test)[:, 1]

# Test-set AUC of the pipeline TPOT selected
tpot_auc_score = roc_auc_score(y_test, tpot_test_proba)
print(f'\nAUC score: {tpot_auc_score:.4f}')



AUC score: 0.7850


In [79]:
# List the steps of the fitted pipeline, numbered from 1
print('\nBest pipeline steps:')
for position, step in enumerate(tpot.fitted_pipeline_.steps, start=1):
    # Each step is a (name, estimator) pair; only the estimator is shown
    estimator = step[1]
    print(f'{position}. {estimator}')


Best pipeline steps:
1. LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


> Check the variance <br>

In [80]:
# Per-feature variance of the training set, rounded to 3 decimals;
# used to spot features whose scale is far larger than the others
X_train.var().round(3)

Recency (months)              66.929
Frequency (times)             33.830
Monetary (c.c. blood)    2114363.700
Time (months)                611.147
dtype: float64

In [81]:
# Based on the variances above, Monetary (c.c. blood) has a variance that is
# orders of magnitude larger than the other features, so it can carry
# disproportionate weight in the model.
# This feature is therefore normalized using log normalization.

In [82]:
import numpy as np

# Column whose scale is reduced with a natural log
col_to_normalize = 'Monetary (c.c. blood)'

# For each split, build a new frame with 'monetary_log' appended and the
# original column removed; X_train and X_test themselves are left untouched
X_train_normed, X_test_normed = (
    frame.assign(monetary_log=np.log(frame[col_to_normalize]))
         .drop(columns=col_to_normalize)
    for frame in (X_train, X_test)
)

# Variance of the training features after normalization
X_train_normed.var().round(3)

Recency (months)      66.929
Frequency (times)     33.830
Time (months)        611.147
monetary_log           0.837
dtype: float64

> Training the logistic regression model <br>

In [83]:
from sklearn import linear_model

# Logistic regression fitted on the log-normalized training features
logreg = linear_model.LogisticRegression(solver='liblinear', random_state=44)
logreg.fit(X_train_normed, y_train)

# AUC score for the logistic regression model on the normalized test set
logreg_test_proba = logreg.predict_proba(X_test_normed)[:, 1]
logreg_auc_score = roc_auc_score(y_test, logreg_test_proba)
print(f'\nAUC score: {logreg_auc_score:.4f}')


AUC score: 0.7891


In [84]:
from operator import itemgetter

# Test AUC for each model, keyed by model name
auc_by_model = {'tpot': tpot_auc_score, 'logreg': logreg_auc_score}

# Rank the (name, score) pairs by score, best first
sorted(auc_by_model.items(), key=itemgetter(1), reverse=True)

[('logreg', 0.7890972663699937), ('tpot', 0.7849650349650349)]

> Conclusion:<br>
> Automatic model selection with TPOT, scored by AUC, is demonstrated: the TPOT-selected pipeline reaches a test AUC of 0.7850, and logistic regression on the log-normalized features reaches 0.7891.<br>