In [23]:
#Importing all the necessary libraries for the project

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from operator import itemgetter
from tpot import TPOTClassifier

In [8]:
#Loading and inspecting the data
transfusion_df = pd.read_csv(r'C:\Users\user\Desktop\datasets\transfusion.data')
transfusion_df.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [10]:
#Inspecting the info from dataframe and summary statistics
transfusion_df.info()
print('\n')
transfusion_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB




Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
count,748.0,748.0,748.0,748.0,748.0
mean,9.506684,5.514706,1378.676471,34.282086,0.237968
std,8.095396,5.839307,1459.826781,24.376714,0.426124
min,0.0,1.0,250.0,2.0,0.0
25%,2.75,2.0,500.0,16.0,0.0
50%,7.0,4.0,1000.0,28.0,0.0
75%,14.0,7.0,1750.0,50.0,0.0
max,74.0,50.0,12500.0,98.0,1.0


In [11]:
#Renaming the target column
transfusion_df.rename(columns = {'whether he/she donated blood in March 2007': 'target'}, inplace = True)
transfusion_df.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),target
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [19]:
#Printing target incidence proportions and rounding output to 3 decimal places
transfusion_df['target'].value_counts(normalize = True).round(3)

0    0.762
1    0.238
Name: target, dtype: float64

In [22]:
# Splitting transfusion_df DataFrame into
# X_train, X_test, y_train and y_test datasets,
# stratifying on the `target` column
X_train, X_test, y_train, y_test = train_test_split(transfusion_df.drop(columns = 'target'), transfusion_df['target'], 
                                                    test_size =.3, stratify = transfusion_df['target'])
X_train.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months)
334,16,2,500,16
99,5,7,1750,26
116,2,7,1750,46
661,16,2,500,16
154,2,1,250,2


In [32]:
# Instantiating TPOTClassifier
tpot = TPOTClassifier(generations=5, population_size=25, verbosity=2, scoring='roc_auc',
                      disable_update_check=True,
                      config_dict='TPOT light')

tpot.fit(X_train, y_train)

# AUC score for tpot model
tpot_auc_score = roc_auc_score(y_test, tpot.predict_proba(X_test)[:, 1])
print(f'\nAUC score: {tpot_auc_score:.4f}')

# Printing best pipeline steps
print('\nBest pipeline steps:', end='\n')
for idx, (name, transform) in enumerate(tpot.fitted_pipeline_.steps, start=1):
    # Printing idx and transform
    print(f'{idx}. {transform}')

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=150.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 0.7353191779593968

Generation 2 - Current best internal CV score: 0.7353191779593968

Generation 3 - Current best internal CV score: 0.7404230243081132

Generation 4 - Current best internal CV score: 0.7404230243081132

Generation 5 - Current best internal CV score: 0.7404230243081132

Best pipeline: MultinomialNB(MaxAbsScaler(input_matrix), alpha=10.0, fit_prior=True)

AUC score: 0.7788

Best pipeline steps:
1. MaxAbsScaler(copy=True)
2. MultinomialNB(alpha=10.0, class_prior=None, fit_prior=True)


In [25]:
# Checking the X_train's variance and rounding the output to 3 decimal places
X_train.var().round(3)

Recency (months)              66.929
Frequency (times)             33.830
Monetary (c.c. blood)    2114363.700
Time (months)                611.147
dtype: float64

In [27]:
# Copying X_train and X_test into X_train_norm and X_test_norm
X_train_norm, X_test_norm = X_train.copy(), X_test.copy()

# Specify which column to normalize
logNorm_col = 'Monetary (c.c. blood)'

# Log normalization
for df in [X_train_norm, X_test_norm]:
    df['monetary_logNorm'] = np.log(df[logNorm_col])
    df.drop(columns = logNorm_col, inplace = True)
    
# Checking the variance for X_train_norm
X_train_norm.var().round(3)

Recency (months)      66.929
Frequency (times)     33.830
Time (months)        611.147
monetary_logNorm       0.837
dtype: float64

In [33]:
# Instantiating LogisticRegression
lr = LogisticRegression(solver = 'liblinear')

# Train the model
lr.fit(X_train_norm, y_train)
# AUC score for logistic regression model
lr_auc_score = roc_auc_score(y_test, lr.predict_proba(X_test_norm)[:, 1])
print(f'\nAUC score: {lr_auc_score:.3f}')


AUC score: 0.789


TPOT Classifier Score : 0.778

Logistic Regression Score : 0.789