In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from tpot import TPOTClassifier


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score


### Loading the blood donations data


In [5]:
# Read the blood-donation records from the local `transfusion.data` file
# (parsed as CSV) and display the resulting frame.
df = pd.read_csv('transfusion.data')
df

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


### Inspecting transfusion DataFrame


In [6]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


### Creating target column


In [11]:
# Give the verbose label column the short name `target`.
# `rename` returns a new frame and, by default, ignores labels that are not
# present, so this cell can be re-run safely; the previous copy-then-drop
# approach raised KeyError on a second run because the source column was
# already gone. The label is the last column, so the column order stays the
# same as before.
df = df.rename(columns={'whether he/she donated blood in March 2007': 'target'})

In [12]:
# Display the frame to check that the label column is now called `target`.
df

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),target
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


### Checking target incidence


In [14]:
# Count how many rows fall in each class of `target`.
df['target'].value_counts()

target
0    570
1    178
Name: count, dtype: int64

### Splitting transfusion into train and test datasets


In [20]:
# Separate the predictors from the label.
X = df.drop(columns='target')
y = df['target']

# Hold out 33% of the rows for evaluation; the fixed seed keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

### Selecting model using TPOT


In [25]:
# Run TPOT's pipeline search on the training split.
# `random_state` is fixed (same seed as the train/test split) so the search
# selects the same pipeline and reports the same scores on every run;
# without it each execution of this cell can pick a different pipeline.
tpot = TPOTClassifier(
    generations=5,
    verbosity=2,
    random_state=42,
)

tpot.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/600 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8124158415841585

Generation 2 - Current best internal CV score: 0.8143762376237624

Generation 3 - Current best internal CV score: 0.8143762376237624

Generation 4 - Current best internal CV score: 0.8184158415841585

Generation 5 - Current best internal CV score: 0.8184158415841585

Best pipeline: LogisticRegression(BernoulliNB(ZeroCount(RobustScaler(input_matrix)), alpha=0.01, fit_prior=True), C=20.0, dual=False, penalty=l2)


In [40]:
# Score the fitted pipeline on the held-out split: take column 1 of the
# predicted probabilities and compute ROC AUC against the true labels.
test_proba = tpot.predict_proba(X_test)
tpot_auc_score = roc_auc_score(y_test, test_proba[:, 1])
print(f'\nAUC score: {tpot_auc_score:.4f}')


AUC score: 0.7262


### Checking the variance


In [47]:
# Per-feature variance of the training predictors, rounded to 2 decimals.
X_train.var().round(2)

Recency (months)              71.43
Frequency (times)             40.33
Monetary (c.c. blood)    2520551.40
Time (months)                620.94
dtype: float64

### Log normalization


In [71]:
# Build a new frame in which the raw `Monetary (c.c. blood)` column is
# replaced by its natural log, stored as `Monetary_Log`. `df` itself is left
# untouched, and the new column is appended last.
df_new = (
    df
    .drop(columns='Monetary (c.c. blood)')
    .assign(Monetary_Log=np.log(df['Monetary (c.c. blood)']))
)

df_new


Unnamed: 0,Recency (months),Frequency (times),Time (months),target,Monetary_Log
0,2,50,98,1,9.433484
1,0,13,28,1,8.086410
2,1,16,35,1,8.294050
3,2,20,45,1,8.517193
4,1,24,77,0,8.699515
...,...,...,...,...,...
743,23,2,38,0,6.214608
744,21,2,52,0,6.214608
745,23,3,62,0,6.620073
746,39,1,39,0,5.521461


In [76]:
# Predictors now come from the log-transformed frame; `y` is reused from the
# earlier split cell.
X = df_new.drop(columns='target')

# Same test fraction and seed as the first split.
X_train_normed, X_test_normed, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Per-feature variance of the transformed training predictors, 3 decimals.
X_train_normed.var().round(3)

Recency (months)      71.434
Frequency (times)     40.329
Time (months)        620.942
Monetary_Log           0.872
dtype: float64