In [4]:
# Setup
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from scipy.io import arff

In [6]:
# Load Data
data = arff.loadarff('bone-marrow.arff')
df = pd.DataFrame(data[0])
df.drop(columns=['Disease'], inplace=True)
df.head()

Unnamed: 0,Recipientgender,Stemcellsource,Donorage,Donorage35,IIIV,Gendermatch,DonorABO,RecipientABO,RecipientRh,ABOmatch,...,extcGvHD,CD34kgx10d6,CD3dCD34,CD3dkgx10d8,Rbodymass,ANCrecovery,PLTrecovery,time_to_aGvHD_III_IV,survival_time,survival_status
0,b'1',b'1',22.830137,b'0',b'1',b'0',b'1',b'1',b'1',b'0',...,b'1',7.2,1.33876,5.38,35.0,19.0,51.0,32.0,999.0,0.0
1,b'1',b'0',23.342466,b'0',b'1',b'0',b'-1',b'-1',b'1',b'0',...,b'1',4.5,11.078295,0.41,20.6,16.0,37.0,1000000.0,163.0,1.0
2,b'1',b'0',26.394521,b'0',b'1',b'0',b'-1',b'-1',b'1',b'0',...,b'1',7.94,19.01323,0.42,23.4,23.0,20.0,1000000.0,435.0,1.0
3,b'0',b'0',39.684932,b'1',b'1',b'0',b'1',b'2',b'1',b'1',...,b'?',4.25,29.481647,0.14,50.0,23.0,29.0,19.0,53.0,1.0
4,b'0',b'1',33.358904,b'0',b'0',b'0',b'1',b'2',b'0',b'1',...,b'1',51.85,3.972255,13.05,9.0,14.0,14.0,1000000.0,2043.0,0.0


In [8]:
# Convert all columns to numeric and coerce errors to null values
for c in df.columns:
    df[c] = pd.to_numeric(df[c], errors='coerce')

In [9]:
# Ensure binary columns are hot encoded
for c in df.columns[df.nunique() == 2]:
    df[c] = (df[c] == 1) * 1.0

In [10]:
# 1. Calculate the number of unique values for each column
print('Count of unique values in each column:')
df.nunique()

Count of unique values in each column:


Recipientgender           2
Stemcellsource            2
Donorage                187
Donorage35                2
IIIV                      2
Gendermatch               2
DonorABO                  4
RecipientABO              4
RecipientRh               2
ABOmatch                  2
CMVstatus                 4
DonorCMV                  2
RecipientCMV              2
Riskgroup                 2
Txpostrelapse             2
Diseasegroup              2
HLAmatch                  4
HLAmismatch               2
Antigen                   4
Alel                      5
HLAgrI                    7
Recipientage            125
Recipientage10            2
Recipientageint           3
Relapse                   2
aGvHDIIIIV                2
extcGvHD                  2
CD34kgx10d6             183
CD3dCD34                182
CD3dkgx10d8             163
Rbodymass               130
ANCrecovery              18
PLTrecovery              50
time_to_aGvHD_III_IV     28
survival_time           174
survival_status     

In [12]:
# 2. Set target, survival_status,as y; features (dropping survival status and time) as X
y = df.survival_status
X = df.drop(columns=['survival_status', 'survival_time'])

In [13]:
# 3. Define lists of numeric and categorical columns based on number of unique values
num_cols = X.columns[X.nunique() > 7]
cat_cols = X.columns[X.nunique() <= 7]
num_cols, cat_cols

(Index(['Donorage', 'Recipientage', 'CD34kgx10d6', 'CD3dCD34', 'CD3dkgx10d8',
        'Rbodymass', 'ANCrecovery', 'PLTrecovery', 'time_to_aGvHD_III_IV'],
       dtype='object'),
 Index(['Recipientgender', 'Stemcellsource', 'Donorage35', 'IIIV',
        'Gendermatch', 'DonorABO', 'RecipientABO', 'RecipientRh', 'ABOmatch',
        'CMVstatus', 'DonorCMV', 'RecipientCMV', 'Riskgroup', 'Txpostrelapse',
        'Diseasegroup', 'HLAmatch', 'HLAmismatch', 'Antigen', 'Alel', 'HLAgrI',
        'Recipientage10', 'Recipientageint', 'Relapse', 'aGvHDIIIIV',
        'extcGvHD'],
       dtype='object'))

In [14]:
# 4. Print columns with missing values
print('Columns with missing values:')
X.columns[X.isnull().sum() > 0]

Columns with missing values:


Index(['RecipientABO', 'CMVstatus', 'Antigen', 'Alel', 'CD3dCD34',
       'CD3dkgx10d8', 'Rbodymass'],
      dtype='object')

In [16]:
# 5. Split data into train/test split
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

In [17]:
# 6. Create categorical preprocessing pipeline
# Using mode to fill in missing values and OHE
cat_vals = Pipeline([
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("ohe", OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'))
])
cat_vals

In [18]:
# 7. Create numerical preprocessing pipeline
# Using mean to fill in missing values and standard scaling of features
num_vals = Pipeline([
    ("imputer", SimpleImputer(strategy='mean')),
    ("scale", StandardScaler())
])
num_vals

In [19]:
# 8. Create column transformer that will preprocess the numerical and categorical features separately
preprocess = ColumnTransformer(transformers=[
    ('cat_process', cat_vals, cat_cols),
    ('num_process', num_vals, num_cols)
])

In [20]:
# 9. Create a pipeline with preprocess, PCA, and a logistic regresssion model
pipeline = Pipeline([
    ("preprocess", preprocess),
    ("pca", PCA()),
    ("clf", LogisticRegression())
])

In [21]:
# 10. Fit the pipeline on the training data
pipeline.fit(x_train, y_train)

In [23]:
#Predict the pipeline on the test data
print("Pipeline Accuracy Test Set:")
pipeline.score(x_test, y_test)

Pipeline Accuracy Test Set:


0.7894736842105263

In [26]:
# 11. Define search space of hyperparameters
search_space = [{'clf':[LogisticRegression()],
                     'clf__C': np.logspace(-4, 2, 10),
                'pca__n_components':np.linspace(30,37,3).astype(int)},
                   ]

In [27]:
#12. Search over hyperparameters above to optimize pipeline and fit
gs = GridSearchCV(pipeline, search_space, cv=5)
gs.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [29]:
# 13. Save the best estimator from the gridsearch and print attributes and final accuracy on test set
best_model = gs.best_estimator_

In [30]:
# 14. Print attributes of best_model
print('The best classification model is:')
best_model.named_steps['clf']

The best classification model is:


In [31]:
print('The hyperparameters of the best classification model are:')
best_model.named_steps['clf'].get_params()

The hyperparameters of the best classification model are:


{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [32]:
print('The number of components selected in the PCA step are:')
best_model.named_steps['pca'].n_components

The number of components selected in the PCA step are:


37

In [33]:
# 15. Print final accuracy score
print("Best Model Accuracy Test Set")
best_model.score(x_test, y_test)

Best Model Accuracy Test Set


0.8157894736842105