# Data Preparation

In [1]:
#import libraries
import tabulate
import pandas as pd
import seaborn as sns
import numpy as np

#preprocessing
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE, RandomOverSampler

#normalization
from sklearn import preprocessing

#classification
from sklearn.model_selection import cross_val_score, ShuffleSplit, cross_val_predict, KFold, cross_validate
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
# bool columns in dataframe
global bool_columns
bool_columns = ['Smokes','Hormonal Contraceptives', 'IUD', 'STDs',
                'STDs:condylomatosis', 'STDs:cervical condylomatosis',
                'STDs:vaginal condylomatosis', 'STDs:vulvo-perineal condylomatosis',
                'STDs:syphilis', 'STDs:pelvic inflammatory disease', 
                'STDs:genital herpes', 'STDs:molluscum contagiosum',
                'STDs:AIDS', 'STDs:HIV', 'STDs:Hepatitis B', 'STDs:HPV',
                'Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx']

# Function definitions

In [3]:
def meanImputation(df):
    """Data imputation using mean/mode of columns
    Following the methods described in: Razaliet al. (2020).
    Risk Factors of Cervical Cancer using Classification in Data Mining.
    Journal of Physics: Conference Series. 1529. 022102. 10.1088/1742-6596/1529/2/022102.

    Missing values for attribute that have integer data type were filled using the sample mean
    while boolean were filled using the sample mode.
    """
        
    # replace NaN with mode for columns with dtype bool
    for label, col in df.iteritems():
        if label in bool_columns:
            columns_mode = col.mode()
            # method 1
            df[label] = col.fillna(columns_mode[0])

            #convert column to bool while we're at it
            df[label] = df[label].astype('bool')

    float_columns = df.select_dtypes(include=['float64']).columns

    # replace NaN with mean for columns with dtype float
    for col in float_columns:
        columns_mean = df[col].mean()
        df[col] = df[col].fillna(columns_mean)
        
    return df

# Custom transformers

## Data imputation

Method 1: MeanImputationTransformer
Following the methods described in:
Razali, Nazim & Mostafa, Salama & Mustapha, Aida & Abd Wahab, Mohd Helmy & Ibrahim, Nurul. (2020). Risk Factors of Cervical Cancer using Classification in Data Mining. Journal of Physics: Conference Series. 1529. 022102. 10.1088/1742-6596/1529/2/022102. 

"Missing values for attribute that have integer data type were filled using the sample mean while boolean
were filled using the sample mode."

Method 2: UniqueValueImputationTransformer
Sets unique value -99 everywhere where data is missing. Value is chosen as it doesn't have any real-life meaning.

In [4]:
class MeanImputationTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        #print("Mean value imputation called.")
        X_ = X.copy() # create copy to avoid changes to original dataset
        X_ = meanImputation(X_)
        return X_

In [5]:
class UniqueValueImputationTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        #print("Unique value imputation called.")
        X_ = X.copy() # create copy to avoid changes to original dataset
        X_ = X_.fillna(-99)
        return X_

# Loading Dataset

In [6]:
# https://archive-beta.ics.uci.edu/ml/datasets/cervical+cancer+risk+factors
"""
The dataset was collected at 'Hospital Universitario de Caracas' in Caracas, Venezuela.
The dataset comprises demographic information, habits, and historic medical records of 858 patients.
Several patients decided not to answer some of the questions because of privacy concerns (missing values).
"""

rf = pd.read_csv('risk_factors_cervical_cancer.csv',encoding='utf8')
#rf.info()

# Covert data to usable datatypes

In [7]:
# replace missing values with NaN
rf = rf.replace('?',np.nan)

# covert everything to float64, some classes will be converted to bool once missing values are taken care of
for label, col in rf.iteritems():
    rf[label] = pd.to_numeric(col, errors='coerce')

#TODO: some things are still int after this and object types get transformed to bool automatically. But NaN values
#are not falsly converted to True so that should be okay.

# Examine data

## Check missing values per column

In [8]:
percent_missing = rf.isnull().sum() * 100 / len(rf)
missing_value_df = pd.DataFrame({'column_name': rf.columns,
                                 'percent_missing': percent_missing})

In [9]:
# remove columns with more than 20% missing
selection_columns = missing_value_df.loc[missing_value_df['percent_missing'] >= 20].iloc[:, 0]

for col in selection_columns:
    rf = rf.drop([col], axis=1)

## Check missing values per row

In [10]:
missing_count = []
for idx in range(len(rf)):
    missing = rf.loc[[idx]].isna().sum().sum()
    missing_count.append(missing)

removed = 0
for idx, val in enumerate(missing_count):
    # Remove all rows where more than 15% of the data are missing
    # If 20% is chosen, 106 patients are excluded
    if val >= len(rf.columns)*0.15:
        rf = rf.drop(idx)
        removed+=1
print("{} number of rows were removed".format(removed))

106 number of rows were removed


## Investigate class imbalances

In [11]:
# check if STDs (number) and STDs: Number of diagnosis are the same
STD_comparison = np.where((rf['STDs (number)']==rf['STDs: Number of diagnosis']), True, False)

# check manual calculation with STDs(number) if STDS (number) and STDs: Number of diagnosis not the same
if not STD_comparison.all():
    # create a new column with the sum of all STDs
    rf['STDs (number manual)'] = (rf['STDs:condylomatosis'] + rf['STDs:cervical condylomatosis'] +
        rf['STDs:vaginal condylomatosis'] + rf['STDs:vulvo-perineal condylomatosis'] +
        rf['STDs:syphilis'] + rf['STDs:pelvic inflammatory disease'] +
        rf['STDs:genital herpes'] + rf['STDs:molluscum contagiosum'] +
        rf['STDs:AIDS'] + rf['STDs:HIV'] +
        rf['STDs:Hepatitis B'] + rf['STDs:HPV'])
    
    # if they're the same drop the new column again
    manual_comp = np.where((rf['STDs (number)']==rf['STDs (number manual)']), True, False)
    if manual_comp.all():
        rf = rf.drop(['STDs (number manual)'], axis=1)

In [12]:
# we don't know exactly what this column
# as our goal is to create a transparent model we decided to exclude the column
rf = rf.drop(['STDs: Number of diagnosis'], axis=1)

# Classification

## Explainable boosting classifier metrics
## For pipelines with different oversampling and imputation methods

In [13]:
# split dataset in features and target variable
feature_cols = rf.columns[0:len(rf.columns)-4]
X = rf[feature_cols] # Features
y = rf.Biopsy # Target variable

In [14]:
cv = None # default 5-fold cross validation
#cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
#cv = KFold(n_splits=5)

In [15]:
# parameters for SMOTE oversampling
# for reproducibility purposes
seed = 100
# SMOTE number of neighbors
k = 1

In [16]:
# pipeline 1 using unique value imputation and random oversampling
pipe_1 = make_pipeline(SimpleImputer(fill_value=-99), RandomOverSampler('minority'), RandomForestClassifier())
scores_1 = cross_validate(pipe_1, X, y, cv=cv, scoring=['accuracy','roc_auc','f1', 'precision','recall'])

#y_pred_1 = cross_val_predict(pipe_1, X, y, cv=None) 
#confusion_matrix(y, y_pred_1)

In [17]:
# pipeline 2 using mean/mode value imputation and random oversampling

pipe_2 = make_pipeline(MeanImputationTransformer(), RandomOverSampler('minority'), RandomForestClassifier())
scores_2 = cross_validate(pipe_2, X, y, cv=cv, scoring=['accuracy','roc_auc','f1', 'precision','recall'])
scores_2

#y_pred_2 = cross_val_predict(pipe_2, X, y, cv=None) 
#confusion_matrix(y, y_pred_2)

{'fit_time': array([0.10909224, 0.10742903, 0.1089108 , 0.10862112, 0.10711884]),
 'score_time': array([0.02949715, 0.0291841 , 0.02995896, 0.02952194, 0.02939701]),
 'test_accuracy': array([0.91390728, 0.90728477, 0.89333333, 0.93333333, 0.9       ]),
 'test_roc_auc': array([0.73538961, 0.58668831, 0.55214286, 0.72535714, 0.63080445]),
 'test_f1': array([0.13333333, 0.        , 0.        , 0.375     , 0.11764706]),
 'test_precision': array([0.25      , 0.        , 0.        , 0.5       , 0.16666667]),
 'test_recall': array([0.09090909, 0.        , 0.        , 0.3       , 0.09090909])}

In [18]:
# pipeline 3 using unique value imputation and smote oversampling

sm = SMOTE(sampling_strategy='auto', k_neighbors=k, random_state=seed)

pipe_3 = make_pipeline(SimpleImputer(fill_value=-99), sm, RandomForestClassifier())
scores_3 = cross_validate(pipe_3, X, y, cv=cv, scoring=['accuracy','roc_auc','f1', 'precision','recall'])
scores_3


#y_pred_3 = cross_val_predict(pipe_3, X, y, cv=None) 
#confusion_matrix(y, y_pred_3)

{'fit_time': array([0.10962987, 0.10777307, 0.10829401, 0.10932899, 0.10658693]),
 'score_time': array([0.01461983, 0.01436996, 0.01413798, 0.01478791, 0.01434278]),
 'test_accuracy': array([0.9205298 , 0.91390728, 0.93333333, 0.94      , 0.88666667]),
 'test_roc_auc': array([0.69480519, 0.5474026 , 0.51357143, 0.71964286, 0.58338784]),
 'test_f1': array([0.14285714, 0.        , 0.        , 0.30769231, 0.10526316]),
 'test_precision': array([0.33333333, 0.        , 0.        , 0.66666667, 0.125     ]),
 'test_recall': array([0.09090909, 0.        , 0.        , 0.2       , 0.09090909])}

In [19]:
# pipeline 3 using mean/mode imputation and smote oversampling

sm = SMOTE(sampling_strategy='auto', k_neighbors=k, random_state=seed)

pipe_4 = make_pipeline(MeanImputationTransformer(), sm, RandomForestClassifier())
scores_4 = cross_validate(pipe_4, X, y, cv=cv, scoring=['accuracy','roc_auc','f1', 'precision','recall'])
scores_4

#y_pred_4 = cross_val_predict(pipe_4, X, y, cv=None) 
#confusion_matrix(y, y_pred_4)

{'fit_time': array([0.11711407, 0.11458683, 0.11444092, 0.11547709, 0.11526823]),
 'score_time': array([0.03003621, 0.0291059 , 0.02902794, 0.02936697, 0.03066492]),
 'test_accuracy': array([0.91390728, 0.90066225, 0.93333333, 0.94      , 0.9       ]),
 'test_roc_auc': array([0.63603896, 0.67727273, 0.47928571, 0.68678571, 0.55166776]),
 'test_f1': array([0.13333333, 0.        , 0.        , 0.30769231, 0.11764706]),
 'test_precision': array([0.25      , 0.        , 0.        , 0.66666667, 0.16666667]),
 'test_recall': array([0.09090909, 0.        , 0.        , 0.2       , 0.09090909])}

In [20]:
# All pipelines for Gradient Boosting

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, 
                                 max_depth=1, random_state=seed)
sm = SMOTE(sampling_strategy='auto', k_neighbors=k, random_state=seed)

pipe_5 = make_pipeline(SimpleImputer(fill_value=-99), RandomOverSampler('minority'), clf)
scores_5 = cross_validate(pipe_5, X, y, cv=cv, scoring=['accuracy','roc_auc','f1', 'precision','recall'])
scores_5

pipe_6 = make_pipeline(MeanImputationTransformer(), RandomOverSampler('minority'), clf)
scores_6 = cross_validate(pipe_6, X, y, cv=cv, scoring=['accuracy','roc_auc','f1', 'precision','recall'])

pipe_7 = make_pipeline(SimpleImputer(fill_value=-99), sm, clf)
scores_7 = cross_validate(pipe_7, X, y, cv=cv, scoring=['accuracy','roc_auc','f1', 'precision','recall'])


pipe_8 = make_pipeline(MeanImputationTransformer(), sm, clf)
scores_8 = cross_validate(pipe_8, X, y, cv=cv, scoring=['accuracy','roc_auc','f1', 'precision','recall'])

#y_pred_4 = cross_val_predict(pipe_4, X, y, cv=None) 
#confusion_matrix(y, y_pred_4)

**VISUALIZE RESULTS**

In [25]:
significant_numbers = 2 # used for rounding

combinations = [{'model':'Random forest', 'sampling':'random', 'imputation':'unique', 'scores':scores_1},
                {'model':'Random forest', 'sampling':'random', 'imputation':'mean/mode', 'scores':scores_2},
                {'model':'Random forest', 'sampling':'SMOTE', 'imputation':'unique', 'scores':scores_3},
                {'model':'Random forest', 'sampling':'SMOTE', 'imputation':'mean/mode', 'scores':scores_4},
                {'model':'Gradient boosting', 'sampling':'random', 'imputation':'unique', 'scores':scores_5},
                {'model':'Gradient boosting', 'sampling':'random', 'imputation':'mean/mode', 'scores':scores_6},
                {'model':'Gradient boosting', 'sampling':'SMOTE', 'imputation':'unique', 'scores':scores_7},
                {'model':'Gradient boosting', 'sampling':'SMOTE', 'imputation':'mean/mode', 'scores':scores_8}]

config_parameters = ['model', 'sampling', 'imputation']

headers = ["Model", "sampling", "imputation", "accuracy", "roc_auc", "f1", "precision", "recall"]
eval_metrics = ['test_accuracy', 'test_roc_auc', 'test_f1', 'test_precision', 'test_recall']

In [26]:
data = []
data.append(headers)

for combination in combinations:
    summary = [combination[param] for param in config_parameters]
    
    for metric in eval_metrics:
        mean = combination['scores'][metric].mean().round(significant_numbers)
        std = combination['scores'][metric].std().round(significant_numbers)
        summary.append(f'{mean} +- {std}')
        
    data.append(summary)
    
table = tabulate.tabulate(data, tablefmt='html')
table

0,1,2,3,4,5,6,7
Model,sampling,imputation,accuracy,roc_auc,f1,precision,recall
Random forest,random,unique,0.91 +- 0.01,0.62 +- 0.09,0.11 +- 0.11,0.17 +- 0.18,0.08 +- 0.07
Random forest,random,mean/mode,0.91 +- 0.01,0.65 +- 0.07,0.13 +- 0.14,0.18 +- 0.19,0.1 +- 0.11
Random forest,SMOTE,unique,0.92 +- 0.02,0.61 +- 0.08,0.11 +- 0.11,0.22 +- 0.25,0.08 +- 0.07
Random forest,SMOTE,mean/mode,0.92 +- 0.02,0.61 +- 0.08,0.11 +- 0.11,0.22 +- 0.24,0.08 +- 0.07
Gradient boosting,random,unique,0.76 +- 0.03,0.57 +- 0.04,0.14 +- 0.05,0.09 +- 0.04,0.28 +- 0.11
Gradient boosting,random,mean/mode,0.79 +- 0.03,0.55 +- 0.04,0.17 +- 0.07,0.12 +- 0.05,0.3 +- 0.1
Gradient boosting,SMOTE,unique,0.87 +- 0.04,0.53 +- 0.06,0.13 +- 0.05,0.15 +- 0.08,0.13 +- 0.05
Gradient boosting,SMOTE,mean/mode,0.85 +- 0.03,0.55 +- 0.04,0.14 +- 0.05,0.12 +- 0.04,0.17 +- 0.08
