# Data Preparation

In [1]:
#import libraries

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

#smote data rebalancing
from imblearn.over_sampling import SMOTE

#normalization
from sklearn import preprocessing

#classification
from sklearn.model_selection import train_test_split
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

# Loading Dataset

In [2]:
# https://archive-beta.ics.uci.edu/ml/datasets/cervical+cancer+risk+factors
"""
The dataset was collected at 'Hospital Universitario de Caracas' in Caracas, Venezuela.
The dataset comprises demographic information, habits, and historic medical records of 858 patients.
Several patients decided not to answer some of the questions because of privacy concerns (missing values).
"""

rf = pd.read_csv('risk_factors_cervical_cancer.csv',encoding='utf8')
rf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 36 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Age                                 858 non-null    int64 
 1   Number of sexual partners           858 non-null    object
 2   First sexual intercourse            858 non-null    object
 3   Num of pregnancies                  858 non-null    object
 4   Smokes                              858 non-null    object
 5   Smokes (years)                      858 non-null    object
 6   Smokes (packs/year)                 858 non-null    object
 7   Hormonal Contraceptives             858 non-null    object
 8   Hormonal Contraceptives (years)     858 non-null    object
 9   IUD                                 858 non-null    object
 10  IUD (years)                         858 non-null    object
 11  STDs                                858 non-null    object

# Covert data to usable datatypes

In [3]:
# replace missing values with NaN
rf = rf.replace('?',np.nan)

# covert everything to float64, some classes will be converted to bool once missing values are taken care of
for label, col in rf.iteritems():
    rf[label] = pd.to_numeric(col, errors='coerce')

#TODO: Age is still int after this, why?

# Examine data

## Check missing values per column

In [4]:
percent_missing = rf.isnull().sum() * 100 / len(rf)
missing_value_df = pd.DataFrame({'column_name': rf.columns,
                                 'percent_missing': percent_missing})
missing_value_df

Unnamed: 0,column_name,percent_missing
Age,Age,0.0
Number of sexual partners,Number of sexual partners,3.030303
First sexual intercourse,First sexual intercourse,0.815851
Num of pregnancies,Num of pregnancies,6.526807
Smokes,Smokes,1.515152
Smokes (years),Smokes (years),1.515152
Smokes (packs/year),Smokes (packs/year),1.515152
Hormonal Contraceptives,Hormonal Contraceptives,12.587413
Hormonal Contraceptives (years),Hormonal Contraceptives (years),12.587413
IUD,IUD,13.636364


In [5]:
# remove columns with more than 20% missing
selection_columns = missing_value_df.loc[missing_value_df['percent_missing'] >= 20].iloc[:, 0]

for col in selection_columns:
    rf = rf.drop([col], axis=1)

## Investigate STDs (number) vs STDs: Number of diagnosis

In [6]:
# check if STDs (number) and STDs: Number of diagnosis are the same
STD_comparison = np.where((rf['STDs (number)']==rf['STDs: Number of diagnosis']), True, False)

# remove one of the columns if they're not the same and replace values in the other column with newly calc values
if not STD_comparison.all():
    rf = rf.drop(['STDs: Number of diagnosis'], axis=1)
    # create a new column with the sum of all STDs
    rf['STDs (number)'] = (rf['STDs:condylomatosis'] + rf['STDs:cervical condylomatosis'] +
        rf['STDs:vaginal condylomatosis'] + rf['STDs:vulvo-perineal condylomatosis'] +
        rf['STDs:syphilis'] + rf['STDs:pelvic inflammatory disease'] +
        rf['STDs:genital herpes'] + rf['STDs:molluscum contagiosum'] +
        rf['STDs:AIDS'] + rf['STDs:HIV'] +
        rf['STDs:Hepatitis B'] + rf['STDs:HPV'])

## Check missing values per row

In [7]:
missing_count = []
for idx in range(len(rf)):
    missing = rf.loc[[idx]].isna().sum().sum()
    missing_count.append(missing)

removed = 0
for idx, val in enumerate(missing_count):
    # Remove all rows where more than 15% of the data are missing
    # If 20% is chosen, 106 patients are excluded
    if val >= len(rf.columns)*0.15:
        rf = rf.drop(idx)
        removed+=1
print("{} number of rows were removed".format(removed))

107 number of rows were removed


## Data normalization

In [8]:
#Decision to not use normalization to keep it interpretable
"""column_names  = rf.columns.values.tolist()

#normalize all columns to 0 to 1
values = rf.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
values_scaled = min_max_scaler.fit_transform(values)
rf_norm = pd.DataFrame(values_scaled)

# rename the columns again
rf_norm.columns = column_names"""

'column_names  = rf.columns.values.tolist()\n\n#normalize all columns to 0 to 1\nvalues = rf.values #returns a numpy array\nmin_max_scaler = preprocessing.MinMaxScaler()\nvalues_scaled = min_max_scaler.fit_transform(values)\nrf_norm = pd.DataFrame(values_scaled)\n\n# rename the columns again\nrf_norm.columns = column_names'

## Data imputation
Following the methods described in:
Razali, Nazim & Mostafa, Salama & Mustapha, Aida & Abd Wahab, Mohd Helmy & Ibrahim, Nurul. (2020). Risk Factors of Cervical Cancer using Classification in Data Mining. Journal of Physics: Conference Series. 1529. 022102. 10.1088/1742-6596/1529/2/022102. 

"Missing values for attribute that have integer data type were filled using the sample mean while boolean
were filled using the sample mode."

In [9]:
rf_imp = rf.copy()
# store columns with specific data type
bool_columns = ['Smokes','Hormonal Contraceptives', 'IUD', 'STDs',
                'STDs:condylomatosis', 'STDs:cervical condylomatosis',
                'STDs:vaginal condylomatosis', 'STDs:vulvo-perineal condylomatosis',
                'STDs:syphilis', 'STDs:pelvic inflammatory disease', 
                'STDs:genital herpes', 'STDs:molluscum contagiosum',
                'STDs:AIDS', 'STDs:HIV', 'STDs:Hepatitis B', 'STDs:HPV',
                'Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller',
                'Citology', 'Biopsy']

# replace NaN with mode for columns with dtype bool
for label, col in rf_imp.iteritems():
    if label in bool_columns:
        columns_mode = col.mode()
        rf_imp[label] = col.fillna(columns_mode[0])

        #convert column to bool while we're at it
        rf_imp[label] = rf_imp[label].astype('bool')
    
float_columns = rf_imp.select_dtypes(include=['float64']).columns

# replace NaN with mean for columns with dtype float
for col in float_columns:
    columns_mean = rf_imp[col].mean()
    rf_imp[col] = rf_imp[col].fillna(columns_mean)

# Investigate class imbalances

In [24]:
# get number of patients per group
rf_imp.groupby('Biopsy').count()

Unnamed: 0_level_0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs:HIV,STDs:Hepatitis B,STDs:HPV,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology
Biopsy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
False,698,698,698,698,698,698,698,698,698,698,...,698,698,698,698,698,698,698,698,698,698
True,53,53,53,53,53,53,53,53,53,53,...,53,53,53,53,53,53,53,53,53,53


In [10]:
rf_imp['Schiller'].value_counts()

False    678
True      73
Name: Schiller, dtype: int64

In [11]:
rf_imp['Biopsy'].value_counts()

False    698
True      53
Name: Biopsy, dtype: int64

In [12]:
rf_imp['Hinselmann'].value_counts()

False    716
True      35
Name: Hinselmann, dtype: int64

In [13]:
rf_imp['Citology'].value_counts()

False    710
True      41
Name: Citology, dtype: int64

# Correlations between variables

In [14]:
# Plot non-bool parameters
#sns.pairplot(rf_imp[['Age','Number of sexual partners','Num of pregnancies','IUD (years)',
#                 'Hormonal Contraceptives (years)', 'STDs (number)', 'Smokes (years)']])

# Oversampling

## Implement Oversampling

In [15]:
# locate all columns where Biopsy is 1
minority_class = rf_imp.loc[rf_imp['Biopsy'] == 1]

print("Minority class count before oversampling: \n{}\n".format(rf['Biopsy'].value_counts()))

# oversample with factor 12.2
minority_class = minority_class.sample(frac=12.2, replace=True, random_state=1)

# concat dataframes
frames = [rf_imp, minority_class]
rf_oversampled = pd.concat(frames)

print("Minority class count after oversampling: \n{}".format(rf_oversampled['Biopsy'].value_counts()))

Minority class count before oversampling: 
0    698
1     53
Name: Biopsy, dtype: int64

Minority class count after oversampling: 
True     700
False    698
Name: Biopsy, dtype: int64


## SMOTE oversampling

In [16]:
# oversampling using synthetic minority oversampling technique (SMOTE)
# see: Chawla N V, Bowyer K W, Hall L O and Kegelmeyer W P 2002 Journal of Artificial Intelligence Research 16 321-357

# following tutorial here: https://www.kite.com/blog/python/smote-python-imbalanced-learn-for-oversampling/
rf_imp.to_csv('rf_original.csv', index=False, encoding='utf-8')


# for reproducibility purposes
seed = 100
# SMOTE number of neighbors
k = 1

rf = pd.read_csv('rf_original.csv', encoding='utf-8', engine='python')

# make a new df made of all the columns, except the target class
X = rf.loc[:, rf.columns != 'Biopsy']
y = rf.Biopsy
sm = SMOTE(sampling_strategy='auto', k_neighbors=k, random_state=seed)
X_res, y_res = sm.fit_resample(X, y)

rf_smote = pd.concat([pd.DataFrame(X_res), pd.DataFrame(y_res)], axis=1)

# rename the columns
column_names  = rf.columns.values.tolist()
rf_smote.columns = column_names

print("Minority class count after SMOTE oversampling: \n{}".format(rf_smote['Biopsy'].value_counts()))

Minority class count after SMOTE oversampling: 
False    698
True     698
Name: Biopsy, dtype: int64


# Pivot table

In [17]:
group = 'Dx:Cancer'
column_ = 'Dx:HPV'
agg_function = np.mean 

pivot_sum=pd.pivot_table(rf_smote,index=[group],values=[column_],aggfunc=agg_function)
pivot_sum=pivot_sum.reset_index()
pivot_sum

Unnamed: 0,Dx:Cancer,Dx:HPV
0,False,0.001591
1,True,0.985612


# Classification

## Splitting the dataset into traning and testing sets

In [18]:
# using oversampled data
# split dataset in features and target variable
feature_cols = rf_oversampled.columns[1:len(rf_oversampled.columns)-4]
X = rf_oversampled[feature_cols] # Features
y = rf_oversampled.Biopsy # Target variable

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test

## Explainable Boosting Classfier

In [19]:
# fit an ebm
ebm = ExplainableBoostingClassifier()
ebm.fit(X_train, y_train)

ExplainableBoostingClassifier(feature_names=['Number of sexual partners',
                                             'First sexual intercourse',
                                             'Num of pregnancies', 'Smokes',
                                             'Smokes (years)',
                                             'Smokes (packs/year)',
                                             'Hormonal Contraceptives',
                                             'Hormonal Contraceptives (years)',
                                             'IUD', 'IUD (years)', 'STDs',
                                             'STDs (number)',
                                             'STDs:condylomatosis',
                                             'STDs:cervical condylomatosis',
                                             'STDs:vaginal condylomatosis',
                                             'STDs:vulvo...
                                             'continuous', 'continuous',
  

In [20]:
# understand the model
ebm_global = ebm.explain_global()
show(ebm_global)

The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  import dash_html_components as html
The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`
  import dash_core_components as dcc
The dash_table package is deprecated. Please replace
`import dash_table` with `from dash import dash_table`

Also, if you're using any of the table format helpers (e.g. Group), replace 
`from dash_table.Format import Group` with 
`from dash.dash_table.Format import Group`
  import dash_table as dt


In [21]:
# understand individual predictions
ebm_local = ebm.explain_local(X_test, y_test)
show(ebm_local)

In [22]:
# Return the mean accuracy on the given test data and labels
acc = ebm.score(X_test, y_test)
print("The accuracy of the model is: {}".format(acc))

The accuracy of the model is: 0.9214285714285714


In [23]:
# fitting a ebm to the data where smote oversampling was used

# using smote data
# split dataset in features and target variable
feature_cols = rf_smote.columns[1:len(rf_smote.columns)-4]
X_smote = rf_smote[feature_cols] # Features
y_smote = rf_smote.Biopsy # Target variable

# Split dataset into training set and test set
X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_smote, y_smote, test_size=0.3, random_state=1) # 70% training and 30% test

# fit an ebm
ebm_smote = ExplainableBoostingClassifier()
ebm_smote.fit(X_train_smote, y_train_smote)

# understand the model
ebm_smote_global = ebm_smote.explain_global()
show(ebm_smote_global)

# Return the mean accuracy on the given test data and labels
acc_smote = ebm_smote.score(X_test_smote, y_test_smote)
print("The accuracy of the model is: {}".format(acc_smote))

The accuracy of the model is: 0.9498806682577565
