In [None]:
import numpy as np
import pandas as pd
import sklearn
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from matplotlib import pyplot
from numpy import where

### Importing Raw Data

In [None]:
# Load the raw records from a CSV located relative to the notebook's working directory.
raw = pd.read_csv('RawData.csv')

In [None]:
# Remove the patient and appointment identifier columns; `raw` itself is left untouched.
cleaned = raw.drop(columns=['PatientId', 'AppointmentID'])

### Dealing with Date Time Data

In [None]:
# Parse both timestamp columns into pandas datetime values so the `.dt` accessor can be used on them.
for date_column in ("ScheduledDay", "AppointmentDay"):
    cleaned[date_column] = pd.to_datetime(cleaned[date_column])

In [None]:
# Derive the weekday name (e.g. "Monday") for the scheduling and appointment timestamps.
for source_column, weekday_column in (
    ("ScheduledDay", "ScheduledDoW"),
    ("AppointmentDay", "AppointmentDoW"),
):
    cleaned[weekday_column] = cleaned[source_column].dt.day_name()

In [None]:
# Whole days between scheduling and the appointment.
# The scheduling timestamp is normalized to midnight first so its time of day
# does not reduce the day count.
scheduled_date = cleaned["ScheduledDay"].dt.normalize()
lead_time = cleaned["AppointmentDay"] - scheduled_date
cleaned["DaysInBetween"] = lead_time.dt.days

In [None]:
# Month names for both timestamps.
# Year is not included because all data points are from 2016
# Quarter is not included because all data points are from appointments in months 4,5,6
cleaned['ScheduledM'] = cleaned['ScheduledDay'].dt.month_name()
cleaned['AppointmentM'] = cleaned['AppointmentDay'].dt.month_name()

# 0/1 weekend flags derived from the weekday-name columns.
weekend_names = ['Sunday', 'Saturday']
cleaned['AppointmentisWeekend'] = cleaned['AppointmentDoW'].isin(weekend_names).astype(int)
cleaned['ScheduledisWeekend'] = cleaned['ScheduledDoW'].isin(weekend_names).astype(int)

# 0 when the booking was made before noon, 1 otherwise.
scheduled_before_noon = cleaned['ScheduledDay'].dt.hour < 12
cleaned['ScheduledisPM'] = (~scheduled_before_noon).astype(int)

In [None]:
# Preview the first ten rows after the date/time feature engineering above.
cleaned.head(10)

### Checking for Data Entry Errors

In [None]:
# Keep only rows with a non-negative age; negative ages are treated as data entry errors.
age_is_valid = cleaned["Age"] >= 0
cleaned = cleaned.loc[age_is_valid]

In [None]:
# Keep only rows where the appointment falls on or after the scheduling date
# (a negative day gap means it was scheduled after the appointment).
cleaned = cleaned.loc[cleaned['DaysInBetween'].ge(0)]

In [None]:
# The raw timestamp columns are dropped now that the derived date features exist.
cleaned = cleaned.drop(columns=['ScheduledDay', 'AppointmentDay'])

In [None]:
# Prints True if at least one cell anywhere in the frame is missing, False otherwise.
print(cleaned.isna().to_numpy().any())

### One Hot Encoding for Categorical Variables

In [None]:
def encode_and_bind(original_dataframe, feature_to_encode):
    """One-hot encode a single column and swap it for its indicator columns.

    Parameters
    ----------
    original_dataframe : pandas.DataFrame
        Frame containing the column to encode. It is not modified.
    feature_to_encode : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with the indicator columns appended on the right and
        ``feature_to_encode`` removed. ``drop_first=True`` omits the first
        category level, so k categories yield k-1 indicator columns.
    """
    indicator_columns = pd.get_dummies(
        original_dataframe[[feature_to_encode]], drop_first=True
    )
    # Append the indicators first, then remove the source column.
    return pd.concat([original_dataframe, indicator_columns], axis=1).drop(
        columns=[feature_to_encode]
    )

In [None]:
# NOTE: this binds a second name to the same DataFrame object; it does not make a copy.
copyclean = cleaned

In [None]:
# Categorical columns to replace with one-hot indicator columns, in this order
# (the order determines where the indicator columns land in the frame).
features_to_encode = [
    'Gender',
    'Neighbourhood',
    'ScheduledDoW',
    'AppointmentDoW',
    'ScheduledM',
    'AppointmentM',
]

# Each pass returns a new frame with one more column encoded.
for categorical_column in features_to_encode:
    copyclean = encode_and_bind(copyclean, categorical_column)

In [None]:
# Rebind `cleaned` to the frame currently held in `copyclean`.
cleaned = copyclean

### Convert to Numpy Array

In [None]:
# Relocate the target column to the last position: pop removes it in place,
# and inserting at index len(columns) appends it on the right.
target = cleaned.pop("No-show")
cleaned.insert(loc=len(cleaned.columns), column="No-show", value=target)

In [None]:
# Relocate the gender indicator column to the first position (index 0).
gender = cleaned.pop("Gender_M")
cleaned.insert(loc=0, column="Gender_M", value=gender)

In [None]:
# for feature selection: positional column indices, printed as column names for reference.
# NOTE(review): these positions are hard-coded and depend on the exact column order
# produced by the cells above — confirm they still point at the intended features.
ind = [1, 2, 3, 4, 7, 8, 11, 44, 47, 50, 51, 72, 73, 77, 81, 105, 106, 107, 109, 110]

feature_names = (cleaned.columns[position] for position in ind)
for feature_name in feature_names:
    print(feature_name)

In [None]:
# Feature matrix X = every column except the last; target vector y = the last column.
# Both are converted to NumPy arrays.
feature_frame = cleaned.iloc[:, :-1]
target_series = cleaned.iloc[:, -1]
X = feature_frame.to_numpy()
y = target_series.to_numpy()

In [None]:
# Map the target's class labels to integer codes.
# The fitted encoder is kept so the mapping (labelencoder_y.classes_) stays available.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_y = LabelEncoder().fit(y)
y = labelencoder_y.transform(y)

### Splitting Data into Train and Test

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing.
# A fixed random_state makes the shuffle repeatable, so the arrays exported from
# this split are identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42
)

### Standardization (mean removal and variance scaling)

In [None]:
# Learn the per-feature mean and variance from the training split only, then apply
# the same transformation to both splits.
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

### Export as NPY files

In [None]:
# Write each array to "<name>.npy" in the working directory (np.save appends the extension).
exports = {
    'X_train_scaled': X_train_scaled,
    'y_train': y_train,
    'X_test_scaled': X_test_scaled,
    'y_test': y_test,
    'X_train': X_train,
    'X_test': X_test,
}
for file_stem, array in exports.items():
    np.save(file_stem, array)

### General Data Analysis

In [None]:
# Overall composition of the cleaned data, printed as fractions of all rows.
# Each mask is a vectorized boolean Series (no row-wise apply); summing a mask
# counts its True entries.
total_rows = cleaned.shape[0]
print("Total number of data samples: ", total_rows)
# Fraction of rows where the patient is male
male = cleaned['Gender_M'] == 1
m_rows = int(male.sum())
print("Male: ", m_rows/total_rows)
# Fraction of rows where the patient is female
female = cleaned['Gender_M'] == 0
f_rows = int(female.sum())
print("Female: ", f_rows/total_rows)
# Fraction of rows with a scholarship
scholarship = cleaned['Scholarship'] == 1
s_rows = int(scholarship.sum())
print("Scholarship: ", s_rows/total_rows)
# Fraction of rows with no scholarship
noscholarship = cleaned['Scholarship'] == 0
ns_rows = int(noscholarship.sum())
print("No scholarship: ", ns_rows/total_rows)

In [None]:
# No-show rate within each subgroup: rows that are in the group AND a no-show,
# divided by the group size computed in the previous cell.
# Masks are vectorized boolean Series combined with `&` instead of a row-wise apply.
is_noshow = cleaned['No-show'] == "Yes"
# P(no show | male)
male_noshow = (cleaned['Gender_M'] == 1) & is_noshow
m_noshow_rows = int(male_noshow.sum())
print("Male: ", m_noshow_rows/m_rows)
# P(no show | female)
female_noshow = (cleaned['Gender_M'] == 0) & is_noshow
f_noshow_rows = int(female_noshow.sum())
print("Female: ", f_noshow_rows/f_rows)
# P(no show | scholarship)
scholarship_noshow = (cleaned['Scholarship'] == 1) & is_noshow
s_noshow_rows = int(scholarship_noshow.sum())
print("Scholarship: ", s_noshow_rows/s_rows)
# P(no show | no scholarship)
noscholarship_noshow = (cleaned['Scholarship'] == 0) & is_noshow
ns_noshow_rows = int(noscholarship_noshow.sum())
print("No scholarship: ", ns_noshow_rows/ns_rows)

In [None]:
# Of the no-shows...
# Count no-show rows with a vectorized mask, then express each subgroup's
# no-show count (from the previous cell) as a share of all no-shows.
noshow = cleaned['No-show'] == "Yes"
total_noshow = int(noshow.sum())
print("Total no shows: ", total_noshow)
print("Proportion of no shows: ", total_noshow / total_rows)
# P(male | no show)
print("Male: ", m_noshow_rows / total_noshow)
# P(female | no show)
print("Female: ", f_noshow_rows / total_noshow)
# P(scholarship | no show)
print("Scholarship: ", s_noshow_rows / total_noshow)
# P(no scholarship | no show)
print("No scholarship: ", ns_noshow_rows / total_noshow)

### Preparing Data File for TOF-1
This variant of the dataset removes the socioeconomic features: the `Scholarship` column and every `Neighbourhood` indicator column.


In [None]:
# Current column labels (a pandas Index); scanned below to pick the columns to exclude.
colname = cleaned.columns

In [None]:
# Collect every column whose label contains "Scholarship" or "Neighbourhood"
# (substring match, so all Neighbourhood_* indicator columns are included).
col_to_delete = [
    name
    for name in colname
    if "Scholarship" in name or "Neighbourhood" in name
]

In [None]:
# DataFrame.drop returns a new frame and leaves the original untouched, so the
# result must be assigned back; otherwise the listed columns are still present
# when the TOF-1 arrays are built from `cleaned`.
# errors="ignore" keeps this cell safe to re-run after the columns are already gone.
cleaned = cleaned.drop(columns = col_to_delete, errors = "ignore")

In [None]:
# Split dataset into feature variables (X) and target variable (y):
# every column except the last is a feature, the last column is the label.
TOF1_X = cleaned.iloc[:, :-1].values
TOF1_y = cleaned.iloc[:, -1].values

# Map the class labels to integer codes.
labelencoder_y = LabelEncoder()
TOF1_y = labelencoder_y.fit_transform(TOF1_y)

# 80/20 split; a fixed random_state makes the split (and the exported files) repeatable.
TOF1_X_train, TOF1_X_test, TOF1_y_train, TOF1_y_test = train_test_split(
    TOF1_X, TOF1_y, test_size = 0.2, random_state = 42
)

# No scaler is applied in this cell: the TOF-1 features are exported unscaled.
np.save('TOF1_X_train', TOF1_X_train)
np.save('TOF1_X_test', TOF1_X_test)
# Save the labels produced by this same split call, so the feature files always
# have row-aligned labels on disk.
np.save('TOF1_y_train', TOF1_y_train)
np.save('TOF1_y_test', TOF1_y_test)

## Dealing with Unbalanced Datasets

### NS-SMOTE

In [None]:
# for full dataset
# define pipeline: SMOTE first oversamples the minority class up to a
# minority/majority ratio of 0.5, then the majority class is randomly
# undersampled until that ratio reaches 0.75.
# Both samplers are seeded so the resampled arrays are repeatable.
over = SMOTE(sampling_strategy=0.5, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.75, random_state=42)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset (training split only; the test split is not resampled)
SMOTE_X_train, SMOTE_y_train = pipeline.fit_resample(X_train, y_train)

# for TOF-1 dataset: fit_resample refits the same pipeline on the TOF-1 training split
SMOTE_TOF1_X_train, SMOTE_TOF1_y_train = pipeline.fit_resample(TOF1_X_train, TOF1_y_train)

In [None]:
# Export datasets
np.save('ns_SMOTE_X_train', SMOTE_X_train)
np.save('ns_SMOTE_y_train', SMOTE_y_train)

# The TOF-1 resampling was fitted on its own training split, so its labels are
# saved separately rather than assuming they match ns_SMOTE_y_train row for row.
np.save('ns_SMOTE_TOF1_X_train', SMOTE_TOF1_X_train)
np.save('ns_SMOTE_TOF1_y_train', SMOTE_TOF1_y_train)

### SS-SMOTE

In [None]:
# Append the training labels as one extra right-most column, so each row's label
# stays attached to its features.
labels_as_column = y_train[:, np.newaxis]
total = np.concatenate((X_train, labels_as_column), axis=1)

In [None]:
# Split column position 2 off from the rest; it becomes the class to balance on.
# NOTE(review): presumably position 2 is the Scholarship indicator (this section is
# SS-SMOTE and the exports are named scholarship_*) — confirm against the column order.
balance_column = 2
sc = total[:, balance_column]
X_train_del = np.delete(total, balance_column, axis=1)

In [None]:
# define pipeline: oversample the minority class of `sc` to a 1:1 ratio with SMOTE,
# then apply random undersampling with the same 1:1 target.
# Both samplers are seeded so the resampled arrays are repeatable.
over = SMOTE(sampling_strategy=1, random_state=42)
under = RandomUnderSampler(sampling_strategy=1, random_state=42)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset, balancing on `sc` rather than on the no-show label
# NOTE(review): X_train_del still contains the label as its last column, and SMOTE
# builds synthetic rows by interpolating feature values — confirm whether
# fractional labels in synthetic rows are acceptable downstream.
SMOTE_X_train_del, SMOTE_sc = pipeline.fit_resample(X_train_del, sc)

# Class counts of the balancing column after resampling.
counter = Counter(SMOTE_sc)
print(counter)

In [None]:
# put the resampled column back at position 2, the position it was removed from
SMOTE_dataset = np.insert(arr=SMOTE_X_train_del, obj=2, values=SMOTE_sc, axis=1)

In [None]:
# Separate the resampled array back into features (all but the last column)
# and target (the last column).
s_SMOTE_X_train, s_SMOTE_y_train = SMOTE_dataset[:, :-1], SMOTE_dataset[:, -1]

In [None]:
# Export datasets: each array is written to "<name>.npy" in the working directory.
for file_stem, array in (
    ('scholarship_SMOTE_X_train', s_SMOTE_X_train),
    ('scholarship_SMOTE_y_train', s_SMOTE_y_train),
):
    np.save(file_stem, array)