# Install Required Packages via PiP

In [None]:
import sys
!{sys.executable} -m pip install sagemaker pandas numpy imbalanced-learn --upgrade

# Set up SageMaker Environment

In [None]:

import sagemaker

# SageMaker session object for this notebook
sess = sagemaker.Session()

# S3 bucket that holds the dataset -- replace with the bucket where your data is located.
bucket = "rzoghbi-medium-smote-article-dataset"
# Key prefix inside the bucket; empty string means the bucket root.
# It is used below to list the bucket contents and to upload transformed.csv.
subfolder = ""



# Define IAM role
import boto3  # AWS Python SDK
from sagemaker import get_execution_role  # Role defined when you create your notebook instance
import os

role = get_execution_role()

# Note: The execution role is only available when running a notebook within SageMaker.
# If you run get_execution_role in a notebook not on SageMaker, expect a region error.

# Show the resolved role so it can be checked visually
print(role)

# Import Required packages

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
import statistics as sts


pd.set_option('display.max_columns',None) 


### Test connection with S3 Bucket, if properly setup, you should see contents of the bucket

In [None]:
# List the object keys stored under `subfolder` to verify that the notebook
# can reach the S3 bucket.
conn = boto3.client('s3')
response = conn.list_objects(Bucket=bucket, Prefix=subfolder)

# The response has no 'Contents' entry when no object matches the prefix, so
# fall back to an empty list instead of failing with a KeyError.
contents = response.get('Contents', [])
if not contents:
    print("No objects found in bucket '{}' with prefix '{}'".format(bucket, subfolder))
for f in contents:
    print(f['Key'])

# Data Pre-Processing

## Import Dataset

Raw dataset is located on the root of S3 Bucket, we'll import it as a Pandas Dataframe and transform it into a ML suited dataframe

In [None]:
# Build the S3 URI of the raw CSV (stored at the bucket root) and load it
# into a pandas DataFrame.
data_key = 'dataset.csv'
data_location = f's3://{bucket}/{data_key}'

df = pd.read_csv(data_location, sep=',')

# Preview the first rows
df.head()

In [None]:
df.shape

### Check for Null/Empty values

In [None]:
df.isnull().any()

Only "BMI" and "smoking_status" have Null Values, we will deal with them later

Let's first encode the dataset

### Drop Irrelevant columns

In [None]:
# 'id' is only a row identifier, so it is removed before modelling.
df.drop(columns=['id'], inplace=True)

### Encode Features

In [None]:
# Integer code assigned to each text category, one mapping per column.
category_codes = [
    ('gender', {'Female':0, 'Male':1, 'Other':2}),
    ('ever_married', {'No':0, 'Yes':1}),
    ('Residence_type', {'Urban':0, 'Rural':1}),
    ('work_type', {'Private':0, 'Self-employed':1, 'children':2, 'Govt_job':3, 'Never_worked':4}),
    ('smoking_status', {'never smoked':0, 'formerly smoked':1, 'smokes':2}),
]

# Apply the mappings one column at a time, in place.
for column_name, codes in category_codes:
    df.replace({column_name: codes}, inplace=True)

#### Encode age in bands

In [None]:
# Truncate ages to whole years, then bucket them into five bands.
# Intervals are right-inclusive:
#   0: <= 16 | 1: 17-32 | 2: 33-48 | 3: 49-64 | 4: > 64
age_years = df.age.astype(int)
age_band_edges = [-np.inf, 16, 32, 48, 64, np.inf]

# labels=False returns the integer position of the band instead of an interval
df.age = pd.cut(age_years, bins=age_band_edges, labels=False)

#### Review encoded dataset

In [None]:
df.head()

### Dealing with null values

In [None]:
df.isnull().any()

In [None]:
df.isnull().sum(axis = 0)

"BMI" has 1462 null values --- 
"smoking_status" has 13292 null values

#### To fill the missing body mass index (BMI) values we assume a roughly normal distribution. We compute the mean and the standard deviation of the known BMI values.
#### Then, we fill NaN with a random value between (mean - standard deviation) and (mean + standard deviation).
#### Assuming a normal distribution, the majority of the data (about 68%) should fall in this range.

In [None]:
import statistics as sts
from random import randrange, uniform

# Known (non-null) BMI values, used to estimate the distribution
bmi_raw = df.bmi.dropna()

# Mean value
raw_bmi_avg = sts.mean(bmi_raw)
# Sample standard deviation. This must be sts.stdev: using the mean as the
# spread would make the range (0, 2 * mean) instead of mean +/- one deviation.
raw_bmi_stdev = sts.stdev(bmi_raw)
# Lowest possible value for an imputed BMI
lower_treshold = raw_bmi_avg - raw_bmi_stdev
# Highest possible value for an imputed BMI
upper_treshold = raw_bmi_avg + raw_bmi_stdev

# Draw an independent random value for every missing row. Passing a single
# uniform(...) result to fillna would give all missing rows the same BMI.
bmi_missing = df.bmi.isna()
df.loc[bmi_missing, 'bmi'] = [
    uniform(lower_treshold, upper_treshold) for _ in range(int(bmi_missing.sum()))
]

In [None]:
df.bmi.isna().value_counts()

#### To fill NaN in smoking_status, we will use a randomly drawn category code: 0, 1 or 2 (`randrange(0, 3)` never returns 3)
#### This is not a best practice, but it is acceptable for this demonstration. Do not use it for a real-world problem.

In [None]:
# Draw an independent category code (0, 1 or 2) for every missing row.
# A single randrange(...) result passed to fillna would assign the same
# category to all missing rows.
# `randrange` is imported in the BMI imputation cell above.
smoking_missing = df.smoking_status.isna()
df.loc[smoking_missing, 'smoking_status'] = [
    randrange(0, 3) for _ in range(int(smoking_missing.sum()))
]

In [None]:
df.isnull().any()

#### As you can see, there are no more Null Values in the dataset

#### Let's take a glance at how the dataset is transformed

In [None]:
df.head()

In [None]:
df.describe()

#### Let's apply standardization to our numerical features 'avg_glucose_level' and 'bmi'

In [None]:
toscale = df[['avg_glucose_level' , 'bmi']]

In [None]:
data = toscale.values

In [None]:
from sklearn.preprocessing import StandardScaler
trans = StandardScaler()

In [None]:
standarized = trans.fit_transform(data)

In [None]:
from pandas import DataFrame
scaled = DataFrame(standarized)

In [None]:
scaled.columns = ['avg_glucose_level' , 'bmi']

In [None]:
scaled.head()

In [None]:
df.avg_glucose_level = scaled.avg_glucose_level

In [None]:
df.bmi = scaled.bmi

### Exporting transformed dataset

In [None]:
df

In [None]:
df.head()

#### Now that we have transformed the dataset, we want to save it to a file and export it to S3, so that we do not have to repeat all the pre-processing if we start a new session

In [None]:
# We will save this pre-processed dataset. Take a real copy (not an alias of
# `df`) so that later changes to `transformed_df` cannot modify `df`.
transformed_df = df.copy()

#### We will use XGboost, so we need an extra step: To place the label on the first column

In [None]:
# XGBoost requires the label in the first column: move the last column to the
# front and keep the remaining columns in their current order.
*leading_cols, last_col = transformed_df.columns
cols = [last_col, *leading_cols]
transformed_df = transformed_df[cols]

In [None]:
# Review final state
transformed_df.head()

In [None]:
transformed_df.describe()

In [None]:
# Save to CSV the pre processed dataset
transformed_df.to_csv("transformed.csv", header=True, index=False)

In [None]:
# Upload the local transformed.csv to the bucket, under the `subfolder` prefix.
transformed_key = os.path.join(subfolder, "transformed.csv")
s3_bucket = boto3.Session().resource("s3").Bucket(bucket)
s3_bucket.Object(transformed_key).upload_file("transformed.csv")

### Review label class distribution

In [None]:
transformed_df.stroke.value_counts()

#### There are almost 800 positive cases per over 42 thousand negatives. This is a severe skew

### Let's graph and see

In [None]:
transformed_df.stroke.value_counts().plot(kind="bar")

## As a last step in this notebook, we will split our dataset and resample to overcome the imbalance

### We will prepare four pairs of train-validation datasets and one pair for test 

#### Our four pairs of train-validation datasets are:
##### train-validation (imbalanced, original dataset)
##### train-validation SMOTE 
##### train-validation KMeans SMOTE
##### train-validation SVM-SMOTE

##### Test dataset will remain untouched as it will be the same for the evaluation of the training procedures

###   Remember!!   Resampling should be done on the training dataset only, so we will split our dataset and then perform our imbalance remediation

#### First Split: We will split the dataset into a train-validation dataset and a test dataset. Test dataset will remain unchanged, while train-validation will be further split into separate train and validation datasets. First split: Train-Validation 80% - Test 20%

In [None]:
# Train-test split for the initial training. It must be done before resampling.
# 20% of the rows are held out as the test set.
# Stratify on the label: positives are rare, so a purely random split could
# leave the two partitions with noticeably different stroke rates.
train_validation, test = train_test_split(
    transformed_df,
    test_size=0.2,
    random_state=42,
    stratify=transformed_df['stroke'],
)

#### Second split: We split into train and validation because XGBoost requires separate datasets for train and validate
#### Second split: train 70% validation 30%

In [None]:
# Train-validation split before SMOTE: 70% train, 30% validation.
# Stratify on the label so both parts keep the same share of the rare
# positive class.
train, validation = train_test_split(
    train_validation,
    test_size=0.3,
    random_state=42,
    stratify=train_validation['stroke'],
)

#### Train, Validation and Test are saved in CSV and uploaded to S3 to be called by our training object.
#### Also, train-validation is saved into CSV and uploaded for future references

In [None]:
# Write every split to a local CSV without header row or index column.
splits_to_save = [
    ("train.csv", train),
    ("validation.csv", validation),
    ("test.csv", test),
    ("train_validation.csv", train_validation),
]
for csv_name, split_df in splits_to_save:
    split_df.to_csv(csv_name, header=False, index=False)

### We upload our split datasets to S3, as our model will reference them later for training

In [None]:
# Upload each split to its own prefix in the bucket: <name>/<name>.csv
s3_bucket = boto3.Session().resource("s3").Bucket(bucket)
for split_name in ("train", "validation", "test", "train_validation"):
    csv_name = f"{split_name}.csv"
    s3_bucket.Object(os.path.join(split_name, csv_name)).upload_file(csv_name)

### Also, we are keeping a second copy of the test dataset, without labels, to make future batch_inferences.

In [None]:
test.head(5)

In [None]:
test_batch = test.drop(['stroke'], axis = 1)

In [None]:
# Save the label-free test set locally, then upload it to test_batch/test_batch.csv
batch_csv = "test_batch.csv"
test_batch.to_csv(batch_csv, header=False, index=False)

batch_key = os.path.join("test_batch", batch_csv)
boto3.Session().resource("s3").Bucket(bucket).Object(batch_key).upload_file(batch_csv)

### Now we have our train-validation dataset without resampling. We also already have our test dataset.

## SMOTE Resampling

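### The SMOTE library requires features and labels as separate objects, so we take the train_validation dataset produced in the previous step, split it into features and labels, and resample it. Note that each resampled dataset is split into train and validation only after resampling, so the validation sets below also contain synthetic samples; only the test set stays free of them.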

#### Retrieve train_validation dataset from S3

In [None]:
# Read back the header-less train_validation CSV that was uploaded to S3.
# Note: `subfolder` now points at the train_validation prefix.
data_key = 'train_validation.csv'
subfolder = "train_validation"
data_location = f's3://{bucket}/{subfolder}/{data_key}'

df2 = pd.read_csv(data_location, sep=',', header=None)

Review dataset

In [None]:
df2.head()

In [None]:
# The CSV was written without a header, so restore the column names:
# the label first, followed by the ten features.
feature_names = [
    'gender',
    'age',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'Residence_type',
    'avg_glucose_level',
    'bmi',
    'smoking_status',
]
df2.columns = ['stroke', *feature_names]

#### Split into data and labels


In [None]:
df2.stroke.value_counts()

In [None]:
# Column 0 holds the label; the remaining ten columns (positions 1-10) are the features.
y_train = df2.iloc[:, 0]
X_train = df2.iloc[:, 1:]

#### Resample dataset using default smote. Input: X_train (data), y_train (labels). Output : X_train_smote (resampled data), y_train_smote (labels)

In [None]:
positive_oversampled = 17000

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample only class 1 (the positive label) up to `positive_oversampled` rows.
smote_targets = {1: positive_oversampled}
sm = SMOTE(random_state=42, sampling_strategy=smote_targets)

# Fit on the features/labels and return the resampled pair
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)

#### Reassemble dataset, as XGBoost requires labels in first column

In [None]:
# Build XY_train_smote, the dataset where features and labels are merged.
# Work on a copy: a plain assignment would only alias X_train_smote, and the
# feature frame would silently gain the 'stroke' column.
XY_train_smote = X_train_smote.copy()
XY_train_smote['stroke'] = y_train_smote

# Put labels on the first column (the label was appended last)
cols = list(XY_train_smote.columns)
cols = [cols[-1]] + cols[:-1]
XY_train_smote = XY_train_smote[cols]

#### How are class distributed after resample:


In [None]:
XY_train_smote.stroke.value_counts()

In [None]:
XY_train_smote.stroke.value_counts().plot(kind="bar")

#### The minority class is oversampled up to the `positive_oversampled` target (17,000 rows), which does not necessarily equal the size of the majority class


### Train-Validation Split: XGBoost requires separate datasets for train and validation.

In [None]:
train_smote, validation_smote = train_test_split(XY_train_smote, test_size = 0.3, random_state =42 )

#### We will upload the Datasets to S3 to make them available for our Training Instance


In [None]:
# Write both SMOTE splits to header-less local CSVs first...
smote_splits = [("train_smote", train_smote), ("validation_smote", validation_smote)]
for split_name, split_df in smote_splits:
    split_df.to_csv(f"{split_name}.csv", header=False, index=False)

# ...then upload each file to <name>/<name>.csv in the bucket.
for split_name, _ in smote_splits:
    csv_name = f"{split_name}.csv"
    boto3.Session().resource("s3").Bucket(bucket).Object(
        os.path.join(split_name, csv_name)
    ).upload_file(csv_name)

## KMeans SMOTE Resampling

In [None]:
from imblearn.over_sampling import KMeansSMOTE

# Same oversampling target as the other resamplers; the cluster balance
# threshold is set to 0.05 directly in the constructor.
sm = KMeansSMOTE(
    sampling_strategy={1: positive_oversampled},
    cluster_balance_threshold=0.05,
    random_state=42,
)

In [None]:
X_train_KMSmote, y_train_KMSmote = sm.fit_resample(X_train, y_train)

In [None]:
# Build XY_train_KMSmote, the dataset where features and labels are merged.
# Work on a copy: a plain assignment would only alias X_train_KMSmote, and the
# feature frame would silently gain the 'stroke' column.
XY_train_KMSmote = X_train_KMSmote.copy()
XY_train_KMSmote['stroke'] = y_train_KMSmote

# Put labels on the first column (the label was appended last)
cols = list(XY_train_KMSmote.columns)
cols = [cols[-1]] + cols[:-1]
XY_train_KMSmote = XY_train_KMSmote[cols]

#### How are class distributed after resample:

In [None]:
XY_train_KMSmote.stroke.value_counts()

In [None]:
XY_train_KMSmote.stroke.value_counts().plot(kind="bar")

#### The minority class is oversampled up to the `positive_oversampled` target (17,000 rows), which does not necessarily equal the size of the majority class



### Train-Validation Split: XGBoost requires separate datasets for train and validation.

In [None]:
train_KMSmote, validation_KMSmote = train_test_split(XY_train_KMSmote, test_size = 0.3, random_state =42 )

#### We will upload the Datasets to S3 to make them available for our Training Instance


In [None]:
# Write both KMeans-SMOTE splits to header-less local CSVs first...
kmsmote_splits = [("train_KMSmote", train_KMSmote), ("validation_KMSmote", validation_KMSmote)]
for split_name, split_df in kmsmote_splits:
    split_df.to_csv(f"{split_name}.csv", header=False, index=False)

# ...then upload each file to <name>/<name>.csv in the bucket.
for split_name, _ in kmsmote_splits:
    csv_name = f"{split_name}.csv"
    boto3.Session().resource("s3").Bucket(bucket).Object(
        os.path.join(split_name, csv_name)
    ).upload_file(csv_name)

## SVM SMOTE Resampling

In [None]:
from imblearn.over_sampling import SVMSMOTE

# SVM-SMOTE configuration, collected in one place before building the sampler.
svm_smote_params = dict(
    sampling_strategy={1: positive_oversampled},
    random_state=42,
    k_neighbors=5,
    m_neighbors=10,
    out_step=0.5,
)
sm = SVMSMOTE(**svm_smote_params)

In [None]:
X_train_svm, y_train_svm = sm.fit_resample(X_train, y_train)

#### Reassemble dataset, as XGBoost requires labels in first column

In [None]:
# Build XY_train_svm, the dataset where features and labels are merged.
# Work on a copy: a plain assignment would only alias X_train_svm, and the
# feature frame would silently gain the 'stroke' column.
XY_train_svm = X_train_svm.copy()
XY_train_svm['stroke'] = y_train_svm

# Put labels on the first column (the label was appended last)
cols = list(XY_train_svm.columns)
cols = [cols[-1]] + cols[:-1]
XY_train_svm = XY_train_svm[cols]

#### How are class distributed after resample:

In [None]:
XY_train_svm.stroke.value_counts()

In [None]:
XY_train_svm.stroke.value_counts().plot(kind="bar")

##### For SVM SMOTE, minority class is oversampled, however not equalized to majority class

### Train-Validation Split: XGBoost requires separate datasets for train and validation.

In [None]:
train_svm, validation_svm = train_test_split(XY_train_svm, test_size = 0.3, random_state =42 )

#### We will upload the Datasets to S3 to make them available for our Training Instance



In [None]:
# Write both SVM-SMOTE splits to header-less local CSVs first...
svm_splits = [("train_svm", train_svm), ("validation_svm", validation_svm)]
for split_name, split_df in svm_splits:
    split_df.to_csv(f"{split_name}.csv", header=False, index=False)

# ...then upload each file to <name>/<name>.csv in the bucket.
for split_name, _ in svm_splits:
    csv_name = f"{split_name}.csv"
    boto3.Session().resource("s3").Bucket(bucket).Object(
        os.path.join(split_name, csv_name)
    ).upload_file(csv_name)

## SMOTE ENN Resampling

#### SMOTE ENN implements a combination of oversampling minority class and undersampling majority class

In [None]:
from imblearn.combine import SMOTEENN

# Combined over-/under-sampling with the same positive-class target as the
# other resamplers (this sampler uses random_state=0).
smote_enn = SMOTEENN(
    sampling_strategy={1: positive_oversampled},
    random_state=0,
)
resampled_pair = smote_enn.fit_resample(X_train, y_train)
X_train_smteenn, y_train_smteenn = resampled_pair

In [None]:
y_train_smteenn.value_counts()

In [None]:
# Build XY_train_SmoteENN, the dataset where features and labels are merged.
# Work on a copy: a plain assignment would only alias X_train_smteenn, and the
# feature frame would silently gain the 'stroke' column.
XY_train_SmoteENN = X_train_smteenn.copy()
XY_train_SmoteENN['stroke'] = y_train_smteenn

# Put labels on the first column (the label was appended last)
cols = list(XY_train_SmoteENN.columns)
cols = [cols[-1]] + cols[:-1]
XY_train_SmoteENN = XY_train_SmoteENN[cols]

In [None]:
XY_train_SmoteENN.stroke.value_counts()

In [None]:
XY_train_SmoteENN.stroke.value_counts().plot(kind="bar")

In [None]:
train_SmoteENN, validation_SmoteENN = train_test_split(XY_train_SmoteENN, test_size = 0.3, random_state =42 )

In [None]:
# Write both SMOTE-ENN splits to header-less local CSVs first...
smote_enn_splits = [("train_SmoteENN", train_SmoteENN), ("validation_SmoteENN", validation_SmoteENN)]
for split_name, split_df in smote_enn_splits:
    split_df.to_csv(f"{split_name}.csv", header=False, index=False)

# ...then upload each file to <name>/<name>.csv in the bucket.
for split_name, _ in smote_enn_splits:
    csv_name = f"{split_name}.csv"
    boto3.Session().resource("s3").Bucket(bucket).Object(
        os.path.join(split_name, csv_name)
    ).upload_file(csv_name)

## Now that our dataset is processed and split, we will go ahead and train our model and evaluate the effects of the oversampling techniques applied in the previous sections

### Please refer to ACV Model Training to proceed