### Finding the Best Balancing Technique by Fitting a Classifier on the HealthCare Dataset

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the stroke dataset from the project-relative CSV and preview the first rows.
csv_path = 'datasets/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(csv_path)

df.head()

In [None]:
# Print column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Split off the target: pop removes 'stroke' from df in place and returns it as Y.
# Because df is mutated, re-running this cell without reloading df raises KeyError.
Y = df.pop('stroke')

In [None]:
# Scaler used below to rescale each numeric column (constructed with default settings).
from sklearn.preprocessing import MinMaxScaler
minmax_scaler = MinMaxScaler()

In [None]:
# Collect the names of every integer/float column; these are the columns to rescale.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

num_cal = df.select_dtypes(include=numerics).columns.tolist()
num_cal

In [None]:
# Add a "<name>_scaled" copy of every numeric column, min-max scaled one column at a time.
# NOTE(review): the scaler is fit on the full frame before the train/test split,
# so test-set ranges influence the scaling — confirm this is acceptable.
for col in num_cal:
    column_2d = df[col].to_numpy().reshape(-1, 1)  # scaler expects a 2-D (n_rows, 1) array
    df[col + "_scaled"] = minmax_scaler.fit_transform(column_2d)

In [None]:
# Preview the frame now that the "_scaled" columns have been added.
df.head()

In [None]:
# Remove the original (unscaled) numeric columns in place; only the scaled copies remain.
df.drop(
    columns=num_cal,
    inplace=True,
)

In [None]:
# Preview the frame after the unscaled numeric columns were dropped.
df.head()

In [None]:
# Names of the object-dtype columns, which are one-hot encoded afterwards.
cat_cols = list(df.select_dtypes(include="object"))
cat_cols

In [None]:
# One-hot encode the categorical columns.
# dtype is pinned to uint8 (the default before pandas 2.0) so the indicators stay
# numeric on every pandas version; boolean indicators mixed with float columns
# turn into an object array when the frame is converted with np.array().
df_cat = pd.get_dummies(df[cat_cols], dtype=np.uint8)

In [None]:
# Preview the one-hot encoded categorical columns.
df_cat.head()

In [None]:
# Recompute the numeric column list: df now holds the "_scaled" columns instead of the originals.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

numeric_frame = df.select_dtypes(include=numerics)
num_cal = list(numeric_frame.columns)
num_cal

In [None]:
# Keep only the numeric (scaled) columns as a separate frame.
df_num = df.loc[:, num_cal]

In [None]:
# Assemble the feature matrix: encoded categoricals side by side with the scaled numerics.
feature_parts = [df_cat, df_num]
X = pd.concat(feature_parts, axis="columns")
print(X.shape)

In [None]:
# Show the columns currently held in df (before one-hot encoding).
df.columns

In [None]:
# Show the shape of the target Series; its length can be compared with X.shape printed above.
Y.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on Y — with a rare positive class,
# consider stratify=Y so both partitions keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=123,
)

In [None]:
# Baseline: logistic regression with default settings, fit on the imbalanced training split.
model = LogisticRegression().fit(X_train, y_train)
model

In [None]:
# Predict the held-out rows and report the baseline accuracy.
pred = model.predict(X_test)

test_accuracy = model.score(X_test, y_test)
print("Accuracy of Logistic regression model test set: {:.2f}".format(test_accuracy))

In [None]:
# Confusion matrix and per-class precision/recall/F1 for the baseline model.
from sklearn.metrics import classification_report, confusion_matrix

confusionMatrix = confusion_matrix(y_test, pred)
print(confusionMatrix)

baseline_report = classification_report(y_test, pred)
print(baseline_report)

In [None]:
# Count the training rows per class to show how imbalanced the target is.
y_train.value_counts()

In [None]:
# Share of each class in the training target, as a percentage of all training rows.
# stroke == 1 is the positive class and stroke == 0 the negative class; the
# labels previously printed with these two figures were swapped.
n_train_rows = len(y_train)
print('Percentage of positive class :',
      (y_train[y_train == 1].value_counts() / n_train_rows) * 100)
print('Percentage of negative class :',
      (y_train[y_train == 0].value_counts() / n_train_rows) * 100)

### Implementing Random Undersampling and Classification on Our Healthcare Dataset to Find the Optimal Result

In [None]:
"""
Let us first join X_train and y_train into a single frame for ease of operation
"""
# Features and target side by side, so rows can be filtered by class in one frame.
trainData = pd.concat((X_train, y_train), axis="columns")

In [None]:
# Preview the combined training frame (features plus the 'stroke' target).
trainData.head()

In [None]:
# Index labels of the minority-class (stroke == 1) training rows, and how many there are.
is_minority = (trainData['stroke'] == 1).to_numpy()
ind = trainData.index[is_minority]
print(len(ind))

In [None]:
# All minority-class rows, selected by the index labels gathered above.
minData = trainData.loc[ind, :]
print(minData.shape)

In [None]:
# Index labels of the majority-class (stroke == 0) training rows, and how many there are.
is_majority = (trainData['stroke'] == 0).to_numpy()
ind1 = trainData.index[is_majority]
print(len(ind1))

In [None]:
# All majority-class rows, selected by their index labels.
majData = trainData.loc[ind1, :]
print(majData.shape)
majData.head()

In [None]:
# Random undersampling: draw as many majority rows as there are minority rows.
# The fixed seed keeps the drawn subset identical between runs.
n_minority = len(ind)
majSample = majData.sample(n=n_minority, random_state=123)

In [None]:
# Print the shape of the undersampled majority subset and preview its first rows.
print(majSample.shape)
majSample.head()

In [None]:
"""
Concatenating both data sets and then shuffling the data set
"""
# Stack the minority rows on top of the sampled majority rows (row-wise concatenation).
balanced_parts = [minData, majSample]
balData = pd.concat(balanced_parts)

In [None]:
# Shuffling the data set
# The rows are currently grouped by class (minority first), so mix them.
# random_state is fixed so the row order, and the preview below, are the same
# on every run, matching the seed used for the split and the sampling.
from sklearn.utils import shuffle
balData = shuffle(balData, random_state=123)
balData.head()

In [None]:
# Making the new X_train and y_train
# Select the features by name instead of a hard-coded column count (0:21), so the
# result is exactly "every column except the target" however many encoded and
# scaled columns the frame holds, and the columns match those of X_test.
X_trainNew = balData.drop(columns='stroke')
print(X_trainNew.head())
y_trainNew = balData['stroke']
print(y_trainNew.head())

In [None]:
# Logistic regression (default settings) trained on the undersampled, balanced data.
from sklearn.linear_model import LogisticRegression

model1 = LogisticRegression().fit(X_trainNew, y_trainNew)
model1

In [None]:
# Score the undersampling model on the original (still imbalanced) test split.
pred = model1.predict(X_test)
balanced_accuracy = model1.score(X_test, y_test)
print(
    'Accuracy of Logistic regression model prediction on test set '
    'for balanced data set: {:.2f}'.format(balanced_accuracy)
)

In [None]:
# Confusion matrix and classification report for the undersampling model.
from sklearn.metrics import classification_report, confusion_matrix

confusionMatrix = confusion_matrix(y_test, pred)
print(confusionMatrix)

undersampled_report = classification_report(y_test, pred)
print(undersampled_report)

In [None]:
!pip install smote-variants --user

### Implementing SMOTE on Our HealthCare Dataset to Find the Optimal Result

In [None]:
# Class counts in the training target before any synthetic rows are added.
n_yes_before = (y_train == 1).sum()
n_no_before = (y_train == 0).sum()
print("Before OverSampling count of yes: {}".format(n_yes_before))
print("Before OverSampling count of no: {} \n".format(n_no_before))

In [None]:
#!pip install smote-variants
import smote_variants as sv
import numpy as np

In [None]:
# Instantiating the SMOTE class
# random_state is fixed so the synthetic minority samples, and therefore the
# reported metrics, are reproducible; it reuses the seed of the train/test split.
oversampler = sv.SMOTE(random_state=123)

In [None]:
# Build the oversampled training set; the sampler is given plain NumPy arrays.
train_features = X_train.to_numpy()
train_labels = y_train.to_numpy()
X_train_os, y_train_os = oversampler.sample(train_features, train_labels)

In [None]:
# Shapes and per-class counts of the training set after SMOTE.
n_yes_after = (y_train_os == 1).sum()
n_no_after = (y_train_os == 0).sum()
print('After OverSampling, the shape of train_X: {}'.format(X_train_os.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_os.shape))
print("After OverSampling, counts of label 'Yes': {}".format(n_yes_after))
print("After OverSampling, counts of label 'no': {}".format(n_no_after))

In [None]:
# Logistic regression (default settings) trained on the SMOTE-oversampled data.
from sklearn.linear_model import LogisticRegression

model2 = LogisticRegression().fit(X_train_os, y_train_os)
model2

In [None]:
# Predict the untouched test split with the SMOTE-trained model (overwrites the earlier pred).
pred = model2.predict(X_test)

In [None]:
# Accuracy of the SMOTE-trained model on the original test split.
smote_accuracy = model2.score(X_test, y_test)
print(
    'Accuracy of Logistic regression model prediction on '
    'test set for Smote balanced data set: {:.2f}'.format(smote_accuracy)
)

In [None]:
# Confusion matrix for the SMOTE-trained model.
from sklearn.metrics import confusion_matrix

confusionMatrix = confusion_matrix(y_true=y_test, y_pred=pred)
print(confusionMatrix)

In [None]:
# Per-class precision, recall and F1 for the SMOTE-trained model.
from sklearn.metrics import classification_report

smote_report = classification_report(y_test, pred)
print(smote_report)

### Implementing MSMOTE on Our HealthCare Dataset to Find the Optimal Result

In [None]:
# Class counts of the original training target, shown again as the MSMOTE starting point.
positive_count = (y_train == 1).sum()
negative_count = (y_train == 0).sum()
print("Before OverSampling count of yes: {}".format(positive_count))
print("Before OverSampling count of no: {} \n".format(negative_count))

In [None]:
# Instantiating the MSMOTE class
# random_state is fixed so the generated samples, and therefore the reported
# metrics, are reproducible; it reuses the seed of the train/test split.
oversampler = sv.MSMOTE(random_state=123)

In [None]:
# Rebuild the oversampled training set with MSMOTE (replaces the SMOTE arrays).
features_array = X_train.to_numpy()
labels_array = y_train.to_numpy()
X_train_os, y_train_os = oversampler.sample(features_array, labels_array)

In [None]:
# Shapes and per-class counts of the training set after MSMOTE.
yes_total = (y_train_os == 1).sum()
no_total = (y_train_os == 0).sum()
print('After OverSampling, the shape of train_X: {}'.format(X_train_os.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_os.shape))
print("After OverSampling, counts of label 'Yes': {}".format(yes_total))
print("After OverSampling, counts of label 'no': {}".format(no_total))

In [None]:
# Logistic regression (default settings) trained on the MSMOTE-oversampled data.
from sklearn.linear_model import LogisticRegression

model3 = LogisticRegression().fit(X_train_os, y_train_os)
model3

In [None]:
# Predict the untouched test split with the MSMOTE-trained model (overwrites the earlier pred).
pred = model3.predict(X_test)

In [None]:
# Accuracy of the MSMOTE-trained model on the original test split.
msmote_accuracy = model3.score(X_test, y_test)
print(
    "Accuracy of Logistic Regression on test set for MSMOTE balance dataset: {:.2f}".format(
        msmote_accuracy
    )
)

In [None]:
# Confusion matrix for the MSMOTE-trained model.
from sklearn.metrics import confusion_matrix

confusionMatrix = confusion_matrix(y_true=y_test, y_pred=pred)
print(confusionMatrix)

In [None]:
# Per-class precision, recall and F1 for the MSMOTE-trained model.
from sklearn.metrics import classification_report

msmote_report = classification_report(y_test, pred)
print(msmote_report)