## Synthetic Minority Oversampling Technique (SMOTE)
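This script demonstrates how to transform an imbalanced dataset (used for binary classification) into a balanced one by oversampling the minority class with SMOTE, which creates new synthetic minority-class samples by interpolating between existing minority samples and their nearest neighbours rather than simply duplicating rows.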

Reference: 
https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/

In [71]:
import pandas as pd
import numpy as np
import imblearn
from collections import Counter
from matplotlib import pyplot
from numpy import where
from imblearn.over_sampling import SMOTE

In [72]:
# Load the combined feature table and discard its first column
# (presumably a saved row index - confirm against the CSV).
df = pd.read_csv("PD_median_std_all_datasets.csv")
df = df.drop(columns=df.columns[[0]])

# Separate the class label from the predictors.
y = df['Status']  # labels
x = df.drop(columns='Status')  # features

In [73]:
# Display the number of rows per Status label before resampling. The
# recorded output shows label 1 outnumbering label 0 by roughly 19:1
# (6706 vs 360), i.e. the data is highly imbalanced.
# Presumably 1 = PD and 0 = control - confirm against the dataset codebook.
counter = Counter(y)
print(counter)

Counter({1: 6706, 0: 360})


In [74]:
# Balance the classes with SMOTE: synthetic minority-class rows are generated
# until both labels have the same number of samples.
# A fixed random_state makes the synthetic rows - and therefore the CSV
# written at the end of this script - reproducible from run to run.
# NOTE(review): if the balanced file is later split into train/test sets,
# synthetic rows derived from test-set neighbours can leak into training;
# consider resampling only the training portion - confirm intended usage.
oversample = SMOTE(random_state=42)
x_oversampled, y_oversampled = oversample.fit_resample(x, y)



In [75]:
# Display the number of rows per Status label after SMOTE. The recorded
# output shows both labels at 6706 rows, i.e. the minority label has been
# raised to match the majority label.
# Presumably 1 = PD and 0 = control - confirm against the dataset codebook.
counter = Counter(y_oversampled)
print(counter)

Counter({1: 6706, 0: 6706})


In [77]:
# Reassemble a single table from the resampled data: the label goes in the
# first column, followed by the feature columns in their original order.
labels_frame = pd.DataFrame(y_oversampled)
features_frame = pd.DataFrame(x_oversampled)
df = pd.concat([labels_frame, features_frame], axis=1)

# Restore the column names explicitly so the result is named the same way
# whether the resampler returned pandas objects or plain arrays.
df.columns = ['Status', *x.columns.to_list()]

In [78]:
# Persist the balanced dataset. The DataFrame index is written as well
# (pandas default), so the file's first column is an unnamed row index.
df.to_csv(path_or_buf="PD_median_std_all_datasets_SMOTE.csv")