# Reference Guide for implementing Oversampling Techniques

The following notebook provides a guide for implementing various SMOTE variants.
It also references methods for creating artificial datasets that may be useful for demonstration purposes.

### Install Packages and Import Libraries

In [None]:
import sys
!pip install -U imbalanced-learn

In [None]:
# Third-party imports used throughout the notebook.
# `plt` is bound to the pyplot API (the conventional alias), not to the
# top-level matplotlib package.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from imblearn.datasets import make_imbalance
from sklearn.datasets import make_blobs
from sklearn.datasets import make_moons

# seaborn's "poster" context is its largest preset scaling of plot elements.
sns.set_context("poster")

### Generation of artificial datasets

#### We will use scikit-learn's `make_moons` function to create an artificial dataset.

We can adjust the `make_moons` parameters to create a tailor-made dataset.

In [None]:
from sklearn.datasets import make_moons

# Generate two interleaving half circles: a two-column feature array and a
# binary label array. `noise` is the standard deviation of the Gaussian noise
# added to the points, and `random_state` makes the draw reproducible.
X, y = make_moons(n_samples=5000, shuffle=True, noise=0.5, random_state=42)

# Wrap the NumPy arrays in pandas objects for easier handling. The labels are
# kept as a 1-D Series: a single-column DataFrame is a column vector, whereas
# scikit-learn / imbalanced-learn estimators expect a 1-D target.
X = pd.DataFrame(data=X, columns=["feature1", "feature2"])
y = pd.Series(data=y, name="target")

Use a plot to have a better view

In [None]:
# Scatter the two features against each other, colouring each point by its class label.
ax = X.plot(
    kind="scatter",
    x="feature1",
    y="feature2",
    c=y,
    colormap="viridis",
    colorbar=False,
)
# Remove the top/right spines and move the remaining ones 10 points outwards.
sns.despine(ax=ax, offset=10)


#### Now, use `make_imbalance` to create an imbalanced dataset. We use a roughly 90-10 distribution (2,500 majority vs. 250 minority samples)

In [None]:
# Under-sample class 1 to 250 samples while keeping 2500 samples of class 0.
# Extra keyword arguments are only forwarded to a *callable* sampling_strategy,
# so none are passed alongside the dict. `random_state` fixes which samples are
# kept, making the notebook reproducible across runs.
X_resampled, y_resampled = make_imbalance(
    X,
    y,
    sampling_strategy={0: 2500, 1: 250},
    random_state=42,
)

#### Take a look at the new imbalanced label series

In [None]:
# Tally how many samples remain in each class after under-sampling.
class_counts = y_resampled.value_counts()
class_counts

#### Let's graph again

In [None]:
# Same scatter as before, now on the imbalanced data; colour encodes the class.
ax = X_resampled.plot(
    kind="scatter",
    c=y_resampled,
    x="feature1",
    y="feature2",
    colorbar=False,
    colormap="viridis",
)
sns.despine(offset=10, ax=ax)

#### Split into train-test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the data for testing. Stratifying on the labels keeps the
# minority-class proportion the same in both splits, which a purely random
# split does not guarantee on imbalanced data.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.3,
    random_state=42,
    stratify=y_resampled,
)

In [None]:
# Report the training split: non-null rows per feature, then rows per class.
train_rows_per_feature = X_train.count()

print("We used a 70-30 distribution for train and test")
print(train_rows_per_feature)
print("For the labels:")
y_train.value_counts()

#### SMOTE Implementation

In [None]:
from imblearn.over_sampling import SMOTE

# "not majority" asks SMOTE to resample every class except the majority one.
sm = SMOTE(
    random_state=42,
    sampling_strategy="not majority",
)

# Oversample the training split only; the test split is left untouched.
X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)

#### New target variable distribution

In [None]:
# Summarise the oversampled training set: rows per feature, then rows per class.
oversampled_rows = X_train_oversampled.count()
oversampled_class_counts = y_train_oversampled.value_counts()

print("New oversampled Dataset")
print(oversampled_rows)

print("Class evenly balanced")
print(oversampled_class_counts)

#### Visual representation of oversampled dataset

In [None]:
print("Let's graph again")
# Scatter of the SMOTE-oversampled training data, coloured by class label.
ax = X_train_oversampled.plot(
    kind="scatter",
    x="feature1",
    y="feature2",
    c=y_train_oversampled,
    colormap="viridis",
    colorbar=False,
)
sns.despine(offset=10, ax=ax)

### Let's create a second dataset for KMeans and SVM SMOTE

#### This time we will use the `make_blobs` function

In [None]:
# Draw 500 points from two isotropic Gaussian blobs whose centres are sampled
# inside center_box; the wide spread (cluster_std=3) lets the classes overlap.
X, y = make_blobs(
    n_samples=500,
    centers=2,
    n_features=2,
    cluster_std=3,
    center_box=(-7.0, 7.0),
    random_state=42,
)

X = pd.DataFrame(data=X, columns=["feature1", "feature2"])
# Keep the labels as a 1-D Series: a single-column DataFrame is a column
# vector, whereas scikit-learn / imbalanced-learn estimators expect a 1-D target.
y = pd.Series(data=y, name="target")

# Scatter the two features, colouring each point by its blob label.
ax = X.plot.scatter(
    x="feature1",
    y="feature2",
    c=y,
    colormap="viridis",
    colorbar=False,
)
sns.despine(ax=ax, offset=10)




#### Again, let's make it an imbalanced dataset

In [None]:
# Under-sample class 1 to 50 samples while keeping 250 samples of class 0.
# Extra keyword arguments are only forwarded to a *callable* sampling_strategy,
# so none are passed alongside the dict. `random_state` fixes which samples are
# kept, making the notebook reproducible across runs.
X_resampled, y_resampled = make_imbalance(
    X,
    y,
    sampling_strategy={0: 250, 1: 50},
    random_state=42,
)

#### Split into train-test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the data for testing. With only 50 minority samples, an
# unstratified split can noticeably shift the class ratio between train and
# test, so the split is stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.3,
    random_state=42,
    stratify=y_resampled,
)

#### Visual representation

In [None]:
# Visualise the imbalanced training split; colour encodes the class label.
ax = X_train.plot(
    kind="scatter",
    c=y_train,
    x="feature1",
    y="feature2",
    colorbar=False,
    colormap="viridis",
)
sns.despine(offset=10, ax=ax)

#### KMeans SMOTE Implementation

In [None]:
from imblearn.over_sampling import KMeansSMOTE

# KMeansSMOTE with default settings apart from a fixed seed; resample the
# training split only.
sm = KMeansSMOTE(
    random_state=42,
)
X_over, y_overs = sm.fit_resample(X=X_train, y=y_train)

#### Visual representation of oversampled dataset

In [None]:
print("Let's graph again")
# Scatter of the KMeansSMOTE-oversampled training data, coloured by class label.
ax = X_over.plot(
    kind="scatter",
    x="feature1",
    y="feature2",
    c=y_overs,
    colormap="viridis",
    colorbar=False,
)
sns.despine(offset=10, ax=ax)

#### SVM SMOTE Implementation

In [None]:
from imblearn.over_sampling import SVMSMOTE

# SVMSMOTE with default settings apart from a fixed seed; resample the
# training split only.
sm = SVMSMOTE(
    random_state=42,
)
X_res_svm, y_res_svm = sm.fit_resample(X=X_train, y=y_train)

#### Visual representation of oversampled dataset

In [None]:
# Scatter of the SVMSMOTE-oversampled training data, coloured by class label.
ax = X_res_svm.plot(
    kind="scatter",
    c=y_res_svm,
    x="feature1",
    y="feature2",
    colorbar=False,
    colormap="viridis",
)
sns.despine(offset=10, ax=ax)