# Over-sampling and Under-sampling
*Víctor Acevedo Vitvitskaya*

In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

First, we are going to illustrate Tomek links.

In [None]:
from sklearn import datasets
import numpy as np
from collections import Counter
# Build a synthetic, heavily imbalanced two-class dataset:
# the `weights` entry puts roughly 3% of the samples in class 0 and 97% in class 1.
synth_kwargs = dict(
    n_samples=2000,             # number of data points
    n_classes=2,                # number of classes
    n_clusters_per_class=2,     # number of clusters per class
    weights=[0.03, 0.97],       # proportions assigned to each class
    n_features=10,              # total number of features
    n_informative=2,            # number of informative features
    n_redundant=2,              # number of redundant features
    random_state=0,             # fixed seed so the dataset is reproducible
)
X, y = datasets.make_classification(**synth_kwargs)

# Report how many samples ended up in each class
print('Original dataset shape {}'.format(Counter(y)))

In [None]:
from sklearn.decomposition import PCA
import pylab as pl
%pylab inline
def plot_this(X_rs, y_rs, method):
    """Plot a two-class dataset on its first two principal components.

    Parameters
    ----------
    X_rs : 2-D array-like
        Feature matrix. A 2-component PCA is fitted on it and used to
        project it to two columns for plotting.
    y_rs : 1-D array-like
        Labels aligned row-by-row with ``X_rs``. Samples labelled 0 are drawn
        as red circles ('Class 1') and samples labelled 1 as green stars
        ('Class 2'); samples with any other label are not drawn.
    method : str
        Text used as the plot title.
    """
    # Use principal components to condense the features to 2 dimensions
    pca = PCA(n_components=2).fit(X_rs)
    pca_2d = pca.transform(X_rs)

    # Convert labels to a plain array so the masks below are positional and
    # line up with the rows of pca_2d even if y_rs is a pandas Series.
    labels = np.asarray(y_rs)
    is_class0 = labels == 0
    is_class1 = labels == 1

    # One scatter call per class instead of one per sample: far fewer artists,
    # and both legend handles exist even when a class has no samples.
    c1 = pl.scatter(pca_2d[is_class0, 0], pca_2d[is_class0, 1], c='r', marker='o')
    c2 = pl.scatter(pca_2d[is_class1, 0], pca_2d[is_class1, 1], c='g', marker='*')

    pl.legend([c1, c2], ['Class 1', 'Class 2'])
    pl.title(method)
    pl.axis([-4, 5, -4, 4])  # x axis (-4,5), y axis (-4,4)
    pl.show()
    
# Plot the synthetic dataset before any resampling, titled 'Original'
plot_this(X,y,'Original')

In [None]:
# TomekLinks: under-sample by removing majority-class samples that form Tomek links.
# `sampling_strategy` and `fit_resample` are the current names of the `ratio`
# and `fit_sample` API, which imbalanced-learn deprecated in 0.4 and later
# removed. TomekLinks is deterministic, so no random_state is passed.
sampler = TomekLinks(sampling_strategy='majority')
X_rs, y_rs = sampler.fit_resample(X, y)
# Class counts after under-sampling, then the same PCA plot as for the original data
print('TomekLinks undersampling {}'.format(Counter(y_rs)))
plot_this(X_rs,y_rs,'TomekLinks')

Let's see an application of SMOTE.

In [None]:
# Relative path to the bank dataset CSV, joined with the OS-specific separator
PATH_FILE = os.path.join("datasets", "BankDataUCI", "bank-full.csv")

In [None]:
# Load the CSV into a DataFrame and preview its first rows.
# NOTE(review): the public UCI copy of bank-full.csv is ';'-delimited; presumably
# this local copy is comma-separated — confirm, otherwise pass sep=';' here.
bank = pd.read_csv(PATH_FILE)
bank.head()

In [None]:
# (number of rows, number of columns) of the loaded frame
bank.shape

In [None]:
# Column labels of the loaded frame
bank.columns

In [None]:
# Encode the four yes/no columns (including the target "y") as 0/1
yes_no_codes = {"no": 0, "yes": 1}
for col in ("default", "housing", "loan", "y"):
    bank[col] = bank[col].map(yes_no_codes)

# Ordinal encoding for education; values outside the mapping become NaN
education_codes = {"primary": 0, "secondary": 1, "tertiary": 2}
bank["education"] = bank["education"].map(education_codes)

# Convert abbreviated month names to month numbers
parsed_month = pd.to_datetime(bank["month"], format="%b")
bank["month"] = parsed_month.dt.month

In [None]:
# Count missing values per column (Series.map leaves NaN for values absent from its mapping)
bank.isnull().sum()

In [None]:
# Drop rows containing missing values, then one-hot encode the remaining
# categorical columns (dropping the first level of each to avoid redundancy)
bank = pd.get_dummies(bank.dropna(), drop_first=True)

# Class balance of the target after cleaning
bank["y"].value_counts()

In [None]:
# Separate the target column from the predictors
y = bank["y"]
X = bank.drop(columns="y")

# Stratified hold-out split with a fixed seed, so both folds keep the
# class proportions of the full dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

# Class balance of the training fold
y_train.value_counts()

In [None]:
# Fit a logistic regression on the current training fold and predict the hold-out fold
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Print confusion matrix, accuracy and recall on the hold-out fold, in that order
for metric in (confusion_matrix, accuracy_score, recall_score):
    print(metric(y_test, y_pred))

In [None]:
#SMOTE-----------------------------------------------------------------------------------
# Re-create the split with the same seed so this cell always starts from the
# un-resampled training fold, even when it is re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, stratify=y)
# Over-sample the training fold only; the test fold keeps its original class balance.
# A fixed random_state makes the synthetic samples reproducible.
# `fit_resample` is the current name of the removed `fit_sample` method.
smt = SMOTE(random_state=1)
X_train, y_train = smt.fit_resample(X_train, y_train)


In [None]:
# Refit the logistic regression on the current training data
# (the SMOTE-resampled fold when the preceding cell has been run)
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Print confusion matrix, accuracy and recall on the untouched hold-out fold, in that order
for metric in (confusion_matrix, accuracy_score, recall_score):
    print(metric(y_test, y_pred))