In [1]:
# NOTE(review): this cell needs `pd`, `category_0` and `category_1_up`, which are
# only created further down (the import cell and the "Upsampling - Method 1" cells).
# On a fresh kernel it raises NameError, so run it after those cells.
# Reassemble the majority class with the upsampled minority class.
data = pd.concat([category_0, category_1_up], axis=0)
# Shuffle the rows; the fixed seed keeps the shuffle reproducible across runs.
data = data.sample(frac=1, random_state=42)
data['TARGET_B'].value_counts()

NameError: name 'pd' is not defined

# Handling Data Imbalance in a Classification Model

In [None]:
import warnings

import pandas as pd

# Show every column when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# Directory holding the CSV files, relative to the current working directory.
path="Data/"
# Load the numerical features, the categorical features and the targets, in that order.
numerical_df, categorical_df, targets_df = (
    pd.read_csv(path + csv_name)
    for csv_name in ('numerical.csv', 'categorical.csv', 'target.csv')
)

In [None]:
# Total number of rows read from the numerical CSV.
NUMROWS = len(numerical_df)
NUMROWS

In [None]:
# Print the dimensions, then preview only the first rows instead of rendering the whole frame.
print(categorical_df.shape)
categorical_df.head()

In [None]:
# Print the dimensions, then preview only the first rows instead of rendering the whole frame.
print(targets_df.shape)
targets_df.head()

In [None]:
# Print the dimensions, then preview only the first rows instead of rendering the whole frame.
print(numerical_df.shape)
numerical_df.head()

In [None]:
# Count how many rows carry each TARGET_B label.
targets_df['TARGET_B'].value_counts()

In [None]:
# Number of negative labels (TARGET_B == 0), derived from the data rather than
# hard-coded so it stays correct if the target file changes.
NEG_CLASS_CNT = int((targets_df['TARGET_B'] == 0).sum())

In [None]:
 # As the counts above show, the two categories are heavily imbalanced:
 # category 0 appears 90569 times, whereas category 1 appears only 4843 times.

In [None]:
# Percentage of all rows that belong to the majority (negative) class.
majority_pct = NEG_CLASS_CNT/len(targets_df['TARGET_B'])*100
print("The majority class (negative cases) represents {:.2f}% of the data".format(majority_pct))

In [None]:
# Put the numerical features and the target columns side by side in one frame.
data = pd.concat([numerical_df, targets_df], axis='columns')

In [None]:
# Remove the TARGET_D column and preview the result.
data = data.drop(columns=['TARGET_D'])
data.head()

In [None]:
data.shape # the frame now has one extra column holding the TARGET_B labels

## Downsampling

In <b>downsampling</b>, we randomly sample rows from the majority class, without replacement, until it is the same size as the minority class.

In [None]:
# Split the rows by label: 0 is the negative (majority) class, 1 the positive (minority) class.
category_0 = data.loc[data['TARGET_B'].eq(0)]
category_1 = data.loc[data['TARGET_B'].eq(1)]

In [None]:
# Report the shape of each class subset, majority class first.
for class_frame in (category_0, category_1):
    print(class_frame.shape)

In [None]:
# Row count of the positive (minority) class.
c1_len = category_1.shape[0]
c1_len

In [None]:
# Downsample the majority class to the size of the positive class with
# DataFrame.sample (which draws without replacement by default). The fixed seed
# makes the selected rows reproducible across runs.
category_0_down = category_0.sample(n=c1_len, random_state=42)
print(category_0_down.shape)
print(category_1.shape)

In [None]:
# reassemble the data
data = pd.concat([category_0_down, category_1], axis=0)
# Shuffle the rows: frac=1 returns every row in random order, so the row count is
# unchanged. The fixed seed keeps the order reproducible across runs.
data = data.sample(frac=1, random_state=42)
data['TARGET_B'].value_counts()

In [None]:
# Print the dimensions, then preview only the first rows instead of rendering the whole frame.
print(data.shape)
data.head()

## Upsampling

### Method 1 - using pandas sample

In [None]:
# Rebuild the full frame from the raw inputs (features + targets, minus TARGET_D)
# and split it again by TARGET_B label.
data = pd.concat([numerical_df, targets_df], axis=1).drop(columns=['TARGET_D'])
category_0 = data.loc[data['TARGET_B'].eq(0)]
category_1 = data.loc[data['TARGET_B'].eq(1)]

In [None]:
# Row counts of the negative and positive classes, displayed as a pair.
c0_len, c1_len = len(category_0), len(category_1)
c0_len, c1_len

In [None]:
# Upsample the positive class to the length of the negative class.
# replace=True lets the same row be drawn more than once, which is what allows
# the sample to be larger than the class itself. The fixed seed makes the draw
# reproducible across runs.
category_1_up = category_1.sample(n=c0_len, replace=True, random_state=42)
print(category_1_up.shape) # same number of rows as category_0

In [None]:
# Preview only the first rows of the upsampled class instead of rendering the whole frame.
category_1_up.head()

### Method 2: Upsampling using SMOTE

In [None]:
# Rebuild the frame without TARGET_D, then separate the TARGET_B labels (y)
# from the remaining columns (X).
data = pd.concat([numerical_df, targets_df], axis=1).drop(columns=['TARGET_D'])
y = data['TARGET_B']
X = data.drop(columns=['TARGET_B'])

<br>Install the `imbalanced-learn` package (which provides SMOTE) using one of the following:
- conda install -c conda-forge imbalanced-learn
- conda install -c glemaitre imbalanced-learn

The SMOTE algorithm can be broken down into following steps:

+ Randomly pick a point from the minority class.
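+ Compute its k nearest neighbors within the minority class (for some pre-specified k).
+ Create new synthetic points at random positions on the line segments joining the chosen point to its neighbors (one or more of the k neighbors, depending on how much oversampling is needed).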

In [None]:
from imblearn.over_sampling import SMOTE
# Fixed seed so the generated synthetic samples are the same on every run.
smote = SMOTE(random_state=42)

In [None]:
# refresh the data and do X,y split
data = pd.concat([numerical_df, targets_df], axis=1)
# Drop TARGET_D as the other refresh cells do; otherwise this second target
# column would stay in X and be passed to the resamplers as a feature.
data = data.drop(['TARGET_D'], axis=1)
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis=1)
y.value_counts()

In [None]:
# Class distribution of the labels before resampling.
y.value_counts()

In [None]:
# Oversample with SMOTE; fit_resample returns the resampled features and labels.
X_sm, y_sm = smote.fit_resample(X, y)
# Check the class distribution of the resampled labels.
y_sm.value_counts()

## Downsampling using Tomek links

+ Tomek's links are pairs of very close instances, but of opposite classes. A Tomek’s link exists if two samples are the nearest neighbors of each other.
+ Removing the instances of the majority class of each pair increases the space between the two classes, facilitating the classification process.
+ It does not make the two classes equal but only removes the points from the majority class that are close to other points in minority class.

In [None]:
from imblearn.under_sampling import TomekLinks

# sampling_strategy must be passed by keyword: current imbalanced-learn releases
# declare it keyword-only, so TomekLinks('majority') raises a TypeError there.
tl = TomekLinks(sampling_strategy='majority') # resample only the majority class
X_tl, y_tl = tl.fit_resample(X,y) # returns resampled data
y_tl.value_counts() # check the distribution of resampled labels