In [1]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Under-sampling

This technique removes samples from the majority class until it matches the size of the minority class. 
A seed is specified so that the same samples are removed every time it is run on the same dataset; we can then check whether balancing the data improves the results. 

In [2]:
def undersampling_data(df, text_col='text', label_col='label',
                       sampling_strategy='all', random_state=42):
    """Return a class-balanced copy of ``df`` built by random under-sampling.

    Only the text and label columns are carried over to the result; any other
    column of ``df`` is dropped. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing at least ``text_col`` and ``label_col``.
    text_col : str, default 'text'
        Name of the column holding the samples.
    label_col : str, default 'label'
        Name of the column holding the class labels.
    sampling_strategy : str, float, dict or callable, default 'all'
        Forwarded unchanged to ``RandomUnderSampler``.
    random_state : int or None, default 42
        Seed forwarded to ``RandomUnderSampler`` so that repeated runs on the
        same data give the same result.

    Returns
    -------
    pandas.DataFrame
        Two columns (``text_col``, ``label_col``) with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``text_col`` or ``label_col`` is missing from ``df``.
    """
    sampler = RandomUnderSampler(sampling_strategy=sampling_strategy,
                                 random_state=random_state)

    # The sampler works on a 2-D feature matrix, so the single text column is
    # presented as shape (n_samples, 1) and flattened again afterwards.
    features = df[text_col].to_numpy().reshape(-1, 1)
    texts_resampled, labels_resampled = sampler.fit_resample(features, df[label_col])

    # Drop whatever index the sampler attached to the labels so the returned
    # frame is always indexed 0..n-1, independent of the sampler's output type.
    labels_resampled = pd.Series(labels_resampled).reset_index(drop=True)

    return pd.DataFrame({
        text_col: texts_resampled.ravel(),
        label_col: labels_resampled,
    })
    


# Over-sampling 

Randomly duplicates samples of the minority class until it matches the size of the majority class.
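- With (`random_state=0.42`) the result exceeded the number of the majority class by half with respect to the minority one. (Note: `random_state` is only the random seed and should be an integer; the class proportions are controlled by `sampling_strategy`, so this observation should be re-checked.)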
- With (`sampling_strategy='minority'`) the minority class is increased until it has the same number of samples as the majority class.

In [3]:
def oversampling_data(df, text_col='text', label_col='label',
                      sampling_strategy='all', random_state=42):
    """Return a class-balanced copy of ``df`` built by random over-sampling.

    Only the text and label columns are carried over to the result; any other
    column of ``df`` is dropped. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing at least ``text_col`` and ``label_col``.
    text_col : str, default 'text'
        Name of the column holding the samples.
    label_col : str, default 'label'
        Name of the column holding the class labels.
    sampling_strategy : str, float, dict or callable, default 'all'
        Forwarded unchanged to ``RandomOverSampler``.
    random_state : int or None, default 42
        Seed forwarded to ``RandomOverSampler`` so that repeated runs on the
        same data give the same result.

    Returns
    -------
    pandas.DataFrame
        Two columns (``text_col``, ``label_col``) with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``text_col`` or ``label_col`` is missing from ``df``.
    """
    sampler = RandomOverSampler(sampling_strategy=sampling_strategy,
                                random_state=random_state)

    # The sampler works on a 2-D feature matrix, so the single text column is
    # presented as shape (n_samples, 1) and flattened again afterwards.
    features = df[text_col].to_numpy().reshape(-1, 1)
    texts_resampled, labels_resampled = sampler.fit_resample(features, df[label_col])

    # Drop whatever index the sampler attached to the labels so the returned
    # frame is always indexed 0..n-1, independent of the sampler's output type.
    labels_resampled = pd.Series(labels_resampled).reset_index(drop=True)

    return pd.DataFrame({
        text_col: texts_resampled.ravel(),
        label_col: labels_resampled,
    })

