1. Random Sampling
2. Stratified Sampling
3. Bootstrapping
4. Oversampling
5. Undersampling

In [1]:
#Import necessary libraries
import pandas as pd
import numpy as np

In [2]:
# Build a reproducible 20-row toy dataset: two numeric features and a label drawn from A/B/C
np.random.seed(42)  # seed NumPy's global RNG so the three draws below repeat on every run
data = dict(
    Feature1=np.random.randint(low=1, high=100, size=20),
    Feature2=np.random.uniform(low=0, high=1, size=20),
    Label=np.random.choice(['A', 'B', 'C'], size=20),
)
df = pd.DataFrame(data)

In [4]:
# Show the DataFrame as this cell's output (bare last expression -> notebook display)
df

Unnamed: 0,Feature1,Feature2,Label
0,52,0.181825,B
1,93,0.183405,B
2,15,0.304242,B
3,72,0.524756,A
4,61,0.431945,B
5,21,0.291229,A
6,83,0.611853,B
7,87,0.139494,C
8,75,0.292145,C
9,75,0.366362,A


### Random Sampling

In [5]:
# Simple random sampling: pick 10 rows from df; the fixed random_state makes the pick repeatable
random_sample = df.sample(random_state=52, n=10)
print('Random Sampling:', random_sample, sep='\n')

Random Sampling:
    Feature1  Feature2 Label
6         83  0.611853     B
2         15  0.304242     B
4         61  0.431945     B
8         75  0.292145     C
17        30  0.170524     B
7         87  0.139494     C
16        88  0.607545     B
9         75  0.366362     A
12         3  0.199674     B
1         93  0.183405     B


### Stratified Sampling

In [6]:
# Per-class sampling: group rows by 'Label' and draw up to 2 rows from each group.
# min(len(x), 2) keeps the draw valid for groups that hold fewer than 2 rows;
# group_keys=False keeps the original row index instead of adding the label as an index level.
stratified_sample = df.groupby('Label', group_keys=False).apply(
    lambda x: x.sample(min(len(x), 2), random_state=42)
)
print('Stratified Sampling:')  # header typo ('Satratified') fixed
print(stratified_sample)

Satratified Sampling:
    Feature1  Feature2 Label
5         21  0.291229     A
13        22  0.514234     A
18        38  0.065052     B
17        30  0.170524     B
8         75  0.292145     C
11        24  0.785176     C


### Bootstrapping

In [7]:
# Bootstrap resample: draw 20 rows *with* replacement, so the same row may appear several times
bootstrapped_sample = df.sample(replace=True, n=20, random_state=42)
print('Bootstrapped Sampling:', bootstrapped_sample, sep='\n')

Bootstrapped Sampling:
    Feature1  Feature2 Label
6         83  0.611853     B
19         2  0.948886     B
14        53  0.592415     B
10        88  0.456070     C
7         87  0.139494     C
6         83  0.611853     B
18        38  0.065052     B
10        88  0.456070     C
10        88  0.456070     C
3         72  0.524756     A
7         87  0.139494     C
2         15  0.304242     B
1         93  0.183405     B
11        24  0.785176     C
5         21  0.291229     A
1         93  0.183405     B
0         52  0.181825     B
11        24  0.785176     C
11        24  0.785176     C
16        88  0.607545     B


### Oversampling

In [3]:
# Environment setup commands, kept commented out so running this cell does nothing.
# NOTE(review): presumably used to repair the imbalanced-learn / scikit-learn install that the
# SMOTE import relies on - confirm. If needed, prefer `%pip install ...` with pinned versions
# in a dedicated cell at the top of the notebook.
#pip uninstall imbalanced-learn scikit-learn
#pip install imbalanced-learn scikit-learn

In [6]:
from imblearn.over_sampling import SMOTE

In [7]:
# Build a reproducible, deliberately imbalanced 100-row dataset:
# labels are drawn with probabilities A=0.7, B=0.2, C=0.1
np.random.seed(42)  # re-seed so this cell gives the same data regardless of earlier draws
data = dict(
    Feature1=np.random.randint(low=1, high=100, size=100),
    Feature2=np.random.uniform(low=0, high=1, size=100),
    Label=np.random.choice(['A', 'B', 'C'], size=100, p=[0.7, 0.2, 0.1]),
)
df = pd.DataFrame(data)

In [15]:
#Check class distribution before oversampling
print("Class distribution before oversampling")
print(df['Label'].value_counts())

#Apply SMOTE for oversampling: fit on the two feature columns with 'Label' as the target
oversample = SMOTE(random_state=42)
X, y = df[['Feature1', 'Feature2']], df["Label"]
X_oversampled, y_oversampled = oversample.fit_resample(X, y)

#Create a new dataframe with the oversampled data.
#axis=1 places the label column next to the feature columns. The default (axis=0) would
#stack the two frames vertically, doubling the row count and filling the gaps with NaN.
oversampled_df = pd.concat(
    [
        pd.DataFrame(X_oversampled, columns=['Feature1', 'Feature2']),
        pd.DataFrame({'Label': y_oversampled}),
    ],
    axis=1,
)
#Sanity check: one row per resampled label, i.e. nothing was stacked vertically
assert len(oversampled_df) == len(y_oversampled)

#Check the class distribution after applying SMOTE oversampling
print("\nClass Distribution after Oversampling")
print(oversampled_df['Label'].value_counts())

Class distribution before oversampling
Label
A    71
B    17
C    12
Name: count, dtype: int64

Class Distribution after Oversampling
Label
A    71
B    71
C    71
Name: count, dtype: int64


### Undersampling

In [18]:
# Shape of the class 'C' subset, computed directly from df: the name class_C is only
# assigned in a later cell, so referencing it here fails on Restart & Run All.
df[df['Label'] == 'C'].shape

(12, 3)

In [16]:
# Split the rows by class label; 'A' is assumed to be the majority class
class_A = df.loc[df['Label'] == 'A']
class_B = df.loc[df['Label'] == 'B']
class_C = df.loc[df['Label'] == 'C']

# Undersample: keep 5 randomly chosen 'A' rows (fixed seed) plus every 'B' and 'C' row.
# NOTE(review): with the counts printed earlier (B=17, C=12), n=5 leaves 'A' as the
# smallest class - confirm this is the intended target size.
undersampled_df = pd.concat(
    [
        class_A.sample(n=5, random_state=42),
        class_B,
        class_C,
    ]
)
print('Undersampling:', undersampled_df, sep='\n')

Undersampling:
    Feature1  Feature2 Label
29        91  0.561277     A
0         52  0.280935     A
67        81  0.222108     A
5         21  0.986887     A
74         4  0.363630     A
1         93  0.542696     B
19         2  0.063558     B
22        21  0.729606     B
27        89  0.713245     B
32        92  0.522733     B
33        60  0.427541     B
34        80  0.025419     B
35        15  0.107891     B
39        62  0.508571     B
50         4  0.633404     B
53        14  0.186570     B
55        90  0.539342     B
64        47  0.006952     B
66        78  0.417411     B
71         2  0.323203     B
85        95  0.278646     B
88        72  0.144895     B
14        53  0.358466     C
18        38  0.330898     C
28        49  0.760785     C
62        44  0.818015     C
76        93  0.962447     C
77        63  0.251782     C
78        18  0.497249     C
81        34  0.036887     C
83        62  0.502679     C
84        14  0.051479     C
92        40  0.672136     C