# Handling class imbalance by oversampling

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('HIV.csv')
df.head()


Unnamed: 0,smiles,activity,HIV_active
0,CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...,CI,0
1,C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3...,CI,0
2,CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21,CI,0
3,Nc1ccc(C=Cc2ccc(N)cc2S(=O)(=O)O)c(S(=O)(=O)O)c1,CI,0
4,O=S(=O)(O)CCS(=O)(=O)O,CI,0


In [3]:
df['HIV_active'].value_counts()

0    39684
1     1443
Name: HIV_active, dtype: int64

We have 40,000 examples for negative class and 1500 examples for positive class.

## Oversampling the minority class

In [4]:
from sklearn.utils import resample

In [22]:
# Separate majority and minority classes
df_majority = df[df.HIV_active == 0]
df_minority = df[df.HIV_active == 1]

In [23]:
neg_class = df["HIV_active"].value_counts()[0]
pos_class = df["HIV_active"].value_counts()[1]

# Oversampling multiplier
multiplier = int(neg_class/pos_class) - 1

# Manual override multiplier
multiplier = 7

# Samples needed for minority class
n_min_samples = multiplier * len(df_minority)

In [24]:
# Upsample minority class
df_minority_oversampled = resample(df_minority,
                                 replace=True,    # sample with replacement
                                 n_samples=n_min_samples,  # Increase the minority class size by 5
                                 random_state=42)  # reproducible results

In [25]:
# Combine majority class with upsampled minority class
df_oversampled = pd.concat([df_majority, df_minority_oversampled])
df_oversampled.HIV_active.value_counts()

0    39684
1    10101
Name: HIV_active, dtype: int64

Now we have 40,000 examples from negative class and 10000 samples from positive class.

In [30]:
# Shuffle dataset before saving
final_df = df_oversampled.sample(frac=1, random_state=42)

In [31]:
# Save the oversampled dataframe
final_df.to_csv('oversampled_dataset.csv', index=False)

## Making train and test datasets

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
train_df, test_df = train_test_split(final_df, test_size=0.2)

In [34]:
# Display the resulting DataFrames
print("Training Set:")
print(train_df)
print("\nTest Set:")
print(test_df)

Training Set:
                                                  smiles activity  HIV_active
12455                 CC(=O)OC1CCn2nnc(COC(=O)NC(C)C)c21       CI           0
14703         Cc1ccc2[nH]c(Sc3c([N+](=O)[O-])ncn3C)nc2c1       CI           0
9643                         Cn1c(=O)c2ncc(N)nc2n(C)c1=O       CI           0
8693              CCN(CC)C1=Nc2ccccc2N(c2ccccc2)C(SC)=C1       CI           0
4600               CC1CCCc2c(CC3CC(=O)NC(=O)C3)nc(N)nc21       CI           0
...                                                  ...      ...         ...
9522         NS(=O)(=O)c1ccc(N=NC2Sc3nc4ccccc4n3C2=O)cc1       CI           0
25336       CCN(CC)CCN1CCN=c2c3cc(OC)ccc3n3cnc4ccc1c2c43       CI           0
15701  O=S(=O)(O)c1ccc2c(c1)NN(c1ccc(C=Cc3ccc(N=Nc4cc...       CA           1
24245                   COC(=NN=Cc1cc(OC)ccc1OC)c1ccncc1       CI           0
13168  CC1=CC2c3c([nH]c4ccccc34)C(c3c[nH]c4ccccc34)C2CC1       CI           0

[39828 rows x 3 columns]

Test Set:
             

In [35]:
print("Negative examples in train data: ", train_df["HIV_active"].value_counts()[0])
print("Posiitve examples in train data: ", train_df["HIV_active"].value_counts()[1])

print("Negative examples in test data: ", test_df["HIV_active"].value_counts()[0])
print("Posiitve examples in test data: ", test_df["HIV_active"].value_counts()[1])

Negative examples in train data:  31797
Posiitve examples in train data:  8031
Negative examples in test data:  7887
Posiitve examples in test data:  2070
