# Handling class imbalance by oversampling

In [17]:
import pandas as pd

In [18]:
df = pd.read_csv('HIV.csv')
df.head()


Unnamed: 0,smiles,activity,HIV_active
0,CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...,CI,0
1,C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3...,CI,0
2,CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21,CI,0
3,Nc1ccc(C=Cc2ccc(N)cc2S(=O)(=O)O)c(S(=O)(=O)O)c1,CI,0
4,O=S(=O)(O)CCS(=O)(=O)O,CI,0


In [19]:
df['HIV_active'].value_counts()

0    39684
1     1443
Name: HIV_active, dtype: int64

We have 40,000 examples for negative class and 1500 examples for positive class.

## Oversampling the minority class

In [20]:
from sklearn.utils import resample

In [21]:
# Separate majority and minority classes
df_majority = df[df.HIV_active == 0]
df_minority = df[df.HIV_active == 1]

In [22]:
neg_class = df["HIV_active"].value_counts()[0]
pos_class = df["HIV_active"].value_counts()[1]

# Oversampling multiplier
multiplier = int(neg_class/pos_class) - 1

# Manual override multiplier
multiplier = 7

# Samples needed for minority class
n_min_samples = multiplier * len(df_minority)

In [23]:
# Upsample minority class
df_minority_oversampled = resample(df_minority,
                                 replace=True,    # sample with replacement
                                 n_samples=n_min_samples,  # Increase the minority class size by 5
                                 random_state=42)  # reproducible results

In [24]:
# Combine majority class with upsampled minority class
df_oversampled = pd.concat([df_majority, df_minority_oversampled])
df_oversampled.HIV_active.value_counts()

0    39684
1    10101
Name: HIV_active, dtype: int64

Now we have 40,000 examples from negative class and 10000 samples from positive class.

In [25]:
# Shuffle dataset before saving
final_df = df_oversampled.sample(frac=1, random_state=42)
len(final_df)

49785

## Making train and test datasets

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
train_df, test_df = train_test_split(final_df, test_size=0.2)

In [28]:
# Display the resulting DataFrames
print("Training Set:")
print(train_df)
print("\nTest Set:")
print(test_df)

Training Set:
                                                  smiles activity  HIV_active
14904         Cc1ccc(C2=Nc3ccccc3SC(c3cccc4c3OCO4)C2)cc1       CI           0
26087  CCOC(=O)C1=NN(c2ccccc2)C(=O)C1=CNC(=S)NN=C1C(=...       CI           0
18424              Cc1cn(C2CC(OC=O)C(CO)O2)c(=O)[nH]c1=O       CM           1
685                    [N-]=[N+]=C(S(=O)(=O)O)S(=O)(=O)O       CI           0
14843  N.N[Co-4](N)([OH+][N+](=O)[O-])([OH+][N+](=O)[...       CI           0
...                                                  ...      ...         ...
37809  CCN1CCC(O)(C=Cc2ccc(C(C)C)cc2)C(C(=O)C=Cc2ccc(...       CI           0
2182   CCN(CC)CCn1c(C)nc2c1C(=O)c1nc(C)n(CCN(CC)CC)c1...       CI           0
16193             O=[N+]([O-])c1ccccc1C1SCc2nc3ccccc3n21       CM           1
17030                     COc1ccc(NC(=O)C(=NO)C(C)=O)cc1       CI           0
30380  CCOC(=O)CC(=O)CSc1c(-c2ccccc2)c(=O)n(-c2ccccc2...       CI           0

[39828 rows x 3 columns]

Test Set:
             

In [29]:
print("Negative examples in train data: ", train_df["HIV_active"].value_counts()[0])
print("Posiitve examples in train data: ", train_df["HIV_active"].value_counts()[1])

print("Negative examples in test data: ", test_df["HIV_active"].value_counts()[0])
print("Posiitve examples in test data: ", test_df["HIV_active"].value_counts()[1])

Negative examples in train data:  31756
Posiitve examples in train data:  8072
Negative examples in test data:  7928
Posiitve examples in test data:  2029


In [30]:
import os

# Create directories
os.makedirs('data/train/raw', exist_ok=True)
os.makedirs('data/train/processed', exist_ok=True)
os.makedirs('data/test/raw', exist_ok=True)
os.makedirs('data/test/processed', exist_ok=True)

In [31]:
# Save the datasets to CSV files
train_df.to_csv('data/train/raw/train.csv', index=False)
test_df.to_csv('data/test/raw/test.csv', index=False)