In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Read the PCA fingerprint results and discard the "Unnamed: 0" column
# (presumably a row index saved by an earlier to_csv call — TODO confirm).
pca_table = pd.read_csv("../data/pca/pca_fingerprint_results.csv")
dataset = pca_table.drop(columns=["Unnamed: 0"])
dataset

In [None]:
# Count rows per value of the "target" label to inspect the class balance.
dataset["target"].value_counts()

# 1. Split Data between Ligand and Decoy compounds

In [None]:
# Partition the table by class label: target == 1 rows go to ligand_df,
# target == 0 rows go to decoy_df. Each subset is renumbered from 0.
is_ligand = dataset["target"] == 1
is_decoy = dataset["target"] == 0
ligand_df = dataset.loc[is_ligand].reset_index(drop=True)
decoy_df = dataset.loc[is_decoy].reset_index(drop=True)

In [None]:
# Preview the first 10 rows of the ligand subset.
ligand_df.head(10)

In [None]:
# Preview the first 10 rows of the decoy subset.
decoy_df.head(10)

In [None]:
# Column/dtype summary of the ligand subset (author's note: 78 entries).
ligand_df.info()

In [None]:
# Column/dtype summary of the decoy subset (author's note: 2322 entries).
decoy_df.info()

# 2. Take 150 random samples of decoy compounds

In [None]:
# Draw 150 decoy rows at random (fixed seed, so the draw is repeatable)
# and renumber them from 0.
decoy_draw = decoy_df.sample(n=150, random_state=42)
decoy_samples = decoy_draw.reset_index(drop=True)

# Keep every ligand row; work on a copy so ligand_df itself stays untouched.
ligand_samples = ligand_df.copy()

In [None]:
# Display the randomly sampled decoy rows.
decoy_samples

In [None]:
# Display the ligand rows carried forward (a copy of ligand_df).
ligand_samples

# 3. Concatenate data of randomly sampled Decoy with Ligand

In [None]:
# Stack the sampled decoys on top of the ligands, giving the combined
# table a fresh 0..n-1 row index.
sampled_frames = [decoy_samples, ligand_samples]
sampled_data = pd.concat(sampled_frames, ignore_index=True)
sampled_data.head(10)

In [None]:
# Column/dtype summary of the combined table
# (author's note: 228 entries = 150 decoys + 78 ligands).
sampled_data.info()

# 4. Oversampling of Ligand using SMOTE

In [None]:
# Split the combined table into labels (the 'target' column) and
# features (every remaining column).
y = sampled_data['target']
X = sampled_data.drop('target', axis=1)

In [None]:
# Configure the SMOTE oversampler; the fixed random_state keeps the
# generated synthetic samples repeatable between runs.
smote = SMOTE(random_state=7, sampling_strategy='auto')

# Fit on (X, y) and unpack the oversampled features and labels.
smote_output = smote.fit_resample(X, y)
X_resampled, y_resampled = smote_output

In [None]:
# Rebuild one table from the SMOTE output: the feature columns (named as in X)
# followed by the labels as a trailing 'target' column.
features_over = pd.DataFrame(X_resampled, columns=X.columns)
labels_over = pd.Series(y_resampled, name='target')
oversampled_data = pd.concat([features_over, labels_over], axis=1)
oversampled_data

In [None]:
# Column/dtype summary of the oversampled table (author's note: 300 entries).
oversampled_data.info()

In [None]:
# Count rows per class after oversampling; the author's note says ligand and
# decoy counts are balanced here.
oversampled_data["target"].value_counts()

In [None]:
# Save the oversampled table as CSV. No index option is passed, so the row
# index is written as well (pandas default) and comes back as an unnamed
# first column when the file is read again.
oversampled_csv_path = "../data/resampled/oversampled_data.csv"
oversampled_data.to_csv(oversampled_csv_path)

# 5. Undersampling of Decoy using RandomUnderSampler

In [None]:
# Configure the random undersampler; the fixed random_state keeps the
# selection of retained rows repeatable between runs.
undersampler = RandomUnderSampler(random_state=3, sampling_strategy='auto')

# Undersample the same (X, y) that was fed to SMOTE.
# NOTE: this rebinds X_resampled / y_resampled, which held the SMOTE output.
undersample_output = undersampler.fit_resample(X, y)
X_resampled, y_resampled = undersample_output

In [None]:
# Rebuild one table from the undersampler output: the feature columns (named
# as in X) followed by the labels as a trailing 'target' column, with rows
# renumbered from 0.
features_under = pd.DataFrame(X_resampled, columns=X.columns)
labels_under = pd.Series(y_resampled, name='target')
undersampled_joined = pd.concat([features_under, labels_under], axis=1)
undersampled_data = undersampled_joined.reset_index(drop=True)
undersampled_data.head(10)

In [None]:
# Column/dtype summary of the undersampled table.
undersampled_data.info()

In [None]:
# Count rows per class after undersampling; the author's note says ligand and
# decoy counts are balanced here.
undersampled_data["target"].value_counts()

In [None]:
# Save the undersampled table as CSV. No index option is passed, so the row
# index is written as well (pandas default) and comes back as an unnamed
# first column when the file is read again.
undersampled_csv_path = "../data/resampled/undersampled_data.csv"
undersampled_data.to_csv(undersampled_csv_path)