# PRE-PROCESSING (NEW DATA)
In this notebook we pre-process the new dataset containing all the features extracted in the *Feature Extraction* file.

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab runtime)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTENC
import joblib

In [None]:
# Never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

In [None]:
def load_tsv_as_df(folder, filename):
    """Read a tab-separated file located in `folder` into a DataFrame.

    The '.tsv' extension is appended to `filename` when it is missing.
    """
    tsv_name = filename if filename.endswith('.tsv') else filename + '.tsv'
    return pd.read_csv(os.path.join(folder, tsv_name), sep='\t')

In [None]:
def save_df_as_tsv(df, folder, filename):
    """Write `df` into `folder` as a tab-separated file, without the index.

    `folder` is created if it does not exist, '.tsv' is appended to
    `filename` when missing, and the destination path is printed.
    """
    tsv_name = filename if filename.endswith('.tsv') else filename + '.tsv'
    os.makedirs(folder, exist_ok=True)
    path = os.path.join(folder, tsv_name)
    df.to_csv(path, sep='\t', index=False)
    print(f"Saved TSV to: {path}")

# Data Loading
The tsv file is loaded directly into a pandas dataframe.

In [None]:
# Root of the mounted Google Drive (adjust if the project lives elsewhere)
folder_path = '/content/drive/MyDrive'

# Directory that holds the project's TSV datasets
tsv_path = f'{folder_path}/Structural Bioinfo PROJECT/datasets'

# Load the combined feature table into a DataFrame
combined_df = load_tsv_as_df(tsv_path, 'combinated_df_new.tsv')

In [None]:
# Report the dataset dimensions and preview the first rows
print("Shape of the dataset:", combined_df.shape)
combined_df.head()

Shape of the dataset: (2968986, 44)


Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_phi,s_psi,s_a1,s_a2,s_a3,s_a4,s_a5,s_3di_state,s_3di_letter,t_ch,t_resi,t_ins,t_resn,t_ss8,t_rsa,t_phi,t_psi,t_a1,t_a2,t_a3,t_a4,t_a5,t_3di_state,t_3di_letter,Interaction,same_chain,delta_rsa,delta_atchley_1,delta_atchley_2,delta_atchley_3,delta_atchley_4,delta_atchley_5,ca_distance,s_centroid_x,s_centroid_y,t_centroid_x,t_centroid_y
0,1b0y,A,28,,R,H,0.056,-1.127,-0.711,1.538,-0.055,1.502,0.44,2.897,13.0,N,A,31,,A,H,0.434,-1.221,-0.526,-0.591,-1.302,-0.733,1.57,-0.146,9.0,J,VDW,1,0.378,2.129,1.247,2.235,1.13,3.043,4.967376,-1.106118,-1.339661,-1.140001,-2.006822
1,1b0y,A,27,,E,S,0.531,-1.622,0.483,1.357,-1.453,1.477,0.113,-0.837,1.0,B,A,31,,A,H,0.434,-1.221,-0.526,-0.591,-1.302,-0.733,1.57,-0.146,9.0,J,HBOND,1,0.097,1.948,0.151,2.21,1.457,0.691,6.062192,-0.135632,-1.891373,-1.140001,-2.006822
2,1b0y,A,47,,Q,T,0.46,-0.986,-0.566,0.931,-0.179,-3.005,-0.503,-1.853,13.0,N,A,84,,A,P,0.472,-1.557,2.544,-0.591,-1.302,-0.733,1.57,-0.146,2.0,C,HBOND,1,0.012,1.522,1.123,2.272,2.073,1.707,6.714707,-1.106118,-1.339661,0.494826,-0.420486
3,1b0y,A,40,,E,G,0.546,-1.86,-0.062,1.357,-1.453,1.477,0.113,-0.837,12.0,M,A,45,,N,G,0.274,-1.941,0.412,0.945,0.828,1.299,-0.169,0.933,13.0,N,HBOND,1,0.272,0.412,2.281,0.178,0.282,1.77,8.626575,0.690114,-1.255422,-1.106118,-1.339661
4,1b0y,A,37,,P,-,0.551,-0.88,2.32,0.189,2.081,-1.628,0.421,-1.392,14.0,O,A,40,,E,G,0.546,-1.86,-0.062,1.357,-1.453,1.477,0.113,-0.837,12.0,M,HBOND,1,0.005,1.168,3.534,3.105,0.308,0.555,5.669582,2.149514,-0.802992,0.690114,-1.255422


We can immediately appreciate the added features/columns.

Assign the "MISSING" value to the interaction labels that are missing/unclassified.

In [None]:
# Give unlabelled contacts an explicit 'MISSING' label, then store the column as a categorical
combined_df['Interaction'] = (
    combined_df['Interaction']
    .fillna('MISSING')
    .astype('category')
)
combined_df['Interaction'].cat.categories.tolist()

['HBOND',
 'IONIC',
 'MISSING',
 'PICATION',
 'PIHBOND',
 'PIPISTACK',
 'SSBOND',
 'VDW']

# Exploratory Data Analysis and Preprocessing

## Missing values

The number of missing counts per feature and the features with missing values are displayed.

In [None]:
# Count NaNs per column and print only the columns that have at least one
missing_counts = combined_df.isna().sum()
print(missing_counts.loc[missing_counts.gt(0)])

s_ss8              31
s_rsa              63
s_phi           17807
s_psi            6736
s_3di_state     37025
s_3di_letter    37025
t_ss8              56
t_rsa              75
t_phi            6167
t_psi           21474
t_3di_state     44036
t_3di_letter    44036
delta_rsa         138
ca_distance     50115
s_centroid_x    37025
s_centroid_y    37025
t_centroid_x    44036
t_centroid_y    44036
dtype: int64


As expected, rows with missing 3Di state values are also missing the corresponding centroid columns (the counts are identical). The CA distance also shows a substantial number of missing values.

The same type of analysis we did on the original dataset is carried out on the new one:

In [None]:
# Percentage of missing values per column, grouped by Interaction.
# The NaN mask is grouped directly instead of going through groupby(...).apply on the
# whole frame: this avoids applying the function to the grouping column itself and
# makes the handling of categorical levels explicit (observed=True), which removes the
# pandas deprecation warnings tied to those two defaults and is fully vectorized.
# The grouping column is therefore not part of the resulting table.
missing_pct_by_interaction = (
    combined_df.drop(columns='Interaction')
    .isna()
    .groupby(combined_df['Interaction'], observed=True)
    .mean()
    .mul(100)
    .round(2)
)
print(missing_pct_by_interaction)

  missing_pct_by_interaction = combined_df.groupby('Interaction').apply(lambda g: g.isnull().mean() * 100).round(2)


             pdb_id  s_ch  s_resi  s_ins  s_resn  s_ss8  s_rsa  s_phi  s_psi  \
Interaction                                                                    
HBOND           0.0   0.0     0.0    0.0     0.0    0.0   0.00   0.56   0.25   
IONIC           0.0   0.0     0.0    0.0     0.0    0.0   0.00   0.57   0.28   
MISSING         0.0   0.0     0.0    0.0     0.0    0.0   0.01   0.63   0.21   
PICATION        0.0   0.0     0.0    0.0     0.0    0.0   0.00   0.46   0.28   
PIHBOND         0.0   0.0     0.0    0.0     0.0    0.0   0.00   0.78   0.22   
PIPISTACK       0.0   0.0     0.0    0.0     0.0    0.0   0.00   0.40   0.30   
SSBOND          0.0   0.0     0.0    0.0     0.0    0.0   0.00   1.05   0.19   
VDW             0.0   0.0     0.0    0.0     0.0    0.0   0.00   0.63   0.22   

             s_a1  s_a2  s_a3  s_a4  s_a5  s_3di_state  s_3di_letter  t_ch  \
Interaction                                                                  
HBOND         0.0   0.0   0.0   0.0   0.0  

  missing_pct_by_interaction = combined_df.groupby('Interaction').apply(lambda g: g.isnull().mean() * 100).round(2)


Similar conclusions to the ones we made for the original data can be drawn from the added features:
* Missing centroids propagate identically to 3di variables;
* Missing CA distance does not seem to be connected to the interaction type.

 Here too, as with the original data, we eliminate all incomplete observations.

In [None]:
# Keep complete cases only: drop every row that has at least one missing value
combined_df = combined_df.dropna()

In [None]:
# Number of rows per interaction label, most frequent first
interaction_counts = combined_df['Interaction'].value_counts()
print(interaction_counts)

Interaction
MISSING      1038001
HBOND        1013182
VDW           706972
PIPISTACK      36836
IONIC          33841
PICATION        8425
SSBOND          2000
PIHBOND         1707
Name: count, dtype: int64


The resulting full dataset size is still satisfactory, despite the final number of rows being slightly smaller than with the original dataset, due to some additional observations lost with the CA distance.



In [None]:
# Total number of rows, summed over all interaction labels
total_interactions = interaction_counts.sum()
print(f"Total number of interactions: {total_interactions}")

Total number of interactions: 2840964


## Features removal

Some features are removed immediately for computational efficiency:
* *3di_letter*  and *3di_state* are discarded for both source and target, since here we explore the alternative use of 3di centroid coordinates

Others are kept for the next steps and deleted afterwards:
* insertion code for both source and target residues are excluded as they don't carry much bio-physically meaningful information;
* PDB IDs;
* chain IDs which we replace with the *same-chain* variable.

In [None]:
# Discard the 3Di state/letter columns of both residues by rebinding the frame
combined_df = combined_df.drop(
    columns=['s_3di_letter', 't_3di_letter', 's_3di_state', 't_3di_state']
)

Categorical variables are cast to the pandas `category` dtype:

In [None]:
categorical_cols = ['s_ch', 't_ch', 's_resn', 't_resn', 's_ins', 't_ins', 's_ss8', 't_ss8']

# Cast the listed columns to the pandas 'category' dtype in a single astype call
combined_df = combined_df.astype({col: 'category' for col in categorical_cols})

In [None]:
# Summary statistics for every column, numeric and non-numeric alike
combined_df.describe(include='all')

Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_phi,s_psi,s_a1,s_a2,s_a3,s_a4,s_a5,t_ch,t_resi,t_ins,t_resn,t_ss8,t_rsa,t_phi,t_psi,t_a1,t_a2,t_a3,t_a4,t_a5,Interaction,same_chain,delta_rsa,delta_atchley_1,delta_atchley_2,delta_atchley_3,delta_atchley_4,delta_atchley_5,ca_distance,s_centroid_x,s_centroid_y,t_centroid_x,t_centroid_y
count,2840964,2840964,2840964.0,2840964.0,2840964,2840964,2840964.0,2840964.0,2840964.0,2840964.0,2840964.0,2840964.0,2840964.0,2840964.0,2840964,2840964.0,2840964.0,2840964,2840964,2840964.0,2840964.0,2840964.0,2840964.0,2840964.0,2840964.0,2840964.0,2840964.0,2840964,2840964.0,2840964.0,2840964.0,2840964.0,2840964.0,2840964.0,2840964.0,2840964.0,2840964.0,2840964.0,2840964.0,2840964.0
unique,3883,33,,12.0,20,9,,,,,,,,,33,,12.0,20,9,,,,,,,,,8,,,,,,,,,,,,
top,5ej8,A,,,L,H,,,,,,,,,A,,,L,H,,,,,,,,,MISSING,,,,,,,,,,,,
freq,15539,1700310,,2840744.0,261251,1183036,,,,,,,,,1628688,,2840726.0,267782,1159176,,,,,,,,,1038001,,,,,,,,,,,,
mean,,,208.4359,,,,0.1706522,-1.388896,0.5895814,0.02157048,-0.09003262,-0.06647259,0.2838041,-0.05991481,,243.2722,,,,0.1821581,-1.369148,0.586884,0.03097279,-0.129919,0.02861175,0.2755556,0.04168819,,0.9597985,0.1599919,1.021123,1.03704,2.341992,0.9438467,1.896307,6.363266,0.07002994,-0.9661942,0.8203936,-0.8949053
std,,,247.6614,,,,0.2002871,0.7584639,1.570682,1.01045,0.9640995,2.163947,0.8836356,1.647287,,252.9831,,,,0.2095774,0.8028762,1.518786,1.02698,0.9374948,2.11274,0.8952989,1.626628,,0.1964315,0.1627011,0.8177226,0.8067394,1.896016,0.7597054,1.463754,1.54997,1.445245,1.052048,1.441751,1.099345
min,,,-24.0,,,,0.0,-3.141,-3.142,-1.343,-1.524,-4.76,-2.128,-3.242,,-27.0,,,,0.0,-3.141,-3.142,-1.343,-1.524,-4.76,-2.128,-3.242,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.571041,-2.881375,-2.165999,-2.881375,-2.165999
25%,,,66.0,,,,0.007,-1.863,-0.728,-1.006,-0.987,-1.505,-0.277,-1.262,,97.0,,,,0.009,-1.856,-0.713,-1.006,-0.987,-1.505,-0.277,-0.912,,1.0,0.026,0.352,0.397,0.772,0.328,0.715,5.279086,-1.106118,-2.006822,-0.1356317,-2.006822
50%,,,143.0,,,,0.085,-1.216,-0.293,-0.228,-0.279,0.533,0.393,-0.146,,181.0,,,,0.095,-1.246,-0.183,-0.228,-0.279,0.533,0.393,-0.146,,1.0,0.107,0.791,0.885,2.063,0.802,1.674,5.983212,0.4948263,-1.303724,0.778631,-1.255422
75%,,,266.0,,,,0.287,-1.077,2.302,0.945,0.465,1.502,1.045,1.313,,307.0,,,,0.303,-1.086,2.29,0.945,0.326,1.502,1.045,1.313,,1.0,0.25,1.571,1.454,3.636,1.441,2.786,6.789931,0.778631,-0.3599843,2.149514,-0.1502629


Double check feature types:

In [None]:
# List the dtype of each column
print(combined_df.dtypes)

pdb_id               object
s_ch               category
s_resi                int64
s_ins              category
s_resn             category
s_ss8              category
s_rsa               float64
s_phi               float64
s_psi               float64
s_a1                float64
s_a2                float64
s_a3                float64
s_a4                float64
s_a5                float64
t_ch               category
t_resi                int64
t_ins              category
t_resn             category
t_ss8              category
t_rsa               float64
t_phi               float64
t_psi               float64
t_a1                float64
t_a2                float64
t_a3                float64
t_a4                float64
t_a5                float64
Interaction        category
same_chain            int64
delta_rsa           float64
delta_atchley_1     float64
delta_atchley_2     float64
delta_atchley_3     float64
delta_atchley_4     float64
delta_atchley_5     float64
ca_distance         

##  Missing/unclassified interactions
Unclassified interactions are removed.

In [None]:
# Keep only the rows whose interaction label is not the 'MISSING' placeholder
combined_df = combined_df.loc[combined_df['Interaction'].ne('MISSING')]

# Prune category levels that no longer occur in the column
combined_df['Interaction'] = combined_df['Interaction'].cat.remove_unused_categories()

In [None]:
# Report the current dataset dimensions
print("Resulting dataset shape:", combined_df.shape)

Resulting dataset shape: (1802963, 40)


## Multiple interactions

 As we know, it is possible for residues to have more than one interaction type, thus the dataset might contain multiple observations referring to the same residue pair, each identifying a different interaction type between the two.

 To explore the extent of these multiple interactions, we count the duplicate observations based on a subset of features that is sufficient to uniquely identify each residue pair: the PDB ID, residue numbers, residue insertion codes and chain IDs.

In [None]:
# Columns used together as the key of a residue pair;
# the displayed shape gives the number of distinct key combinations
id_cols = ['pdb_id', 's_resi', 't_resi', 's_ch', 't_ch', 's_ins', 't_ins']
combined_df[id_cols].drop_duplicates().shape

(1393382, 7)

In [None]:
# One row per key combination, holding the list of interaction labels recorded for it
# (sort=False keeps first-appearance order; observed=True skips unused category combinations)
grouped_df = combined_df.groupby(id_cols, sort=False, observed=True)['Interaction'].apply(list).reset_index()

In [None]:
# Preview the grouped residue pairs
grouped_df.head()

Unnamed: 0,pdb_id,s_resi,t_resi,s_ch,t_ch,s_ins,t_ins,Interaction
0,1b0y,28,31,A,A,,,[VDW]
1,1b0y,27,31,A,A,,,[HBOND]
2,1b0y,47,84,A,A,,,[HBOND]
3,1b0y,40,45,A,A,,,[HBOND]
4,1b0y,37,40,A,A,,,"[HBOND, VDW]"


In [None]:
# Rows (one per key combination) and columns of the grouped frame
grouped_df.shape

(1393382, 8)

# Train/Test Split
An 80/20 training/test split is performed. Subsequently, another 80/20 split of the training portion is performed to extract a validation set.

Each split is performed following two criteria:
1. Interactions in the same PDB structure (same *pdb_id*) are maintained together in the same set, preventing structures dispersion;
2. Each split is carried out stratifying the sets based on the interaction type labels, i.e. the original interaction proportions are maintained in all sets.

In [None]:
# One row per pdb_id, labelled with the most frequent Interaction in that structure.
# Note: the split is stratified on this per-structure label only, so interaction
# types that are rarely the dominant one are balanced indirectly at best.
pdb_summary = (
    combined_df
    .groupby('pdb_id')['Interaction']
    .agg(lambda labels: labels.mode().iloc[0])
    .reset_index()
)

# Split the structures (not the rows) 80/20, stratified by dominant interaction
train_pdbs, test_pdbs = train_test_split(
    pdb_summary['pdb_id'],
    stratify=pdb_summary['Interaction'],
    test_size=0.2,
    random_state=42,
)

# Assign every row to the side its structure landed on
train_df_temp = combined_df.loc[lambda frame: frame['pdb_id'].isin(train_pdbs)]  # temporary df
test_df = combined_df.loc[lambda frame: frame['pdb_id'].isin(test_pdbs)]

In [None]:
# Rebuild the per-structure summary on the temporary training set:
# one row per pdb_id, labelled with its most frequent Interaction
pdb_summary = (
    train_df_temp
    .groupby('pdb_id')['Interaction']
    .agg(lambda labels: labels.mode().iloc[0])
    .reset_index()
)

# Split the remaining structures 80/20 into training and validation,
# again stratified by dominant interaction
train_pdbs, val_pdbs = train_test_split(
    pdb_summary['pdb_id'],
    stratify=pdb_summary['Interaction'],
    test_size=0.2,
    random_state=42,
)

# Assign every row of the temporary training set to the side its structure landed on
train_df = train_df_temp.loc[lambda frame: frame['pdb_id'].isin(train_pdbs)]
val_df = train_df_temp.loc[lambda frame: frame['pdb_id'].isin(val_pdbs)]

The procedure ultimately yields a 64/16/20 training/validation/test split:

In [None]:
# Row/column counts of the three splits
print("Shape of Train dataset:", train_df.shape)
print("Shape of Test dataset:", test_df.shape)
print("Shape of Validation dataset:", val_df.shape)

Shape of Train dataset: (1159045, 40)
Shape of Test dataset: (351816, 40)
Shape of Validation dataset: (292102, 40)


In [None]:
# Fraction of all rows that ended up in train / validation / test, in that order
n_total = len(combined_df)
df_split = []
for df in (train_df, val_df, test_df):
    df_split.append(round(len(df) / n_total, 2))
print("Split sizes: {}".format(df_split))

Split sizes: [0.64, 0.16, 0.2]


Double check stratification:

In [None]:
# Per-class row totals over the three splits (the Series are aligned on the class label)
total_interaction_counts = train_df['Interaction'].value_counts() + test_df['Interaction'].value_counts() + val_df['Interaction'].value_counts()

# Each printed value is the share of a class's rows that landed in the given split
# (not the class proportion inside the split), so the three values of a class sum to 1
train_interaction_counts = train_df['Interaction'].value_counts() / total_interaction_counts
print("Train Set -", train_interaction_counts, "\n")
val_interaction_counts = val_df['Interaction'].value_counts() / total_interaction_counts
print("Validation set -", val_interaction_counts, "\n")
test_interaction_counts = test_df['Interaction'].value_counts() / total_interaction_counts
print("Test set -", test_interaction_counts)

Train Set - Interaction
HBOND        0.643552
IONIC        0.651961
PICATION     0.634659
PIHBOND      0.650849
PIPISTACK    0.633239
SSBOND       0.676500
VDW          0.641907
Name: count, dtype: float64 

Validation set - Interaction
HBOND        0.161999
IONIC        0.159067
PICATION     0.159644
PIHBOND      0.144112
PIPISTACK    0.163563
SSBOND       0.177000
VDW          0.162121
Name: count, dtype: float64 

Test set - Interaction
HBOND        0.194450
IONIC        0.188972
PICATION     0.205697
PIHBOND      0.205038
PIPISTACK    0.203198
SSBOND       0.146500
VDW          0.195972
Name: count, dtype: float64


# Data augmentation and encoding


# Training set

Features and target are separated for next steps:

In [None]:
# Split the training frame into the label column and everything else
y_train = train_df['Interaction']  # target
X_train = train_df.drop(columns='Interaction')  # features

## SMOTE NC



To augment the data and mitigate underfitting of under-represented labels, we apply SMOTE-NC, a variant of SMOTE that properly handles categorical variables. We do so before OHE for two reasons:
1. SMOTE-NC requires the categorical variables in their original form, not one-hot encoded;
2. The dataset is smaller in size as no dummy variables are created yet.

It is applied on the training set only and augmented proportions are manually set based on the original frequencies of each class.

In [None]:
# Per-class row counts in the training split
interaction_counts_train = train_df['Interaction'].value_counts()
print(interaction_counts_train)

Interaction
HBOND        652035
VDW          453810
PIPISTACK     23326
IONIC         22063
PICATION       5347
SSBOND         1353
PIHBOND        1111
Name: count, dtype: int64


Based on these counts we can define manually a sampling strategy dictionary:

In [None]:
# Target number of samples per class after oversampling:
#   count >= 100000          -> class is left out (not augmented)
#   10000 <= count < 100000  -> 5x the original count
#   count < 10000            -> 10x the original count
sampling_strategy = {
    label: count * (5 if count >= 10000 else 10)
    for label, count in interaction_counts_train.items()
    if count < 100000
}

In [None]:
# Separate the pdb_id from the features: it is re-attached after resampling
pdb_ids = X_train['pdb_id'].reset_index(drop=True)
X = X_train.drop(columns=['pdb_id']).reset_index(drop=True)

# Retrieve updated categorical columns (and their positions, as required by SMOTENC);
# every other column is handled by SMOTENC as a continuous feature
categorical_cols = X.select_dtypes(include='category').columns.tolist()
categorical_indices = [X.columns.get_loc(col) for col in categorical_cols]
numeric_cols = [col for col in X.columns if col not in categorical_cols]

# NOTE(review): all columns in numeric_cols are interpolated between neighbours, so
# synthetic rows can carry fractional values for integer-like columns such as
# s_resi / t_resi (and possibly same_chain) -- confirm this is intended.
# NOTE(review): any identifier-like categorical column still present in X (chain IDs,
# insertion codes) also takes part in the neighbour search -- confirm this is intended.

# Run SMOTE
smote_nc = SMOTENC(
    categorical_features=categorical_indices,
    sampling_strategy=sampling_strategy,
    random_state=42
)

# Resample training set
X_train_SMOTE, y_train_SMOTE = smote_nc.fit_resample(X.values, y_train.values)

# Determine how many new samples were generated
n_original = len(X)
n_resampled = len(X_train_SMOTE)
n_new = n_resampled - n_original

# Construct new pdb_id for synthetic samples
# For example, keep original pdb_ids, and generate synthetic pdb_ids like 'synthetic_0', 'synthetic_1', ...
new_pdb_ids = pd.Series([f"synthetic_{i}" for i in range(n_new)])

# Combine old and new pdb_ids
# (assumes the resampled array lists the original rows first, followed by the synthetic ones)
pdb_ids_resampled = pd.concat([pdb_ids, new_pdb_ids], ignore_index=True)

# Build the final dataframe
y_train_SMOTE = pd.Series(y_train_SMOTE, name='Interaction').astype('category')
# X.values mixes categorical and numeric columns and is therefore an object array, so
# the frame rebuilt from the resampled array may come back with object-typed columns:
# cast the numeric ones back to float explicitly instead of relying on later inference
X_train_SMOTE = (
    pd.DataFrame(X_train_SMOTE, columns=X.columns)
    .astype({col: 'float64' for col in numeric_cols})
)
X_train_SMOTE = pd.concat([X_train_SMOTE, y_train_SMOTE], axis=1)
X_train_SMOTE['pdb_id'] = pdb_ids_resampled

# Reorder columns to have pdb_id first
cols = ['pdb_id'] + [col for col in X_train.columns if col not in ['pdb_id', 'Interaction']] + ['Interaction']
X_train_SMOTE = X_train_SMOTE[cols]

In [None]:
# Inspect the last rows of the resampled frame
X_train_SMOTE.tail()

Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_phi,s_psi,s_a1,s_a2,s_a3,s_a4,s_a5,t_ch,t_resi,t_ins,t_resn,t_ss8,t_rsa,t_phi,t_psi,t_a1,t_a2,t_a3,t_a4,t_a5,same_chain,delta_rsa,delta_atchley_1,delta_atchley_2,delta_atchley_3,delta_atchley_4,delta_atchley_5,ca_distance,s_centroid_x,s_centroid_y,t_centroid_x,t_centroid_y,Interaction
1410895,synthetic_251850,A,44.0,,C,E,0.286285,-1.841577,2.246911,-1.343,0.465,-0.862,-1.02,-0.255,A,50.790291,,C,E,0.0,-2.168915,2.178674,-1.343,0.465,-0.862,-1.02,-0.255,1.0,0.286285,0.0,0.0,0.0,0.0,0.0,3.949307,-1.076392,-0.462708,2.492444,0.81338,SSBOND
1410896,synthetic_251851,A,1614.797371,,C,P,0.112415,-2.153707,0.488697,-1.343,0.465,-0.862,-1.02,-0.255,A,1628.168049,,C,E,0.216362,-2.016101,2.232445,-1.343,0.465,-0.862,-1.02,-0.255,1.0,0.103947,0.0,0.0,0.0,0.0,0.0,5.619668,-0.490302,-0.928267,1.024432,0.53436,SSBOND
1410897,synthetic_251852,X,173.0,,C,E,0.0,-2.028557,2.190991,-1.343,0.465,-0.862,-1.02,-0.255,L,180.0,,C,E,0.010308,-2.332413,2.726598,-1.343,0.465,-0.862,-1.02,-0.255,1.0,0.010308,0.0,0.0,0.0,0.0,0.0,3.866001,2.139425,0.048612,2.917936,1.143728,SSBOND
1410898,synthetic_251853,J,176.0,,C,E,0.0,-2.165016,2.76973,-1.343,0.465,-0.862,-1.02,-0.255,N,182.0,,C,E,0.0,-2.478185,2.637215,-1.343,0.465,-0.862,-1.02,-0.255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.834432,2.139425,0.048612,2.917936,1.143728,SSBOND
1410899,synthetic_251854,A,31.951792,,C,E,0.146714,-2.745984,2.699681,-1.343,0.465,-0.862,-1.02,-0.255,A,73.048208,,C,P,0.158608,-1.447605,2.34985,-1.343,0.465,-0.862,-1.02,-0.255,1.0,0.097323,0.0,0.0,0.0,0.0,0.0,5.167603,-1.050515,-0.052663,0.634809,-0.080429,SSBOND


The augmented training set has size:

In [None]:
# Size of the augmented training set and number of rows added relative to X_train
print("Augmented set size:", X_train_SMOTE.shape)
print("Number of added observations:", abs(X_train_SMOTE.shape[0] - X_train.shape[0]))

Augmented set size: (1410900, 40)
Number of added observations: 251855


with the following labels frequencies:

In [None]:
# Label frequencies after resampling
print(Counter(y_train_SMOTE))

Counter({'HBOND': 652035, 'VDW': 453810, 'PIPISTACK': 116630, 'IONIC': 110315, 'PICATION': 53470, 'SSBOND': 13530, 'PIHBOND': 11110})


## MultiLabel Binarizer

In the **Multiple interactions** section we explored the presence of residue pairs that map into multiple interactions of different type. In the current datasets, these pairs constitute an observation for each interaction type, effectively over-crowding the dataframe.

The MultiLabel Binarizer is used to handle this issue by merging such observations into a single one containing all the occurring interactions as dummy variables (0/1).

In [None]:
# Every column except the label is part of the grouping key
feature_cols = X_train_SMOTE.columns.drop('Interaction').tolist()

# Rows with identical feature values are merged into one row whose label is the set
# of interaction types seen for it (first-appearance order, observed categories only)
grouped_train_df = (
    X_train_SMOTE
    .groupby(feature_cols, sort=False, observed=True)['Interaction']
    .apply(set)
    .reset_index()
)

# Reconstruct X and y
# After reset_index the key columns come first and 'Interaction' is the last column
X_train_MLB = grouped_train_df[feature_cols]
y_train_grouped = grouped_train_df['Interaction'].tolist()

# Turn the label sets into a binary indicator matrix, one column per class
mlb = MultiLabelBinarizer()
y_train_MLB = mlb.fit_transform(y_train_grouped)

In [None]:
# Rows and columns of the grouped training frame
grouped_train_df.shape

(1147498, 40)

In [None]:
# Preview the grouped rows; the 'Interaction' column now holds a set of labels
grouped_train_df.head()

Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_phi,s_psi,s_a1,s_a2,s_a3,s_a4,s_a5,t_ch,t_resi,t_ins,t_resn,t_ss8,t_rsa,t_phi,t_psi,t_a1,t_a2,t_a3,t_a4,t_a5,same_chain,delta_rsa,delta_atchley_1,delta_atchley_2,delta_atchley_3,delta_atchley_4,delta_atchley_5,ca_distance,s_centroid_x,s_centroid_y,t_centroid_x,t_centroid_y,Interaction
0,1bs9,A,121.0,,A,H,0.0,-1.203,-0.763,-0.591,-1.302,-0.733,1.57,-0.146,A,124.0,,M,H,0.128,-1.56,-0.151,-0.663,-1.524,2.219,-1.005,1.212,1.0,0.128,0.072,0.222,2.952,2.575,1.358,5.185897,2.305979,-1.498816,2.149514,-0.802992,{HBOND}
1,1bs9,A,201.0,,F,H,0.142,-1.109,-0.835,-1.006,-0.59,1.891,-0.397,0.412,A,204.0,,S,H,0.646,-1.137,-0.508,-0.228,1.399,-4.76,0.67,-2.647,1.0,0.504,0.778,1.989,6.651,1.067,3.059,5.103776,2.305979,-1.498816,0.778631,-2.165999,"{VDW, HBOND}"
2,1bs9,A,190.0,,Y,H,0.0,-1.222,-0.397,0.26,0.83,3.097,-0.838,1.512,A,194.0,,Y,H,0.081,-2.08,0.042,0.26,0.83,3.097,-0.838,1.512,1.0,0.081,0.0,0.0,0.0,0.0,0.0,5.500035,-0.135632,-1.891373,-0.135632,-1.891373,"{PIPISTACK, HBOND}"
3,1bs9,A,94.0,,E,H,0.0,-0.986,-0.806,1.357,-1.453,1.477,0.113,-0.837,A,105.0,,D,B,0.012,-2.537,1.32,1.05,0.302,-3.656,-0.259,-3.242,1.0,0.012,0.307,1.755,5.133,0.372,2.405,8.81394,-1.106118,-1.339661,-1.106118,-1.339661,"{VDW, HBOND}"
4,1bs9,A,69.0,,A,H,0.066,-1.051,-0.818,-0.591,-1.302,-0.733,1.57,-0.146,A,73.0,,N,H,0.248,-1.136,-0.707,0.945,0.828,1.299,-0.169,0.933,1.0,0.182,1.536,2.13,2.032,1.739,1.079,5.717558,-1.106118,-1.339661,-1.106118,-1.339661,{HBOND}


In [None]:
# Difference in row count between the resampled frame and the grouped feature frame
print("Observations merged: ", X_train_SMOTE.shape[0] - X_train_MLB.shape[0])

Observations merged:  263402


Column names are assigned to the MLB numpy array output:

In [None]:
# Wrap the binarized label matrix in a DataFrame, one column per class in mlb.classes_
y_train_final = pd.DataFrame(y_train_MLB, columns=mlb.classes_)
y_train_final.head()

Unnamed: 0,HBOND,IONIC,PICATION,PIHBOND,PIPISTACK,SSBOND,VDW
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,1
2,1,0,0,0,1,0,0
3,1,0,0,0,0,0,1
4,1,0,0,0,0,0,0


In [None]:
# Remove the identifier columns from the feature frame. The frame is rebuilt from
# grouped_train_df (label column removed as well) instead of dropping in place on the
# iloc slice: the cell can then be re-run without a KeyError and no slice of
# grouped_train_df is mutated.
X_train_MLB = grouped_train_df.drop(
    columns=['Interaction', 'pdb_id', 's_ch', 't_ch', 's_ins', 't_ins']
)  # s_resi, t_resi are kept as features

In [None]:
# Categorical features that remain after the identifier columns were dropped
categorical_cols = ['s_resn', 't_resn', 's_ss8', 't_ss8']

## One-Hot Encoding

We proceed with the One-Hot Encoding (OHE) of the categorical variables in the dataset.

In [None]:
# Dense one-hot encoder; categories unseen at fit time are encoded as all zeros
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Fit on the training categorical columns and collect the generated column names
X_train_OHE = encoder.fit_transform(X_train_MLB[categorical_cols])
encoded_cols = encoder.get_feature_names_out(categorical_cols)

# Replace the original categorical columns with their one-hot counterparts
X_train_final = pd.concat(
    [
        X_train_MLB.drop(columns=categorical_cols).reset_index(drop=True),
        pd.DataFrame(X_train_OHE, columns=encoded_cols),
    ],
    axis=1,
)

In [None]:
# Persist the fitted encoder to the project folder
joblib.dump(encoder, f'{folder_path}/Structural Bioinfo PROJECT/onehot_encoder.pkl')

['/content/drive/MyDrive/Structural Bioinfo PROJECT/onehot_encoder.pkl']

In [None]:
# Dimensions and preview of the encoded training features
print("Shape of Train dataset:", X_train_final.shape)
X_train_final.head()

Shape of Train dataset: (1147498, 88)


Unnamed: 0,s_resi,s_rsa,s_phi,s_psi,s_a1,s_a2,s_a3,s_a4,s_a5,t_resi,t_rsa,t_phi,t_psi,t_a1,t_a2,t_a3,t_a4,t_a5,same_chain,delta_rsa,delta_atchley_1,delta_atchley_2,delta_atchley_3,delta_atchley_4,delta_atchley_5,ca_distance,s_centroid_x,s_centroid_y,t_centroid_x,t_centroid_y,s_resn_A,s_resn_C,s_resn_D,s_resn_E,s_resn_F,s_resn_G,s_resn_H,s_resn_I,s_resn_K,s_resn_L,s_resn_M,s_resn_N,s_resn_P,s_resn_Q,s_resn_R,s_resn_S,s_resn_T,s_resn_V,s_resn_W,s_resn_Y,t_resn_A,t_resn_C,t_resn_D,t_resn_E,t_resn_F,t_resn_G,t_resn_H,t_resn_I,t_resn_K,t_resn_L,t_resn_M,t_resn_N,t_resn_P,t_resn_Q,t_resn_R,t_resn_S,t_resn_T,t_resn_V,t_resn_W,t_resn_Y,s_ss8_-,s_ss8_B,s_ss8_E,s_ss8_G,s_ss8_H,s_ss8_I,s_ss8_P,s_ss8_S,s_ss8_T,t_ss8_-,t_ss8_B,t_ss8_E,t_ss8_G,t_ss8_H,t_ss8_I,t_ss8_P,t_ss8_S,t_ss8_T
0,121.0,0.0,-1.203,-0.763,-0.591,-1.302,-0.733,1.57,-0.146,124.0,0.128,-1.56,-0.151,-0.663,-1.524,2.219,-1.005,1.212,1.0,0.128,0.072,0.222,2.952,2.575,1.358,5.185897,2.305979,-1.498816,2.149514,-0.802992,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,201.0,0.142,-1.109,-0.835,-1.006,-0.59,1.891,-0.397,0.412,204.0,0.646,-1.137,-0.508,-0.228,1.399,-4.76,0.67,-2.647,1.0,0.504,0.778,1.989,6.651,1.067,3.059,5.103776,2.305979,-1.498816,0.778631,-2.165999,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,190.0,0.0,-1.222,-0.397,0.26,0.83,3.097,-0.838,1.512,194.0,0.081,-2.08,0.042,0.26,0.83,3.097,-0.838,1.512,1.0,0.081,0.0,0.0,0.0,0.0,0.0,5.500035,-0.135632,-1.891373,-0.135632,-1.891373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,94.0,0.0,-0.986,-0.806,1.357,-1.453,1.477,0.113,-0.837,105.0,0.012,-2.537,1.32,1.05,0.302,-3.656,-0.259,-3.242,1.0,0.012,0.307,1.755,5.133,0.372,2.405,8.81394,-1.106118,-1.339661,-1.106118,-1.339661,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,69.0,0.066,-1.051,-0.818,-0.591,-1.302,-0.733,1.57,-0.146,73.0,0.248,-1.136,-0.707,0.945,0.828,1.299,-0.169,0.933,1.0,0.182,1.536,2.13,2.032,1.739,1.079,5.717558,-1.106118,-1.339661,-1.106118,-1.339661,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


The statistics of the resulting dataframe are assessed:

In [None]:
# Summary statistics of the encoded training feature matrix
X_train_final.describe(include='all')

Unnamed: 0,s_resi,s_rsa,s_phi,s_psi,s_a1,s_a2,s_a3,s_a4,s_a5,t_resi,t_rsa,t_phi,t_psi,t_a1,t_a2,t_a3,t_a4,t_a5,same_chain,delta_rsa,delta_atchley_1,delta_atchley_2,delta_atchley_3,delta_atchley_4,delta_atchley_5,ca_distance,s_centroid_x,s_centroid_y,t_centroid_x,t_centroid_y,s_resn_A,s_resn_C,s_resn_D,s_resn_E,s_resn_F,s_resn_G,s_resn_H,s_resn_I,s_resn_K,s_resn_L,s_resn_M,s_resn_N,s_resn_P,s_resn_Q,s_resn_R,s_resn_S,s_resn_T,s_resn_V,s_resn_W,s_resn_Y,t_resn_A,t_resn_C,t_resn_D,t_resn_E,t_resn_F,t_resn_G,t_resn_H,t_resn_I,t_resn_K,t_resn_L,t_resn_M,t_resn_N,t_resn_P,t_resn_Q,t_resn_R,t_resn_S,t_resn_T,t_resn_V,t_resn_W,t_resn_Y,s_ss8_-,s_ss8_B,s_ss8_E,s_ss8_G,s_ss8_H,s_ss8_I,s_ss8_P,s_ss8_S,s_ss8_T,t_ss8_-,t_ss8_B,t_ss8_E,t_ss8_G,t_ss8_H,t_ss8_I,t_ss8_P,t_ss8_S,t_ss8_T
count,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0,1147498.0
mean,198.1975,0.1711245,-1.394678,0.5722613,0.1324133,-0.1035014,0.09314794,0.08973232,0.05455873,234.3407,0.1852072,-1.378113,0.5601196,0.1550668,-0.1440923,0.1772425,0.08539873,0.1698009,0.9571274,0.152249,0.9773058,0.9549016,2.223847,0.8877055,1.959536,6.608649,0.1256237,-0.9965846,0.785277,-0.9362168,0.05941013,0.02217956,0.07144326,0.07397311,0.07333085,0.04043319,0.04249419,0.04327502,0.06451427,0.06823716,0.01587802,0.03852556,0.02339873,0.0311896,0.08358359,0.04592339,0.04185454,0.05074954,0.03844625,0.07116004,0.06324107,0.02171943,0.06010729,0.07274871,0.07174653,0.04560444,0.04565672,0.04173776,0.07181973,0.0696925,0.01700918,0.03794691,0.01052028,0.03545627,0.09579537,0.04078351,0.03905715,0.04862841,0.03921575,0.07151298,0.1462887,0.01115296,0.2345965,0.03374821,0.4370971,0.006832256,0.01409501,0.05890293,0.05728637,0.124585,0.01039217,0.2305468,0.03581096,0.4297001,0.008243152,0.0118876,0.06343715,0.0853971
std,221.1298,0.1917489,0.7225683,1.529876,1.02683,0.8949891,2.133928,0.9127528,1.657422,226.0629,0.2020374,0.7672229,1.471443,1.042264,0.8649798,2.074001,0.9248208,1.640239,0.1975345,0.150693,0.7997462,0.7533611,1.868336,0.7391516,1.550439,1.694625,1.380263,1.023583,1.39448,1.063133,0.2363908,0.1472673,0.2575639,0.2617272,0.2606789,0.196973,0.2017138,0.2034756,0.2456669,0.2521525,0.1250037,0.1924613,0.1511663,0.1738299,0.2767624,0.2093191,0.2002568,0.2194859,0.1922711,0.2570921,0.2433962,0.145766,0.2376856,0.2597237,0.2580679,0.2086258,0.2087396,0.1999894,0.2581893,0.2546282,0.1293054,0.191068,0.1020275,0.1849301,0.2943105,0.1977884,0.1937311,0.21509,0.194108,0.2576799,0.3533956,0.1050171,0.4237465,0.1805804,0.4960277,0.08237464,0.1178828,0.235443,0.2323891,0.3302479,0.101411,0.421183,0.1858186,0.4950335,0.09041686,0.1083803,0.2437477,0.2794718
min,-24.0,0.0,-3.141,-3.142,-1.343,-1.524,-4.76,-2.128,-3.242,-27.0,0.0,-3.141,-3.142,-1.343,-1.524,-4.76,-2.128,-3.242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.571041,-2.881375,-2.165999,-2.881375,-2.165999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,64.0,0.012,-1.841,-0.723,-0.7850346,-0.59,-1.505,-0.397,-0.912,96.0,0.015,-1.837,-0.705,-0.6717497,-0.59,-1.505,-0.397,-0.912,1.0,0.03,0.3338908,0.368,0.736,0.327,0.7099988,5.415731,-1.106118,-2.006822,-0.1356317,-2.006822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,139.0,0.096,-1.225525,-0.247,-0.032,-0.179,0.672,0.113,-0.078,179.0,0.111,-1.265,-0.1603372,-0.032,-0.2545879,0.7468615,0.113,-0.00619081,1.0,0.106,0.781,0.863,1.874,0.699,1.685,6.126561,0.4948263,-1.303724,0.778631,-1.255422,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,257.0,0.283,-1.081,2.261,1.05,0.326,1.502,0.908,1.313,301.0,0.3029953,-1.090104,2.241,1.05,0.326,1.502,0.908,1.512,1.0,0.2328783,1.522,1.389,3.564,1.305,2.97052,7.254723,0.778631,-0.3599843,2.139425,-0.1502629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
max,3192.0,1.0,3.141,3.142,1.831,2.081,3.097,1.57,2.897,3197.0,1.0,3.141,3.142,1.831,2.081,3.097,1.57,2.897,1.0,1.0,3.174,3.605,7.857,3.698,6.139,16.95366,3.202521,1.735641,3.202521,1.735641,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Saving

In [None]:
# Write the final training features and targets side by side into a single TSV file.
save_df_as_tsv(
    df=pd.concat([X_train_final, y_train_final], axis=1),
    folder=tsv_path,
    filename='train_set_new_smote',
)

Saved TSV to: /content/drive/MyDrive/Structural Bioinfo PROJECT/datasets/train_set_new_smote.tsv


# Test set

## MultiLabel Binarizer

In [None]:
# Collapse rows that share identical feature values into a single observation whose
# target is the set of every interaction type recorded for it (multi-label target).
# NOTE(review): groupby drops rows whose key columns contain NaN by default
# (dropna=True) — confirm the feature columns hold no missing values at this point.
grouped_test_df = (
    test_df
    .groupby(feature_cols, sort=False, observed=True)['Interaction']
    .apply(set)
    .reset_index()
)

# Split features and target by column name rather than by position.
# .drop() returns an independent frame, so later modifications of X_test_MLB
# cannot raise SettingWithCopyWarning against grouped_test_df.
X_test_MLB = grouped_test_df.drop(columns=['Interaction'])
y_test_grouped = grouped_test_df['Interaction'].tolist()

# Binarize with a pinned label set (the seven interaction types shown in the
# y_test_final preview, in the same alphabetical order the default fit produces).
# Pinning keeps the target columns identical across splits even if a rare class
# is absent from this one; a label outside the list triggers a sklearn warning
# and is ignored instead of silently adding a column.
mlb = MultiLabelBinarizer(
    classes=['HBOND', 'IONIC', 'PICATION', 'PIHBOND', 'PIPISTACK', 'SSBOND', 'VDW']
)
y_test_MLB = mlb.fit_transform(y_test_grouped)

In [None]:
# Dimensions of the ungrouped test split (for comparison with the grouped frame below).
test_df.shape

(351816, 40)

In [None]:
# Dimensions after rows with identical feature values were merged into one observation.
grouped_test_df.shape

(272070, 40)

In [None]:
# Number of rows that disappeared because they were merged into another observation.
print("Observations merged: ", len(test_df) - len(grouped_test_df))

Observations merged:  79746


In [None]:
# Wrap the binarized target matrix in a DataFrame, one column per label known to the binarizer.
y_test_final = pd.DataFrame(data=y_test_MLB, columns=mlb.classes_)
y_test_final.head(5)

Unnamed: 0,HBOND,IONIC,PICATION,PIHBOND,PIPISTACK,SSBOND,VDW
0,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0
4,1,0,0,0,0,0,1


In [None]:
# Remove the identifier-like columns that served only as grouping keys.
# Reassigning (instead of inplace=True) avoids SettingWithCopyWarning on a frame
# that was sliced out of grouped_test_df, and errors='ignore' keeps the cell
# re-runnable once the columns are already gone.
X_test_MLB = X_test_MLB.drop(
    columns=['pdb_id', 's_ch', 't_ch', 's_ins', 't_ins'],
    errors='ignore',
)

## One-Hot Encoding

In [None]:
# The encoder is only applied here (transform, never fit), so the test split
# does not influence the learned categories.
X_test_OHE = encoder.transform(X_test_MLB[categorical_cols])

# Replace the raw categorical columns with their one-hot encoded counterparts.
X_test_final = pd.concat(
    [
        X_test_MLB.drop(columns=categorical_cols).reset_index(drop=True),
        pd.DataFrame(X_test_OHE, columns=encoded_cols),
    ],
    axis=1,
)

In [None]:
# Report the dimensions of the fully encoded test feature matrix.
print("Shape of Test dataset:", X_test_final.shape)

Shape of Test dataset: (272070, 88)


## Saving

In [None]:
# Write the test features and their binarized targets side by side into a single TSV file.
save_df_as_tsv(
    df=pd.concat([X_test_final, y_test_final], axis=1),
    folder=tsv_path,
    filename='test_set_new',
)


Saved TSV to: /content/drive/MyDrive/Structural Bioinfo PROJECT/datasets/test_set_new.tsv


# Validation set

## MultiLabel Binarizer

In [None]:
# Collapse rows that share identical feature values into a single observation whose
# target is the set of every interaction type recorded for it (multi-label target).
# NOTE(review): groupby drops rows whose key columns contain NaN by default
# (dropna=True) — confirm the feature columns hold no missing values at this point.
grouped_val_df = (
    val_df
    .groupby(feature_cols, sort=False, observed=True)['Interaction']
    .apply(set)
    .reset_index()
)

# Split features and target by column name rather than by position.
# .drop() returns an independent frame, so later modifications of X_val_MLB
# cannot raise SettingWithCopyWarning against grouped_val_df.
X_val_MLB = grouped_val_df.drop(columns=['Interaction'])
y_val_grouped = grouped_val_df['Interaction'].tolist()

# Binarize with a pinned label set (the seven interaction types shown in the
# y_val_final preview, in the same alphabetical order the default fit produces).
# Pinning keeps the target columns identical across splits even if a rare class
# is absent from this one; a label outside the list triggers a sklearn warning
# and is ignored instead of silently adding a column.
mlb = MultiLabelBinarizer(
    classes=['HBOND', 'IONIC', 'PICATION', 'PIHBOND', 'PIPISTACK', 'SSBOND', 'VDW']
)
y_val_MLB = mlb.fit_transform(y_val_grouped)

In [None]:
# Dimensions of the ungrouped validation split (for comparison with the grouped frame below).
val_df.shape

(292102, 40)

In [None]:
# Dimensions after rows with identical feature values were merged into one observation.
grouped_val_df.shape

(225669, 40)

In [None]:
# Number of rows that disappeared because they were merged into another observation.
print("Observations merged: ", len(val_df) - len(grouped_val_df))

Observations merged:  66433


In [None]:
# Wrap the binarized target matrix in a DataFrame, one column per label known to the binarizer.
y_val_final = pd.DataFrame(data=y_val_MLB, columns=mlb.classes_)
y_val_final.head(5)

Unnamed: 0,HBOND,IONIC,PICATION,PIHBOND,PIPISTACK,SSBOND,VDW
0,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0


In [None]:
# Remove the identifier-like columns that served only as grouping keys.
# Reassigning (instead of inplace=True) avoids SettingWithCopyWarning on a frame
# that was sliced out of grouped_val_df, and errors='ignore' keeps the cell
# re-runnable once the columns are already gone.
X_val_MLB = X_val_MLB.drop(
    columns=['pdb_id', 's_ch', 't_ch', 's_ins', 't_ins'],
    errors='ignore',
)

## One-Hot Encoding

In [None]:
# The encoder is only applied here (transform, never fit), so the validation split
# does not influence the learned categories.
X_val_OHE = encoder.transform(X_val_MLB[categorical_cols])

# Replace the raw categorical columns with their one-hot encoded counterparts.
X_val_final = pd.concat(
    [
        X_val_MLB.drop(columns=categorical_cols).reset_index(drop=True),
        pd.DataFrame(X_val_OHE, columns=encoded_cols),
    ],
    axis=1,
)

In [None]:
# Report the dimensions of the fully encoded validation feature matrix.
print("Shape of Validation dataset:", X_val_final.shape)

Shape of Validation dataset: (225669, 88)


## Saving

In [None]:
# Write the validation features and their binarized targets side by side into a single TSV file.
save_df_as_tsv(
    df=pd.concat([X_val_final, y_val_final], axis=1),
    folder=tsv_path,
    filename='val_set_new',
)


Saved TSV to: /content/drive/MyDrive/Structural Bioinfo PROJECT/datasets/val_set_new.tsv
