In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [10]:
# Read the classification spreadsheet into a DataFrame.
# The location is an absolute path; adjust it if the file lives elsewhere.
DATASET_PATH = "/content/Bio_classification.xlsx"
data = pd.read_excel(DATASET_PATH)

In [11]:
# Per-column count of missing values (isna is the same method as isnull).
data.isna().sum()

Unnamed: 0,0
SpMax_L,0
nHM,0
F01[N-N],0
nCb,0
C%,0
nO,0
F03[C-N],0
F03[C-O],0
Me,0
Mi,0


In [12]:
# Compress the count-based descriptors with ln(x + 1).
# The columns are overwritten in place, so running this cell a second time
# applies the transform again on the already-transformed values.
count_features = ["nHM", "nO", "nN", "nArNO2", "nCIR"]
data[count_features] = np.log1p(data[count_features])

In [13]:
# Display the frame to inspect the columns after the log transform.
data

Unnamed: 0,SpMax_L,nHM,F01[N-N],nCb,C%,nO,F03[C-N],F03[C-O],Me,Mi,nArNO2,nCIR,B01[C-Br],B03[C-Cl],SpMax_A,nHDon,nN,nArCOOR,nX,Class
0,3.919,0.000000,0,0,31.4,0.000000,0,0,0.960,1.142,0.000000,0.000000,0,0,1.932,0,0.000000,0,0,NRB
1,4.170,0.000000,0,0,30.8,0.693147,0,1,0.989,1.144,0.000000,0.693147,0,0,2.214,0,0.000000,0,0,NRB
2,3.932,0.000000,0,0,26.7,1.609438,0,0,1.009,1.152,0.000000,0.000000,0,0,1.942,1,0.000000,0,0,NRB
3,3.000,0.000000,0,0,20.0,1.098612,0,0,1.108,1.167,0.000000,0.000000,0,0,1.414,1,0.000000,0,0,NRB
4,4.236,0.000000,0,0,29.4,1.609438,0,2,1.004,1.147,0.000000,0.000000,0,0,1.985,0,0.000000,0,0,NRB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1050,5.431,0.000000,0,0,32.1,0.693147,1,2,0.982,1.144,0.000000,0.693147,0,0,2.394,1,0.693147,0,0,RB
1051,5.287,0.000000,0,0,35.3,2.302585,9,21,1.043,1.140,0.000000,0.693147,0,0,2.462,0,1.386294,0,0,RB
1052,4.869,0.000000,1,5,44.4,1.609438,14,9,1.016,1.123,0.693147,1.386294,0,0,2.314,0,1.791759,1,0,RB
1053,5.158,1.098612,0,9,56.1,0.000000,44,0,1.007,1.093,0.000000,4.997212,0,1,2.622,0,2.197225,0,1,RB


# **Feature Engineering**

In [14]:
# Derived descriptors, each added to `data` as a new column.
# NOTE(review): nN, nO, nHM and nArNO2 are listed in the earlier log1p cell,
# so when the cells run in order these formulas operate on ln(count + 1)
# values rather than raw counts -- confirm that this is intended.
nitrogen = data["nN"]
oxygen = data["nO"]
halogen_bonds = data["B01[C-Br]"] + data["B03[C-Cl]"]
heteroatom_total = nitrogen + oxygen + data["nArNO2"]

# N_to_O_Ratio = nN / (nO + 1); the +1 keeps the denominator non-zero.
data["N_to_O_Ratio"] = nitrogen / (oxygen + 1)

# Halo_Density = (B01[C-Br] + B03[C-Cl]) / (nHM + 1)
data["Halo_Density"] = halogen_bonds / (data["nHM"] + 1)

# Index_Interaction = SpMax_L * (nN + nO + nArNO2)
data["Index_Interaction"] = data["SpMax_L"] * heteroatom_total


In [15]:
# Display the frame again; the three engineered columns appear at the end.
data

Unnamed: 0,SpMax_L,nHM,F01[N-N],nCb,C%,nO,F03[C-N],F03[C-O],Me,Mi,...,B03[C-Cl],SpMax_A,nHDon,nN,nArCOOR,nX,Class,N_to_O_Ratio,Halo_Density,Index_Interaction
0,3.919,0.000000,0,0,31.4,0.000000,0,0,0.960,1.142,...,0,1.932,0,0.000000,0,0,NRB,0.000000,0.000000,0.000000
1,4.170,0.000000,0,0,30.8,0.693147,0,1,0.989,1.144,...,0,2.214,0,0.000000,0,0,NRB,0.000000,0.000000,2.890424
2,3.932,0.000000,0,0,26.7,1.609438,0,0,1.009,1.152,...,0,1.942,1,0.000000,0,0,NRB,0.000000,0.000000,6.328310
3,3.000,0.000000,0,0,20.0,1.098612,0,0,1.108,1.167,...,0,1.414,1,0.000000,0,0,NRB,0.000000,0.000000,3.295837
4,4.236,0.000000,0,0,29.4,1.609438,0,2,1.004,1.147,...,0,1.985,0,0.000000,0,0,NRB,0.000000,0.000000,6.817579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1050,5.431,0.000000,0,0,32.1,0.693147,1,2,0.982,1.144,...,0,2.394,1,0.693147,0,0,RB,0.409384,0.000000,7.528965
1051,5.287,0.000000,0,0,35.3,2.302585,9,21,1.043,1.140,...,0,2.462,0,1.386294,0,0,RB,0.419760,0.000000,19.503106
1052,4.869,0.000000,1,5,44.4,1.609438,14,9,1.016,1.123,...,0,2.314,0,1.791759,1,0,RB,0.686646,0.000000,19.935364
1053,5.158,1.098612,0,9,56.1,0.000000,44,0,1.007,1.093,...,1,2.622,0,2.197225,0,1,RB,2.197225,0.476505,11.333284


In [16]:
# Encode the target column because it is categorical
# RB -> 1, NRB -> 0
CLASS_CODES = {"RB": 1, "NRB": 0}

# Values that are not keys of CLASS_CODES pass through unchanged, so
# re-running this cell on an already-encoded column keeps the 0/1 values
# instead of turning every row into NaN (which a plain dict .map would do).
encoded_class = data["Class"].map(lambda label: CLASS_CODES.get(label, label))

# Fail loudly on anything that is neither a known label nor an existing code
# (including missing values) rather than passing NaN targets downstream.
unexpected_labels = set(encoded_class.unique()) - set(CLASS_CODES.values())
if unexpected_labels:
    raise ValueError(
        f"Unexpected values in 'Class': {sorted(unexpected_labels, key=str)}"
    )

data["Class"] = encoded_class.astype(int)

# Separate features and target
X = data.drop(columns=["Class"])
y = data["Class"]


In [17]:
# Display the encoded target series.
y

Unnamed: 0,Class
0,0
1,0
2,0
3,0
4,0
...,...
1050,1
1051,1
1052,1
1053,1


In [18]:
# Standardise the feature matrix with StandardScaler.
# NOTE(review): the scaler is fitted on every row of X. If X_scaled is later
# split into train/test sets, statistics from the test rows have already
# influenced the scaling; prefer fitting on the training rows only.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


In [19]:
# Train-Test Split (80/20), stratified on the class label.
# The unscaled features are split first and the scaler is fitted on the
# training rows only, so the mean/variance of the held-out rows never
# influence the scaling (fitting on the full matrix before splitting leaks
# test-set statistics into training).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# `scaler` now holds training-set statistics; reuse it for any new data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


In [20]:
# (rows, columns) of the training feature matrix.
X_train.shape

(844, 22)

In [21]:
# (rows, columns) of the held-out feature matrix.
X_test.shape

(211, 22)

In [27]:
# Remove the currently installed numpy and boosting libraries.
# NOTE(review): numpy was already imported by this kernel in the first cell;
# the runtime presumably needs a restart before a reinstalled numpy is
# picked up -- confirm.
!pip uninstall -y numpy catboost xgboost lightgbm

Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
Found existing installation: catboost 1.2.5
Uninstalling catboost-1.2.5:
  Successfully uninstalled catboost-1.2.5
Found existing installation: xgboost 1.7.6
Uninstalling xgboost-1.7.6:
  Successfully uninstalled xgboost-1.7.6
Found existing installation: lightgbm 4.3.0
Uninstalling lightgbm-4.3.0:
  Successfully uninstalled lightgbm-4.3.0


In [28]:
!pip install numpy catboost xgboost lightgbm plotly

Collecting numpy
  Downloading numpy-2.3.5-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/62.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading numpy-2.3.5-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import plotly.graph_objects as go
import plotly.express as px