In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#**Data Preporecessing**

In [51]:
data = pd.read_excel("/content/biodeg.xlsx")

In [52]:
data.isnull().sum()

Unnamed: 0,0
SpMax_L,0
nHM,0
F01[N-N],0
nCb,0
C%,0
nO,0
F03[C-N],0
F03[C-O],0
Me,0
Mi,0


In [53]:
# Log transformation for count-based features
count_features = ["nHM", "nO", "nN", "nArNO2", "nCIR"]
for col in count_features:
    data[col] = np.log1p(data[col])  # ln(x+1)

In [5]:
data

Unnamed: 0,SpMax_L,nHM,F01[N-N],nCb,C%,nO,F03[C-N],F03[C-O],Me,Mi,nArNO2,nCIR,B01[C-Br],B03[C-Cl],SpMax_A,nHDon,nN,nArCOOR,nX,Class
0,3.919,0.000000,0,0,31.4,0.000000,0,0,0.960,1.142,0.000000,0.000000,0,0,1.932,0,0.000000,0,0,NRB
1,4.170,0.000000,0,0,30.8,0.693147,0,1,0.989,1.144,0.000000,0.693147,0,0,2.214,0,0.000000,0,0,NRB
2,3.932,0.000000,0,0,26.7,1.609438,0,0,1.009,1.152,0.000000,0.000000,0,0,1.942,1,0.000000,0,0,NRB
3,3.000,0.000000,0,0,20.0,1.098612,0,0,1.108,1.167,0.000000,0.000000,0,0,1.414,1,0.000000,0,0,NRB
4,4.236,0.000000,0,0,29.4,1.609438,0,2,1.004,1.147,0.000000,0.000000,0,0,1.985,0,0.000000,0,0,NRB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1050,5.431,0.000000,0,0,32.1,0.693147,1,2,0.982,1.144,0.000000,0.693147,0,0,2.394,1,0.693147,0,0,RB
1051,5.287,0.000000,0,0,35.3,2.302585,9,21,1.043,1.140,0.000000,0.693147,0,0,2.462,0,1.386294,0,0,RB
1052,4.869,0.000000,1,5,44.4,1.609438,14,9,1.016,1.123,0.693147,1.386294,0,0,2.314,0,1.791759,1,0,RB
1053,5.158,1.098612,0,9,56.1,0.000000,44,0,1.007,1.093,0.000000,4.997212,0,1,2.622,0,2.197225,0,1,RB


#**Feature Engineering**

In [54]:
# N_to_O_Ratio = nN / (nO + 1)
data["N_to_O_Ratio"] = data["nN"] / (data["nO"] + 1)

# Halo_Density = (B01[C-Br] + B03[C-Cl]) / (nHM + 1)
data["Halo_Density"] = (data["B01[C-Br]"] + data["B03[C-Cl]"]) / (data["nHM"] + 1)

# Index_Interaction = SpMax_L * (nN + nO + nArNO2)
data["Index_Interaction"] = data["SpMax_L"] * (data["nN"] + data["nO"] + data["nArNO2"])


In [55]:
data

Unnamed: 0,SpMax_L,nHM,F01[N-N],nCb,C%,nO,F03[C-N],F03[C-O],Me,Mi,...,B03[C-Cl],SpMax_A,nHDon,nN,nArCOOR,nX,Class,N_to_O_Ratio,Halo_Density,Index_Interaction
0,3.919,0.000000,0,0,31.4,0.000000,0,0,0.960,1.142,...,0,1.932,0,0.000000,0,0,NRB,0.000000,0.000000,0.000000
1,4.170,0.000000,0,0,30.8,0.693147,0,1,0.989,1.144,...,0,2.214,0,0.000000,0,0,NRB,0.000000,0.000000,2.890424
2,3.932,0.000000,0,0,26.7,1.609438,0,0,1.009,1.152,...,0,1.942,1,0.000000,0,0,NRB,0.000000,0.000000,6.328310
3,3.000,0.000000,0,0,20.0,1.098612,0,0,1.108,1.167,...,0,1.414,1,0.000000,0,0,NRB,0.000000,0.000000,3.295837
4,4.236,0.000000,0,0,29.4,1.609438,0,2,1.004,1.147,...,0,1.985,0,0.000000,0,0,NRB,0.000000,0.000000,6.817579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1050,5.431,0.000000,0,0,32.1,0.693147,1,2,0.982,1.144,...,0,2.394,1,0.693147,0,0,RB,0.409384,0.000000,7.528965
1051,5.287,0.000000,0,0,35.3,2.302585,9,21,1.043,1.140,...,0,2.462,0,1.386294,0,0,RB,0.419760,0.000000,19.503106
1052,4.869,0.000000,1,5,44.4,1.609438,14,9,1.016,1.123,...,0,2.314,0,1.791759,1,0,RB,0.686646,0.000000,19.935364
1053,5.158,1.098612,0,9,56.1,0.000000,44,0,1.007,1.093,...,1,2.622,0,2.197225,0,1,RB,2.197225,0.476505,11.333284


In [56]:
# Encode the target column because it is categorical
# RB -> 1, NRB -> 0
data["Class"] = data["Class"].map({"RB": 1, "NRB": 0})

# Separate features and target
X = data.drop(columns=["Class"])
y = data["Class"]


In [57]:
y

Unnamed: 0,Class
0,0
1,0
2,0
3,0
4,0
...,...
1050,1
1051,1
1052,1
1053,1


In [88]:
# correlation matrix
corr_matrix = X.corr()

# find the highly correlated pairs (>0.9) Because they can cause overfitting
high_corr_pairs = []
threshold = 0.9
for col in corr_matrix.columns:
    for idx in corr_matrix.index:
        if col != idx and abs(corr_matrix.loc[idx, col]) > threshold:
            high_corr_pairs.append((idx, col, corr_matrix.loc[idx, col]))

# make couples unique
unique_pairs = []
seen = set()
for a, b, corr in high_corr_pairs:
    if (b, a) not in seen:
        unique_pairs.append((a, b, corr))
        seen.add((a, b))

# define the features that should be drop
features_to_drop = list({b for _, b, _ in unique_pairs})

# create a new cleaned dataset
df = data.drop(columns=features_to_drop)

# show the results
print("highly correlated pairs (>|0.9|):")
for pair in unique_pairs:
    print(pair)

print("\ndropped features:", features_to_drop)
print("\noriginal data size:", data.shape)
print("\ncleaned data size:", df.shape)

highly correlated pairs (>|0.9|):
('SpMax_A', 'SpMax_L', np.float64(0.9189280075472835))

dropped features: ['SpMax_L']

original data size: (1055, 23)

cleaned data size: (1055, 22)


In [89]:
from sklearn.preprocessing import LabelEncoder
X = df.drop(columns=["Class"])
y = df["Class"]

# Encode target
le = LabelEncoder()
y_encoded = le.fit_transform(y)


In [90]:
# Scale numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [222]:
# Train-Test Split (80/20)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

In [223]:
X_train.shape

(844, 21)

In [224]:
X_test.shape

(211, 21)

In [14]:
!pip install numpy catboost xgboost lightgbm plotly

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [159]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import plotly.graph_objects as go
import plotly.express as px

In [185]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [186]:
rf_param_grid = {
    "n_estimators": [200, 300],
    "max_depth": [5, 7],
    "min_samples_leaf": [10, 15]
}

cat_param_grid = {
    "iterations": [300, 500],
    "depth": [3, 4],
    "learning_rate": [0.01, 0.03],
    "l2_leaf_reg": [20, 50]
}

lgbm_param_grid = {
    "n_estimators": [150, 200],
    "max_depth": [4, 5],
    "learning_rate": [0.02, 0.03],
    "num_leaves": [31, 63],
    "subsample": [0.8],
    "colsample_bytree": [0.8],
    "reg_lambda": [10, 20],
    "reg_alpha": [5, 10]
}

xgb_param_grid = {
    "n_estimators": [150, 200],
    "max_depth": [4, 5],
    "learning_rate": [0.02, 0.03],
    "subsample": [0.8],
    "colsample_bytree": [0.8],
    "reg_lambda": [10, 20],
    "reg_alpha": [5, 10]
}

In [187]:
# defining of models
rf = RandomForestClassifier(random_state=42, n_jobs=-1, class_weight="balanced")

cat = CatBoostClassifier(random_state=42, loss_function="Logloss", eval_metric="Accuracy", verbose=False)

lgbm = LGBMClassifier(random_state=42, objective="binary", n_jobs=-1)

xgb = XGBClassifier(random_state=42, objective="binary:logistic", eval_metric="logloss", n_jobs=-1)

In [188]:
rf_grid = GridSearchCV(estimator=rf, param_grid=rf_param_grid, cv=cv, scoring="accuracy", n_jobs=-1, verbose=1)

cat_grid = GridSearchCV(estimator=cat, param_grid=cat_param_grid, cv=cv, scoring="accuracy", n_jobs=-1, verbose=1)

lgbm_grid = GridSearchCV(estimator=lgbm, param_grid=lgbm_param_grid, cv=cv, scoring="accuracy", n_jobs=-1, verbose=1)

xgb_grid = GridSearchCV(estimator=xgb, param_grid=xgb_param_grid, cv=cv, scoring="accuracy", n_jobs=-1, verbose=1)

In [189]:
rf_grid.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [190]:
rf_grid.best_params_

{'max_depth': 7, 'min_samples_leaf': 10, 'n_estimators': 300}

In [191]:
rf_grid.best_score_

np.float64(0.8341222879684418)

In [192]:
import xgboost
from xgboost.callback import EarlyStopping

In [217]:
xgb_grid.fit(X_train, y_train, eval_set=[(X_test, y_test)])

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[0]	validation_0-logloss:0.63189
[1]	validation_0-logloss:0.62563
[2]	validation_0-logloss:0.61894
[3]	validation_0-logloss:0.61258
[4]	validation_0-logloss:0.60679
[5]	validation_0-logloss:0.60062
[6]	validation_0-logloss:0.59465
[7]	validation_0-logloss:0.58860
[8]	validation_0-logloss:0.58318
[9]	validation_0-logloss:0.57919
[10]	validation_0-logloss:0.57432
[11]	validation_0-logloss:0.57039
[12]	validation_0-logloss:0.56696
[13]	validation_0-logloss:0.56260
[14]	validation_0-logloss:0.55922
[15]	validation_0-logloss:0.55539
[16]	validation_0-logloss:0.55250
[17]	validation_0-logloss:0.54913
[18]	validation_0-logloss:0.54511
[19]	validation_0-logloss:0.54102
[20]	validation_0-logloss:0.53717
[21]	validation_0-logloss:0.53418
[22]	validation_0-logloss:0.53057
[23]	validation_0-logloss:0.52759
[24]	validation_0-logloss:0.52410
[25]	validation_0-logloss:0.52114
[26]	validation_0-logloss:0.51855
[27]	validation_0-logloss:0.51

In [218]:
xgb_grid.best_params_

{'colsample_bytree': 0.8,
 'learning_rate': 0.03,
 'max_depth': 5,
 'n_estimators': 200,
 'reg_alpha': 5,
 'reg_lambda': 10,
 'subsample': 0.8}

In [219]:
xgb_grid.best_score_

np.float64(0.8483727810650887)

In [200]:
cat_grid.fit(X_train, y_train, eval_set=[(X_test, y_test)])

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [202]:
cat_grid.best_params_

{'depth': 4, 'iterations': 500, 'l2_leaf_reg': 20, 'learning_rate': 0.03}

In [201]:
cat_grid.best_score_

np.float64(0.8637785291631447)

In [205]:
lgbm_grid.fit(X_train, y_train, eval_set=[(X_test, y_test)])

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[LightGBM] [Info] Number of positive: 559, number of negative: 285
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000204 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 951
[LightGBM] [Info] Number of data points in the train set: 844, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.662322 -> initscore=0.673660
[LightGBM] [Info] Start training from score 0.673660


In [206]:
lgbm_grid.best_params_

{'colsample_bytree': 0.8,
 'learning_rate': 0.03,
 'max_depth': 5,
 'n_estimators': 200,
 'num_leaves': 31,
 'reg_alpha': 5,
 'reg_lambda': 10,
 'subsample': 0.8}

In [207]:
lgbm_grid.best_score_

np.float64(0.8400605804451958)

In [208]:
grids = {"RandomForest": rf_grid, "XGBoost": xgb_grid, "CatBoost": cat_grid, "LightGBM": lgbm_grid}

In [209]:
rows = []
for name, grid in grids.items():
    best = grid.best_estimator_
    acc_train = accuracy_score(y_train, best.predict(X_train))
    acc_test = accuracy_score(y_test, best.predict(X_test))
    rows.append({"Model": name, "Train_Accuracy": acc_train, "Test_Accuracy": acc_test})


X does not have valid feature names, but LGBMClassifier was fitted with feature names


X does not have valid feature names, but LGBMClassifier was fitted with feature names



In [210]:
acc_df = pd.DataFrame(rows).set_index("Model").sort_values("Test_Accuracy", ascending=False)
acc_df

Unnamed: 0_level_0,Train_Accuracy,Test_Accuracy
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
CatBoost,0.907583,0.834123
XGBoost,0.88981,0.829384
LightGBM,0.887441,0.824645
RandomForest,0.873223,0.796209


The final results show that all four models achieved a good balance between training and testing performance after applying regularization and tuning. CatBoost delivered the highest overall accuracy, with a training score of about 90.8% and a test score of 83.4%, indicating strong generalization and minimal overfitting. XGBoost and LightGBM followed closely, both reaching training accuracies around 88–89% and test accuracies near 82–83%. These values suggest that the adjustments to learning rate, depth, and regularization successfully reduced the large gaps observed in earlier experiments, where training accuracy was extremely high and test accuracy significantly lower. RandomForest performed slightly worse, with a test accuracy of about 79.6%, which is expected given its simpler structure compared to boosting algorithms. Overall, the models now exhibit a healthy bias-variance trade-off, and the results confirm that boosting methods, particularly CatBoost, are more effective for this dataset when properly tuned.

In [211]:
# Confusion Matrix (Plotly)
def plot_confusion_matrix(cm, labels=("NRB", "RB"), title="Confusion Matrix"):
    fig = go.Figure(data=go.Heatmap(
        z=cm,
        x=labels,
        y=labels,
        colorscale="Blues",
        text=cm,
        texttemplate="%{text}"
    ))
    fig.update_layout(title=title, xaxis_title="Predicted", yaxis_title="True")
    fig.show()

In [212]:
for name, grid in grids.items():
    best = grid.best_estimator_
    y_test_pred = best.predict(X_test)
    cm = confusion_matrix(y_test, y_test_pred)
    plot_confusion_matrix(cm, title=f"{name} - Confusion Matrix")


X does not have valid feature names, but LGBMClassifier was fitted with feature names



In [225]:
pred_cols = {"True": y_test.values}
for name, grid in grids.items():
    pred_cols[name] = grid.best_estimator_.predict(X_test)


X does not have valid feature names, but LGBMClassifier was fitted with feature names



In [226]:
pred_df = pd.DataFrame(pred_cols, index=np.arange(len(y_test)))  # indeksler test gözlemleri sıra numarası
pred_df.head(30)

Unnamed: 0,True,RandomForest,XGBoost,CatBoost,LightGBM
0,0,1,1,1,1
1,0,0,0,0,0
2,1,1,1,1,1
3,1,1,1,1,1
4,0,0,0,0,0
5,1,1,1,1,1
6,1,1,1,1,1
7,1,1,1,1,1
8,0,0,0,0,0
9,0,1,1,1,1


In [228]:
# 6) the best tree models bar plot
top3 = acc_df.head(3).reset_index()
fig = px.bar(top3, x="Model", y="Test_Accuracy", text="Test_Accuracy", title="The Best 3 Models (Test Accuracy)")
fig.update_traces(texttemplate="%{text:.3f}", textposition="outside")
fig.update_yaxes(range=[0,1])
fig.show()

In [229]:
#hyperparameter
best_params_df = pd.DataFrame({
    "Model": list(grids.keys()),
    "Best_Params": [g.best_params_ for g in grids.values()]
})
best_params_df


Unnamed: 0,Model,Best_Params
0,RandomForest,"{'max_depth': 7, 'min_samples_leaf': 10, 'n_es..."
1,XGBoost,"{'colsample_bytree': 0.8, 'learning_rate': 0.0..."
2,CatBoost,"{'depth': 4, 'iterations': 500, 'l2_leaf_reg':..."
3,LightGBM,"{'colsample_bytree': 0.8, 'learning_rate': 0.0..."


#**DASHBOARD**

In [230]:
!pip install jupyter-dash==0.4.2 dash==2.17.1 plotly==5.24.1

Collecting jupyter-dash==0.4.2
  Downloading jupyter_dash-0.4.2-py3-none-any.whl.metadata (3.6 kB)
Collecting dash==2.17.1
  Downloading dash-2.17.1-py3-none-any.whl.metadata (10 kB)
Collecting retrying (from jupyter-dash==0.4.2)
  Downloading retrying-1.4.2-py3-none-any.whl.metadata (5.5 kB)
Collecting ansi2html (from jupyter-dash==0.4.2)
  Downloading ansi2html-1.9.2-py3-none-any.whl.metadata (3.7 kB)
Collecting flask (from jupyter-dash==0.4.2)
  Downloading flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting Werkzeug<3.1 (from dash==2.17.1)
  Downloading werkzeug-3.0.6-py3-none-any.whl.metadata (3.7 kB)
Collecting dash-html-components==2.0.0 (from dash==2.17.1)
  Downloading dash_html_components-2.0.0-py3-none-any.whl.metadata (3.8 kB)
Collecting dash-core-components==2.0.0 (from dash==2.17.1)
  Downloading dash_core_components-2.0.0-py3-none-any.whl.metadata (2.9 kB)
Collecting dash-table==5.0.0 (from dash==2.17.1)
  Downloading dash_table-5.0.0-py3-none-any.whl.metadata (2.4

In [231]:
from jupyter_dash import JupyterDash
from dash import Dash, dcc, html, Input, Output, State
import plotly.express as px

In [232]:
model_options = ["LightGBM", "CatBoost", "XGBoost"]

In [233]:
# start
app = JupyterDash(__name__)


JupyterDash is deprecated, use Dash instead.
See https://dash.plotly.com/dash-in-jupyter for more details.



In [234]:
app.layout = html.Div([
    html.H1("Classification Dashboard - QSAR Biodegradation", style={"textAlign": "center"}),
    html.Div("Fehime Capar | 605437"),

    html.Div([
        html.Label("Select Models to Compare:"),
        dcc.Dropdown(
            id="model-select",
            options=[{"label": m, "value": m} for m in model_options],
            multi=True,
            value=["LightGBM", "CatBoost"]  # default
        )
    ], style={"width": "50%", "margin": "auto"}),

    html.Br(),

    dcc.Tabs(id="tabs", value="tab1", children=[
        dcc.Tab(label="Page 1: Data Overview", value="tab1"),
        dcc.Tab(label="Page 2: Performance Metrics", value="tab2"),
        dcc.Tab(label="Page 3: Forecasting Results", value="tab3"),
        dcc.Tab(label="Page 4: Extra Info", value="tab4")
    ]),

    html.Div(id="tab-content", style={"padding": "20px"})
])

In [235]:
# --- Callback ---
@app.callback(
    Output("tab-content", "children"),
    Input("tabs", "value"),
    Input("model-select", "value")
)
def update_tab(tab, selected_models):
    if tab == "tab1":
        # Page 1: Data Overview
        return html.Div([
            html.H3("Dataset Overview"),
            html.P(f"Rows: {X_train.shape[0] + X_test.shape[0]}, Columns: {X_train.shape[1]}"),
            html.P("Target: Class (RB=1, NRB=0)"),
            html.H4("Feature Names:"),
            html.Ul([html.Li(col) for col in data.drop(columns=['Class']).columns])
        ])

    elif tab == "tab2":
        # Page 2: Performance Metrics
        filtered_acc = acc_df.loc[selected_models]
        fig_bar = px.bar(filtered_acc.reset_index(), x="Model", y="Test_Accuracy",
                         title="Test Accuracy of Selected Models", text="Test_Accuracy")
        fig_bar.update_traces(texttemplate="%{text:.3f}", textposition="outside")
        return html.Div([
            html.H3("Performance Metrics"),
            html.Div([
                html.H4("Accuracy Table"),
                html.Table([
                    html.Tr([html.Th(col) for col in ["Model", "Train_Accuracy", "Test_Accuracy"]])] +
                    [html.Tr([html.Td(m), html.Td(f"{filtered_acc.loc[m,'Train_Accuracy']:.3f}"),
                              html.Td(f"{filtered_acc.loc[m,'Test_Accuracy']:.3f}")]) for m in selected_models]
                ),
            ]),
            html.Br(),
            dcc.Graph(figure=fig_bar)
        ])

    elif tab == "tab3":
        # Page 3: Forecasting Results
        # Yanlış sınıflamaları bul
        df = pred_df.copy()
        df["Wrong_Count"] = sum(df[m] != df["True"] for m in selected_models)
        df_sorted = df.sort_values("Wrong_Count", ascending=False).head(20)
        return html.Div([
            html.H3("Forecasting Results (Top 20 Wrong Predictions)"),
            html.P("Rows sorted by number of wrong classifications across selected models."),
            html.Pre(df_sorted.to_string())
        ])

    elif tab == "tab4":
        # Page 4: Extra Info (Hyperparameters)
        filtered_params = best_params_df[best_params_df["Model"].isin(selected_models)]
        return html.Div([
            html.H3("Best Hyperparameters"),
            html.Pre(filtered_params.to_string())
        ])

In [236]:
# --- Run app ---
app.run_server(mode="inline", port=8051)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>