In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor

In [4]:
df = pd.read_excel("/content/21_Fish_market_regression.xlsx")

In [7]:
# Check for missing values
df.isnull().sum()# Missing values can cause errors in model training. We check them first

Unnamed: 0,0
Species,0
Weight,0
Length1,0
Length2,0
Length3,0
Height,0
Width,0


#**Feature Engineering**

In [8]:
# Create Volume and Density
# Volume ≈ Length1 × Length2 × Length3
# Density ≈ Weight / Volume
df['Volume'] = df['Length1'] * df['Length2'] * df['Length3']
df['Density'] = df['Weight'] / df['Volume']


## Volume aggregates size info, Density accounts for species differences.


In [14]:
# Apply log transformation to Weight and size features
# Log transform helps normalize skewed distributions and linearize relationships.
for col in ['Weight', 'Length1', 'Length2', 'Length3', 'Height', 'Width', 'Volume']:
    df[f'log_{col}'] = np.log1p(df[col])  # log1p handles zero values safely - due to no error

df

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width,Volume,Density,log_Weight,log_Length1,log_Length2,log_Length3,log_Height,log_Width,log_Volume
0,Bream,242.0,23.2,25.4,30.0,11.5200,4.0200,17678.400,0.013689,5.493061,3.186353,3.273364,3.433987,2.527327,1.613430,9.780155
1,Bream,290.0,24.0,26.3,31.2,12.4800,4.3056,19693.440,0.014726,5.673323,3.218876,3.306887,3.471966,2.601207,1.668763,9.888092
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961,19697.185,0.017261,5.831882,3.214868,3.314186,3.468856,2.593597,1.739782,9.888282
3,Bream,363.0,26.3,29.0,33.5,12.7300,4.4555,25550.450,0.014207,5.897154,3.306887,3.401197,3.540959,2.619583,1.696624,10.148449
4,Bream,430.0,26.5,29.0,34.0,12.4440,5.1340,26129.000,0.016457,6.066108,3.314186,3.401197,3.555348,2.598533,1.813847,10.170839
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154,Smelt,12.2,11.5,12.2,13.4,2.0904,1.3936,1880.020,0.006489,2.580217,2.525729,2.580217,2.667228,1.128301,0.872799,7.539569
155,Smelt,13.4,11.7,12.4,13.5,2.4300,1.2690,1958.580,0.006842,2.667228,2.541602,2.595255,2.674149,1.232560,0.819339,7.580485
156,Smelt,12.2,12.1,13.0,13.8,2.2770,1.2558,2170.740,0.005620,2.580217,2.572612,2.639057,2.694627,1.186928,0.813505,7.683284
157,Smelt,19.7,13.2,14.3,15.2,2.8728,2.0672,2869.152,0.006866,3.030134,2.653242,2.727853,2.785011,1.353978,1.120765,7.962120


## Seperate the dataset into X and y

In [15]:
target = 'log_Weight'  # prediction of log-transformed Weight
features = [col for col in df.columns if col not in ['Species', 'Weight', target]]

X = df[features]
y = df[target]

The Species column did not consider because it is categorical. Firstly, we have to encode them, and after, we can add into X.

In [13]:
X

Unnamed: 0,Length1,Length2,Length3,Height,Width,Volume,Density,log_Length1,log_Length2,log_Length3,log_Height,log_Width,log_Volume
0,23.2,25.4,30.0,11.5200,4.0200,17678.400,0.013689,3.186353,3.273364,3.433987,2.527327,1.613430,9.780155
1,24.0,26.3,31.2,12.4800,4.3056,19693.440,0.014726,3.218876,3.306887,3.471966,2.601207,1.668763,9.888092
2,23.9,26.5,31.1,12.3778,4.6961,19697.185,0.017261,3.214868,3.314186,3.468856,2.593597,1.739782,9.888282
3,26.3,29.0,33.5,12.7300,4.4555,25550.450,0.014207,3.306887,3.401197,3.540959,2.619583,1.696624,10.148449
4,26.5,29.0,34.0,12.4440,5.1340,26129.000,0.016457,3.314186,3.401197,3.555348,2.598533,1.813847,10.170839
...,...,...,...,...,...,...,...,...,...,...,...,...,...
154,11.5,12.2,13.4,2.0904,1.3936,1880.020,0.006489,2.525729,2.580217,2.667228,1.128301,0.872799,7.539569
155,11.7,12.4,13.5,2.4300,1.2690,1958.580,0.006842,2.541602,2.595255,2.674149,1.232560,0.819339,7.580485
156,12.1,13.0,13.8,2.2770,1.2558,2170.740,0.005620,2.572612,2.639057,2.694627,1.186928,0.813505,7.683284
157,13.2,14.3,15.2,2.8728,2.0672,2869.152,0.006866,2.653242,2.727853,2.785011,1.353978,1.120765,7.962120


In [16]:
y

Unnamed: 0,log_Weight
0,5.493061
1,5.673323
2,5.831882
3,5.897154
4,6.066108
...,...
154,2.580217
155,2.667228
156,2.580217
157,3.030134


In [17]:
# Encode Species using OneHotEncoder (fix: use sparse_output=False)
encoder = OneHotEncoder(sparse_output=False)
species_encoded = encoder.fit_transform(df[['Species']])
species_cols = encoder.get_feature_names_out(['Species'])


In [19]:
# Add encoded species to X
X_encoded = pd.concat([pd.DataFrame(species_encoded, columns=species_cols), X.reset_index(drop=True)], axis=1)
X_encoded

Unnamed: 0,Species_Bream,Species_Parkki,Species_Perch,Species_Pike,Species_Roach,Species_Smelt,Species_Whitefish,Length1,Length2,Length3,Height,Width,Volume,Density,log_Length1,log_Length2,log_Length3,log_Height,log_Width,log_Volume
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,23.2,25.4,30.0,11.5200,4.0200,17678.400,0.013689,3.186353,3.273364,3.433987,2.527327,1.613430,9.780155
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,26.3,31.2,12.4800,4.3056,19693.440,0.014726,3.218876,3.306887,3.471966,2.601207,1.668763,9.888092
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,23.9,26.5,31.1,12.3778,4.6961,19697.185,0.017261,3.214868,3.314186,3.468856,2.593597,1.739782,9.888282
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,26.3,29.0,33.5,12.7300,4.4555,25550.450,0.014207,3.306887,3.401197,3.540959,2.619583,1.696624,10.148449
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,26.5,29.0,34.0,12.4440,5.1340,26129.000,0.016457,3.314186,3.401197,3.555348,2.598533,1.813847,10.170839
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154,0.0,0.0,0.0,0.0,0.0,1.0,0.0,11.5,12.2,13.4,2.0904,1.3936,1880.020,0.006489,2.525729,2.580217,2.667228,1.128301,0.872799,7.539569
155,0.0,0.0,0.0,0.0,0.0,1.0,0.0,11.7,12.4,13.5,2.4300,1.2690,1958.580,0.006842,2.541602,2.595255,2.674149,1.232560,0.819339,7.580485
156,0.0,0.0,0.0,0.0,0.0,1.0,0.0,12.1,13.0,13.8,2.2770,1.2558,2170.740,0.005620,2.572612,2.639057,2.694627,1.186928,0.813505,7.683284
157,0.0,0.0,0.0,0.0,0.0,1.0,0.0,13.2,14.3,15.2,2.8728,2.0672,2869.152,0.006866,2.653242,2.727853,2.785011,1.353978,1.120765,7.962120


In [21]:
# Scale numerical features using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

In [22]:
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [23]:
X_train.shape

(127, 20)

In [24]:
X_test.shape

(32, 20)

#**Random Forrest**

In [27]:
# Random Forest with GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

rf = RandomForestRegressor(random_state=42)
cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(estimator=rf,
                           param_grid=param_grid,
                           cv=cv,
                           scoring='r2',
                           n_jobs=-1)

In [29]:
# Fit model
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}

In [30]:
grid_search.best_score_

np.float64(0.9259853597803206)

approximatly 0.925 is quite good score. The model can pretty good understand the data

In [31]:
# Predict on test set
y_pred = grid_search.best_estimator_.predict(X_test)

In [32]:
# Create DataFrame with true vs predicted
results_df = pd.DataFrame({
    'True Values': y_test.values,
    'RF Predictions': y_pred
})

In [34]:
results_df

Unnamed: 0,True Values,RF Predictions
0,4.369448,4.420988
1,2.667228,2.520085
2,5.303305,5.394687
3,5.602119,5.633193
4,5.01728,4.930921
5,6.908755,6.75328
6,2.079442,2.22408
7,5.198497,5.374215
8,5.241747,5.273262
9,7.131699,7.226163


#**XGBOOST**

In [35]:
from xgboost import XGBRegressor

In [39]:
xg_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3]
}

xg = XGBRegressor(random_state=42, objective="reg:squarederror")
xg_cv = KFold(n_splits=5, shuffle=True, random_state=42)

xg_grid_search = GridSearchCV(estimator=xg,
                           param_grid=xg_param_grid,
                           cv=xg_cv,
                           scoring='r2',
                           n_jobs=-1)

In [40]:
# Fit model
xg_grid_search.fit(X_train, y_train)
xg_grid_search.best_params_

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}

In [41]:
xg_grid_search.best_score_

np.float64(0.9223200195764949)

0.922 is quite good result. The model can pretty good understand the data.(Same with Random Forrest)

In [42]:
# Predict on test set
y_pred_xg = xg_grid_search.best_estimator_.predict(X_test)

In [43]:
# Create DataFrame with true vs predicted
xg_results_df = pd.DataFrame({
    'True Values': y_test.values,
    'RF Predictions': y_pred_xg
})

In [44]:
xg_results_df

Unnamed: 0,True Values,RF Predictions
0,4.369448,4.408215
1,2.667228,2.55428
2,5.303305,5.661238
3,5.602119,5.676934
4,5.01728,4.924502
5,6.908755,6.778795
6,2.079442,2.178085
7,5.198497,5.295804
8,5.241747,5.262827
9,7.131699,6.855211


#**CatBoost**

In [45]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [46]:
from catboost import CatBoostRegressor

In [53]:
cb_param_grid = {
    'iterations': [100, 200, 300],
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1]
}

catboost = CatBoostRegressor(random_state=42)
cb_cv = KFold(n_splits=5, shuffle=True, random_state=42)

cb_grid_search = GridSearchCV(estimator=catboost,
                              param_grid=cb_param_grid,
                              cv=cb_cv,
                              scoring='r2',
                              n_jobs=-1)


In [54]:
cb_grid_search.fit(X_train, y_train)
cb_grid_search.best_params_

0:	learn: 1.2692333	total: 48.6ms	remaining: 14.5s
1:	learn: 1.2261024	total: 50.4ms	remaining: 7.52s
2:	learn: 1.1866766	total: 52.1ms	remaining: 5.16s
3:	learn: 1.1484745	total: 53.8ms	remaining: 3.98s
4:	learn: 1.1101752	total: 55.4ms	remaining: 3.27s
5:	learn: 1.0732558	total: 55.9ms	remaining: 2.74s
6:	learn: 1.0386710	total: 57.6ms	remaining: 2.41s
7:	learn: 1.0082773	total: 59.3ms	remaining: 2.16s
8:	learn: 0.9754368	total: 60.9ms	remaining: 1.97s
9:	learn: 0.9449714	total: 62.5ms	remaining: 1.81s
10:	learn: 0.9182082	total: 64.1ms	remaining: 1.68s
11:	learn: 0.8942790	total: 65.8ms	remaining: 1.58s
12:	learn: 0.8668606	total: 67.4ms	remaining: 1.49s
13:	learn: 0.8435338	total: 69ms	remaining: 1.41s
14:	learn: 0.8187948	total: 70.6ms	remaining: 1.34s
15:	learn: 0.7960819	total: 72.2ms	remaining: 1.28s
16:	learn: 0.7729741	total: 73.9ms	remaining: 1.23s
17:	learn: 0.7475525	total: 75.6ms	remaining: 1.18s
18:	learn: 0.7273835	total: 77.3ms	remaining: 1.14s
19:	learn: 0.7073341	tot

{'depth': 6, 'iterations': 300, 'learning_rate': 0.05}

In [55]:
cb_grid_search.best_score_

np.float64(0.9420986757191944)

approximately 0.942 is good. Better thsn Random forrest and XGBoost

In [62]:
y_pred_cb = cb_grid_search.best_estimator_.predict(X_test)

cb_results_df = pd.DataFrame({
    'True Values': y_test.values,
    'CatBoost Predictions': y_pred_cb
})

cb_results_df


Unnamed: 0,True Values,CatBoost Predictions
0,4.369448,4.507632
1,2.667228,2.560097
2,5.303305,5.411221
3,5.602119,5.61636
4,5.01728,5.07306
5,6.908755,6.813361
6,2.079442,2.171003
7,5.198497,5.29561
8,5.241747,5.229392
9,7.131699,7.171411


#**LightGBM**

In [57]:
from lightgbm import LGBMRegressor

In [59]:
lgbm_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [-1, 10, 20],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [31, 50, 100]
}

lgbm = LGBMRegressor(random_state=42)
# KFold cross-validation
lgbm_cv = KFold(n_splits=5, shuffle=True, random_state=42)

lgbm_grid_search = GridSearchCV(estimator=lgbm,
                            param_grid=lgbm_param_grid,
                            cv=lgbm_cv,
                            scoring='r2',
                            n_jobs=-1)

lgbm_grid_search.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000065 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 541
[LightGBM] [Info] Number of data points in the train set: 127, number of used features: 15
[LightGBM] [Info] Start training from score 5.387348


In [60]:
lgbm_grid_search.best_params_

{'learning_rate': 0.1, 'max_depth': -1, 'n_estimators': 300, 'num_leaves': 31}

In [61]:
lgbm_grid_search.best_score_

np.float64(0.8939851375831047)

In [64]:

y_pred_lgbm = lgbm_grid_search.best_estimator_.predict(X_test)
lgbm_results_df = pd.DataFrame({
    'True Values': y_test.values,
    'LightGBM Predictions': y_pred_lgbm
})
lgbm_results_df.head()



Unnamed: 0,True Values,LightGBM Predictions
0,4.369448,4.105391
1,2.667228,2.465908
2,5.303305,4.733946
3,5.602119,5.676005
4,5.01728,5.094005


In [116]:
# gather the predictions for each models
combined_results = pd.DataFrame({
    'True Values': y_test.values,
    'RF Predictions': y_pred,
    'XGB Predictions': y_pred_xg,
    'CatBoost Predictions': y_pred_cb,
    'LightGBM Predictions': y_pred_lgbm
})

combined_results.head(10)


Unnamed: 0,True Values,RF Predictions,XGB Predictions,CatBoost Predictions,LightGBM Predictions
0,4.369448,4.420988,4.408215,4.507632,4.105391
1,2.667228,2.520085,2.55428,2.560097,2.465908
2,5.303305,5.394687,5.661238,5.411221,4.733946
3,5.602119,5.633193,5.676934,5.61636,5.676005
4,5.01728,4.930921,4.924502,5.07306,5.094005
5,6.908755,6.75328,6.778795,6.813361,6.871351
6,2.079442,2.22408,2.178085,2.171003,2.465908
7,5.198497,5.374215,5.295804,5.29561,5.368026
8,5.241747,5.273262,5.262827,5.229392,5.259517
9,7.131699,7.226163,6.855211,7.171411,7.035076


In [66]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [68]:
# Compute performance metrics for each model
metrics = []
models = ['RandomForest', 'XGBoost', 'CatBoost', 'LightGBM']
predictions = [y_pred, y_pred_xg, y_pred_cb, y_pred_lgbm]

for model, preds in zip(models, predictions):
    r2 = r2_score(y_test, preds)
    mse = mean_squared_error(y_test, preds)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, preds)
    metrics.append({'Model': model, 'R2': r2, 'MSE': mse, 'RMSE': rmse, 'MAE': mae})

In [69]:
metrics_df = pd.DataFrame(metrics)

metrics_df

Unnamed: 0,Model,R2,MSE,RMSE,MAE
0,RandomForest,0.992907,0.016665,0.129094,0.103503
1,XGBoost,0.993127,0.016147,0.127071,0.092578
2,CatBoost,0.996377,0.008511,0.092257,0.076122
3,LightGBM,0.984864,0.035561,0.188575,0.138372


In [81]:
import plotly.express as px

In [88]:
!pip install git+https://github.com/plotly/Kaleido.git

Collecting git+https://github.com/plotly/Kaleido.git
  Cloning https://github.com/plotly/Kaleido.git to /tmp/pip-req-build-qj14xjvd
  Running command git clone --filter=blob:none --quiet https://github.com/plotly/Kaleido.git /tmp/pip-req-build-qj14xjvd
  Resolved https://github.com/plotly/Kaleido.git to commit d93391ebabd78c7afb2ef09458b62408a4734101
[31mERROR: git+https://github.com/plotly/Kaleido.git does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0m

In [91]:
fig_rf = px.scatter(x=y_test, y=y_pred,
                    labels={'x': 'Real Values', 'y': 'Predictions'},
                    title='Parity Plot - RandomForest')
fig_rf.add_shape(type='line', x0=min(y_test), y0=min(y_test),
                 x1=max(y_test), y1=max(y_test),
                 line=dict(color='red', dash='dash'))
fig_rf.write_json('parity_rf.json')

In [92]:
fig_rf.show()

In [93]:
fig_gx = px.scatter(x=y_test, y=y_pred_xg,
                    labels={'x': 'Real Values', 'y': 'Predictions'},
                    title='Parity Plot - XGBOOST')
fig_gx.add_shape(type='line', x0=min(y_test), y0=min(y_test),
                 x1=max(y_test), y1=max(y_test),
                 line=dict(color='red', dash='dash'))
fig_gx.write_json('parity_gx.json')

In [94]:
fig_gx.show()

In [95]:
fig_cb = px.scatter(x=y_test, y=y_pred_cb,
                    labels={'x': 'Real Values', 'y': 'Predictions'},
                    title='Parity Plot - CatBoost')
fig_cb.add_shape(type='line', x0=min(y_test), y0=min(y_test),
                 x1=max(y_test), y1=max(y_test),
                 line=dict(color='red', dash='dash'))
fig_cb.write_json('parity_cb.json')

In [96]:
fig_cb.show()

In [97]:
fig_lgbm = px.scatter(x=y_test, y=y_pred_lgbm,
                    labels={'x': 'Real Values', 'y': 'Predictions'},
                    title='Parity Plot - LightGBM')
fig_lgbm.add_shape(type='line', x0=min(y_test), y0=min(y_test),
                 x1=max(y_test), y1=max(y_test),
                 line=dict(color='red', dash='dash'))
fig_lgbm.write_json('parity_lgbm.json')

In [98]:
fig_lgbm.show()

In [99]:
# Barchart
top3 = metrics_df.sort_values(by='R2', ascending=False).head(3)
fig_bar = px.bar(top3, x='Model', y='R2',
                 title='R2 values of the best tree models',
                 labels={'Model': 'Model', 'R2': 'R2 Score'},
                 text='R2')
fig_bar.write_json('bar_chart.json')
fig_bar.show()

In [101]:
!pip install dash==2.17.1

Collecting dash==2.17.1
  Downloading dash-2.17.1-py3-none-any.whl.metadata (10 kB)
Collecting Flask<3.1,>=1.0.4 (from dash==2.17.1)
  Downloading flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting Werkzeug<3.1 (from dash==2.17.1)
  Downloading werkzeug-3.0.6-py3-none-any.whl.metadata (3.7 kB)
Collecting dash-html-components==2.0.0 (from dash==2.17.1)
  Downloading dash_html_components-2.0.0-py3-none-any.whl.metadata (3.8 kB)
Collecting dash-core-components==2.0.0 (from dash==2.17.1)
  Downloading dash_core_components-2.0.0-py3-none-any.whl.metadata (2.9 kB)
Collecting dash-table==5.0.0 (from dash==2.17.1)
  Downloading dash_table-5.0.0-py3-none-any.whl.metadata (2.4 kB)
Collecting retrying (from dash==2.17.1)
  Downloading retrying-1.4.2-py3-none-any.whl.metadata (5.5 kB)
Downloading dash-2.17.1-py3-none-any.whl (7.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dash_core_components-2.0

In [112]:
feature_names = list(X_encoded.columns)  # Encoding sonrası kolon isimleri
X_scaled_df = pd.DataFrame(X_scaled, columns=feature_names)
preprocessed_df = X_scaled_df.copy()
preprocessed_df['log_Weight'] = y.values
preprocessed_df['Weight'] = df['Weight'].values

In [128]:
selected_model = ['CatBoost', 'XGB', 'RF'] # Adjusted to match actual column name prefixes

for i in selected_model:
  # Use 'i' (individual model name) to construct the column name
  forecast_df = combined_results[['True Values', f'{i} Predictions']].copy()
  forecast_df['Absolute Error'] = abs(forecast_df['True Values'] - forecast_df[f'{i} Predictions'])
  forecast_df = forecast_df.sort_values(by='Absolute Error', ascending=False)

forecast_df.head(10)

Unnamed: 0,True Values,RF Predictions,Absolute Error
18,5.420535,5.118482,0.302053
22,6.216606,5.954162,0.262444
13,5.01728,4.774452,0.242828
30,6.23637,6.0098,0.22657
28,2.04122,2.225972,0.184751
7,5.198497,5.374215,0.175718
5,6.908755,6.75328,0.155475
17,5.389072,5.536866,0.147795
1,2.667228,2.520085,0.147144
6,2.079442,2.22408,0.144638


In [129]:
best_params = {
    'RandomForest': {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2},
    'XGBoost': {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1},
    'CatBoost': {'iterations': 200, 'depth': 6, 'learning_rate': 0.1},
    'LightGBM': {'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.05, 'num_leaves': 50}
}

In [134]:
import dash
from dash import dcc, html, dash_table
from dash.dependencies import Input, Output
import plotly.express as px
import plotly.io as pio

app = dash.Dash(__name__)

In [136]:
# Layout
app.layout = html.Div([
    html.H1("Regression Dashboard – Fish Market Dataset"),
    html.Div("Fehime Capar | 605437"),


    html.Div([
        html.Label("Model Selection:"),
        dcc.RadioItems(
            id='model-selection',
            options=[
                {'label': 'CatBoost', 'value': 'CatBoost'},
                {'label': 'XGBoost', 'value': 'XGBoost'},
                {'label': 'RandomForest', 'value': 'RandomForest'}
            ],
            value='CatBoost'
        ),
        html.Label("Metric Selection:"),
        dcc.Dropdown(
            id='metric-selection',
            options=[
                {'label': 'R²', 'value': 'R2'},
                {'label': 'MSE', 'value': 'MSE'},
                {'label': 'RMSE', 'value': 'RMSE'},
                {'label': 'MAE', 'value': 'MAE'}
            ],
            multi=True,
            value=['R2']
        )
    ], style={'margin-bottom': '20px'}),

    # Tabs
    dcc.Tabs([
        dcc.Tab(label='Data Preview', children=[
            html.H3("Preprocessed Data"),
            dash_table.DataTable(id='data-preview', page_size=10)
        ]),
        dcc.Tab(label='Performance Metrics', children=[
            html.H3("Performance Metrics"),
            dash_table.DataTable(id='metrics-table'),
            dcc.Graph(id='metrics-bar')
        ]),
        dcc.Tab(label='Forecasting Results', children=[
            html.H3("Forecasting Results"),
            dash_table.DataTable(id='forecast-table', page_size=10)
        ]),
        dcc.Tab(label='Parity Plot', children=[
            html.H3("Parity Plot"),
            dcc.Graph(id='parity-plot'),
            html.Div(id='hyperparams')
        ])
    ])
])

# Callback
@app.callback(
    [Output('metrics-table', 'data'),
     Output('metrics-bar', 'figure'),
     Output('parity-plot', 'figure'),
     Output('forecast-table', 'data'),
     Output('data-preview', 'data'),
     Output('hyperparams', 'children')],
    [Input('model-selection', 'value'),
     Input('metric-selection', 'value')]
)
def update_dashboard(selected_model, selected_metrics):
  for i in selected_model:
      # Performance metrics table
      filtered_metrics = metrics_df[metrics_df['Model'] == selected_model][selected_metrics]
      metrics_table = filtered_metrics.to_dict('records')

      # Bar plot
      fig_bar = px.bar(x=selected_metrics, y=filtered_metrics.values[0],
                      labels={'x': 'Metric', 'y': 'Value'},
                      title=f'{selected_model} Performance')

      # Parity plot (from JSON)
      fig_parity = pio.read_json(f'parity_{selected_model.lower()}.json')

      # 4. Forecasting table
      forecast_df = combined_results[['True Values', f'{selected_model} Predictions']].copy()
      forecast_df['Absolute Error'] = abs(forecast_df['True Values'] - forecast_df[f'{selected_model} Predictions'])
      forecast_df = forecast_df.sort_values(by='Absolute Error', ascending=False)
      forecast_table = forecast_df.to_dict('records')

      # 5. Data preview
      data_preview = preprocessed_df.head(10).to_dict('records')

      # 6. Hyperparameter
      hyperparams_text = f"The best parameters: {best_params[selected_model]}"

      return metrics_table, fig_bar, fig_parity, forecast_table, data_preview, hyperparams_text

# Run
if __name__ == '__main__':
    app.run_server(debug=True)


AssertionError: The setup method 'errorhandler' can no longer be called on the application. It has already handled its first request, any changes will not be applied consistently.
Make sure all imports, decorators, functions, etc. needed to set up the application are done before running it.