In [None]:
!pip install catboost




In [None]:
# Step 1: Load and Preprocess Data
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import plotly.graph_objects as go
import os

# Resolve the dataset location and stop immediately if the CSV is absent
data_path = 'kc_house_data.csv'
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Dataset not found! Upload the file: '{data_path}'")

# Read the CSV, keep only the numeric columns, then discard the 'id'/'date'
# columns when they are present (errors='ignore' tolerates their absence)
data = (
    pd.read_csv(data_path)
    .select_dtypes(include=[np.number])
    .drop(columns=['id', 'date'], errors='ignore')
)

# 'price' is the regression target; every remaining column is a feature
X = data.drop(columns=['price'])
y = data['price']

# Hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

# Keep 30% of the training rows (test_size=0.7 is the discarded share) as a
# smaller sample; the discarded parts are bound to `_`
X_train_sample, _, y_train_sample, _ = train_test_split(
    X_train, y_train,
    test_size=0.7,
    random_state=42,
)


In [None]:
# Step 2: Tune Models with GridSearchCV
def tune_model(model, param_grid, X_train_sample, y_train_sample):
    """Grid-search ``param_grid`` for ``model`` and return the winning estimator.

    Args:
        model: Estimator instance handed to ``GridSearchCV``.
        param_grid: Hyperparameter grid handed to ``GridSearchCV``.
        X_train_sample: Feature rows the search is fitted on.
        y_train_sample: Target values matching ``X_train_sample``.

    Returns:
        The ``best_estimator_`` attribute of the fitted search object.
    """
    model_label = model.__class__.__name__
    print(f"Tuning {model_label}...")

    # Search settings: shuffled 5-fold CV with a fixed seed, scored by R^2,
    # progress messages enabled, and all CPU cores in use.
    search_options = dict(
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        scoring='r2',
        verbose=1,
        n_jobs=-1,
    )
    searcher = GridSearchCV(model, param_grid, **search_options)
    searcher.fit(X_train_sample, y_train_sample)

    print(f"Best params for {model_label}: {searcher.best_params_}")
    return searcher.best_estimator_

# Step 2 (continued): model definitions and hyperparameter grids.
# Every estimator gets a fixed seed so that Restart & Run All reproduces the
# same models; LightGBM's per-fit info logging is silenced with verbose=-1.
model_specs = {
    "Random Forest": (RandomForestRegressor(n_jobs=-1, random_state=42),
                      {"n_estimators": [50, 100], "max_depth": [10, None]}),
    "XGBoost": (XGBRegressor(random_state=42),
                {"n_estimators": [50, 100], "max_depth": [5, 10]}),
    "CatBoost": (CatBoostRegressor(verbose=0, random_seed=42),
                 {"iterations": [50, 100], "depth": [6, 10]}),
    "LightGBM": (LGBMRegressor(random_state=42, verbose=-1),
                 {"n_estimators": [50, 100], "num_leaves": [31]}),
}

models = {}
for model_name, (estimator, param_grid) in model_specs.items():
    # Hyperparameters are searched on the smaller sample to keep tuning fast.
    best_model = tune_model(estimator, param_grid, X_train_sample, y_train_sample)
    # tune_model returns an estimator fitted on the sample only. Refit the
    # winning configuration on the complete training split so that the
    # "Train" metrics computed on X_train later describe data the model was
    # actually trained on, and no training rows are left unused.
    best_model.fit(X_train, y_train)
    models[model_name] = best_model


Tuning RandomForestRegressor...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best params for RandomForestRegressor: {'max_depth': None, 'n_estimators': 50}
Tuning XGBRegressor...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best params for XGBRegressor: {'max_depth': 5, 'n_estimators': 100}
Tuning CatBoostRegressor...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best params for CatBoostRegressor: {'depth': 6, 'iterations': 100}
Tuning LGBMRegressor...
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000554 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2238
[LightGBM] [Info] Number of data points in the train set: 5187, number of used features: 18
[LightGBM] [Info] Start training from score 533785.522267
Best params for LGBMRegressor


invalid value encountered in cast



In [None]:
# Step 3: Generate Predictions DataFrame
# Start from the ground-truth prices, then append one column of test-set
# predictions per tuned model (columns follow the order of `models`).
predictions_df = pd.DataFrame({'Real Values': y_test.values})
for model_name, model in models.items():
    predictions_df[model_name] = model.predict(X_test)

print("\nPredictions DataFrame:")
print(predictions_df.head())

# Persist the full table of real and predicted values
predictions_df.to_csv("predictions_output.csv", index=False)

# Visualize Predictions: one line for the real values, one line per model
fig = go.Figure()
real_trace = go.Scatter(y=predictions_df['Real Values'], mode='lines', name='Real Values')
fig.add_trace(real_trace)
for model_name in models.keys():
    model_trace = go.Scatter(
        y=predictions_df[model_name],
        mode='lines',
        name=f'{model_name} Predictions',
    )
    fig.add_trace(model_trace)

fig.update_layout(
    title="Model Predictions vs Real Values",
    xaxis_title="Observations",
    yaxis_title="Values",
)
fig.show()


Predictions DataFrame:
   Real Values  Random Forest       XGBoost      CatBoost      LightGBM
0     365000.0      350524.00  3.948613e+05  3.927081e+05  3.889451e+05
1     865000.0      805793.08  9.367337e+05  8.843614e+05  8.740346e+05
2    1038000.0     1032494.80  1.281987e+06  1.052545e+06  1.073038e+06
3    1490000.0     1950086.60  2.115926e+06  1.996047e+06  1.850377e+06
4     711000.0      701160.00  7.829231e+05  7.193429e+05  7.213338e+05


In [None]:
## Step 5: Output Metrics for Training and Test Sets
metrics = ['R2', 'MSE', 'MAE']
results = []

for model_name, model in models.items():
    # Score each model on the training split first, then on the test split,
    # producing one result row per (model, dataset) pair.
    for dataset, features, target in (('Train', X_train, y_train),
                                      ('Test', X_test, y_test)):
        predicted = model.predict(features)
        results.append([
            model_name,
            dataset,
            r2_score(target, predicted),
            mean_squared_error(target, predicted),
            mean_absolute_error(target, predicted),
        ])

# Tabulate the collected rows
results_df = pd.DataFrame(results, columns=['Model', 'Dataset', 'R2', 'MSE', 'MAE'])
print("\nPerformance Metrics:")
print(results_df)

# Persist the metrics table
results_df.to_csv("model_metrics_output.csv", index=False)

# One bar chart per metric, with Train and Test bars for every model
for metric in metrics:
    fig = go.Figure()
    for dataset in ('Train', 'Test'):
        subset = results_df.loc[results_df['Dataset'] == dataset]
        bars = go.Bar(x=subset['Model'], y=subset[metric], name=f'{dataset} {metric}')
        fig.add_trace(bars)
    fig.update_layout(
        title=f"{metric} Comparison for Training and Test Sets",
        xaxis_title="Model",
        yaxis_title=metric,
    )
    fig.show()



Performance Metrics:
           Model Dataset        R2           MSE           MAE
0  Random Forest   Train  0.882670  1.532901e+10  61530.857460
1  Random Forest    Test  0.831116  2.553132e+10  80432.019224
2        XGBoost   Train  0.896343  1.354259e+10  62235.369961
3        XGBoost    Test  0.836696  2.468783e+10  78418.902757
4       CatBoost   Train  0.895177  1.369499e+10  65693.298160
5       CatBoost    Test  0.870154  1.962963e+10  76454.521507
6       LightGBM   Train  0.884020  1.515263e+10  64336.679488
7       LightGBM    Test  0.857338  2.156710e+10  76387.672939


In [None]:
import plotly.express as px

# Step 6: Create Parity Plots for All Models
for model_name, model in models.items():
    # Pair every actual test price with the model's prediction for it
    y_test_pred = model.predict(X_test)
    parity_df = pd.DataFrame({"Actual": y_test, "Predicted": y_test_pred})

    # Scatter of actual vs predicted values with an OLS trendline overlaid
    scatter_options = dict(
        x="Actual",
        y="Predicted",
        title=f"Parity Plot for {model_name}",
        trendline="ols",
        labels={"Actual": "Actual Values", "Predicted": "Predicted Values"},
    )
    fig_parity = px.scatter(parity_df, **scatter_options)

    # Axis titles and legend visibility
    fig_parity.update_layout(
        xaxis_title="Actual Values",
        yaxis_title="Predicted Values",
        showlegend=True,
    )

    fig_parity.show()


In [None]:
# Step 7: Select Top 3 Models Based on R2 and Plot
# Only the test-set rows take part in the ranking
test_metrics = results_df.loc[results_df['Dataset'] == 'Test']
top_3_models = test_metrics.nlargest(3, 'R2')

# Bar chart of the test R2 of the three selected models
r2_bars = go.Bar(
    x=top_3_models['Model'],
    y=top_3_models['R2'],
    name='R2 Scores',
)
fig_top3 = go.Figure()
fig_top3.add_trace(r2_bars)
fig_top3.update_layout(
    title="Top 3 Models Based on R2",
    xaxis_title="Model",
    yaxis_title="R2 Score",
)
fig_top3.show()

print("\nTop 3 Models Based on R2:")
print(top_3_models)



Top 3 Models Based on R2:
      Model Dataset        R2           MSE           MAE
5  CatBoost    Test  0.870154  1.962963e+10  76454.521507
7  LightGBM    Test  0.857338  2.156710e+10  76387.672939
3   XGBoost    Test  0.836696  2.468783e+10  78418.902757


In [None]:
import ipywidgets as widgets
from IPython.display import display, HTML
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# Interactive dashboard for the three best models.
#
# Everything shown here is derived from objects built by the earlier steps
# (`data`, `models`, `results_df`, `top_3_models`, `X_test`, `y_test`) instead
# of hard-coded numbers or random placeholders, and none of those objects is
# rebound, so the earlier cells remain re-runnable after this one.

# Fail with a clear message when the cell is run on a kernel that has not
# executed the earlier steps.
_required_names = ("data", "models", "results_df", "top_3_models", "X_test", "y_test")
_missing_names = [name for name in _required_names if name not in globals()]
if _missing_names:
    raise NameError(f"Run the earlier steps first; missing objects: {_missing_names}")

# Title of the dashboard
dash_title = "ECE GÖKALP "

# Display the title
display(HTML(f"<h1 style='text-align: center; color: #4CAF50;'>{dash_title}</h1>"))

# Hyperparameters that the grid search in Step 2 explored for each model
TUNED_PARAMS = {
    "Random Forest": ("n_estimators", "max_depth"),
    "XGBoost": ("n_estimators", "max_depth"),
    "CatBoost": ("iterations", "depth"),
    "LightGBM": ("n_estimators", "num_leaves"),
}
METRIC_NAMES = ("R2", "MSE", "MAE")
DATASET_NAMES = ("Train", "Test")
# "Train R2", "Test R2", "Train MSE", ... in the order offered to the user
METRIC_OPTIONS = [
    f"{dataset} {metric}" for metric in METRIC_NAMES for dataset in DATASET_NAMES
]


def collect_model_metrics(model_name):
    """Return ``{"<Dataset> <Metric>": value}`` for one model, read from ``results_df``."""
    rows = results_df[results_df["Model"] == model_name].set_index("Dataset")
    return {
        f"{dataset} {metric}": rows.loc[dataset, metric]
        for metric in METRIC_NAMES
        for dataset in DATASET_NAMES
    }


def collect_hyperparameters(model_name):
    """Return the tuned hyperparameters of the fitted estimator for ``model_name``."""
    fitted_params = models[model_name].get_params()
    return {key: fitted_params.get(key) for key in TUNED_PARAMS.get(model_name, ())}


def build_forecast_results(model_name, top_n=10):
    """Return the ``top_n`` test rows with the largest absolute prediction error."""
    table = pd.DataFrame({
        "True Value": y_test.values,
        "Forecasted Value": models[model_name].predict(X_test),
    })
    table["Absolute Error"] = (table["True Value"] - table["Forecasted Value"]).abs()
    return table.sort_values(by="Absolute Error", ascending=False).head(top_n)


# (metrics, hyperparameters) per top-3 model, best test R2 first. Stored under
# its own name so the fitted estimators in `models` are not overwritten.
dashboard_models = {
    name: (collect_model_metrics(name), collect_hyperparameters(name))
    for name in top_3_models["Model"]
}

# Widgets
model_select = widgets.Select(
    options=list(dashboard_models),
    value=next(iter(dashboard_models)),  # start on the best-scoring model
    description="Select Model:"
)

metrics_checkboxes = widgets.SelectMultiple(
    options=METRIC_OPTIONS,
    value=["Test R2", "Test MSE"],
    description="Select Metrics:"
)

# Tabs
output_tab1 = widgets.Output()
output_tab2 = widgets.Output()
output_tab3 = widgets.Output()


def update_tab1():
    """Show the selected model's tuned hyperparameters and the preprocessed data head."""
    with output_tab1:
        output_tab1.clear_output()
        selected_model = model_select.value
        _, hyperparameters = dashboard_models[selected_model]
        print(f"Selected Model: {selected_model}")
        print(f"Hyperparameters: {hyperparameters}")
        print("\nPreprocessed Data Head:")
        print(data.head())


def update_tab2():
    """Print and plot the selected metrics of the selected model."""
    with output_tab2:
        output_tab2.clear_output()
        selected_model = model_select.value
        selected_metrics = metrics_checkboxes.value
        model_metrics, _ = dashboard_models[selected_model]
        print("\nPerformance Metrics:")
        for metric in selected_metrics:
            print(f"{metric}: {model_metrics[metric]}")

        # Visualization
        fig = go.Figure()
        for metric in selected_metrics:
            fig.add_trace(go.Bar(name=metric, x=[selected_model], y=[model_metrics[metric]]))
        fig.update_layout(title="Performance Metrics", xaxis_title="Model", yaxis_title="Metric Value")
        fig.show()


def update_tab3():
    """Show the selected model's worst test-set predictions."""
    with output_tab3:
        output_tab3.clear_output()
        print("\nForecasting Results:")
        print(build_forecast_results(model_select.value))


def update_dashboard(change=None):
    """Refresh every tab; also used as the widget observer callback."""
    update_tab1()
    update_tab2()
    update_tab3()


# Observe changes
model_select.observe(update_dashboard, names="value")
metrics_checkboxes.observe(update_dashboard, names="value")

# Combine Tabs
tabs = widgets.Tab([output_tab1, output_tab2, output_tab3])
tabs.set_title(0, "Data & Hyperparameters")
tabs.set_title(1, "Metrics & Visualization")
tabs.set_title(2, "Forecasting Results")

# Display Dashboard
display(model_select, metrics_checkboxes, tabs)
update_dashboard()


Select(description='Select Model:', options=('CatBoost', 'LightGBM', 'Random Forest'), value='CatBoost')

SelectMultiple(description='Select Metrics:', index=(1, 3), options=('Train R2', 'Test R2', 'Train MSE', 'Test…

Tab(children=(Output(), Output(), Output()), _titles={'0': 'Data & Hyperparameters', '1': 'Metrics & Visualiza…