In [1]:
!pip install catboost



In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit, cross_validate
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [3]:
# Columns kept from the raw CSV: five predictors followed by the two outcome columns.
selected_columns = [
    "CMRR",
    "PRSUP",
    "depth_of_ cover",
    "intersection_diagonal",
    "mining_hight",
    "roof_fall_rate",
    "fall",
]

# Read the raw file and keep only the columns of interest, in the order listed above.
df = pd.read_csv("original_data.csv")[selected_columns]

# Every column except the two outcome columns is log-transformed.
log_cols = [
    name for name in selected_columns
    if name not in ("roof_fall_rate", "fall")
]

# log1p(x) = log(1 + x); the outcome columns are left on their original scale.
df[log_cols] = df[log_cols].apply(np.log1p)

df.head()

Unnamed: 0,CMRR,PRSUP,depth_of_ cover,intersection_diagonal,mining_hight,roof_fall_rate,fall
0,4.094345,1.843719,5.01728,4.203199,1.94591,0.0,0
1,4.094345,1.843719,5.01728,4.203199,1.94591,0.0,0
2,3.931826,1.595339,5.993961,4.110874,2.079442,0.66,1
3,3.931826,1.931521,5.993961,4.110874,2.079442,1.08,1
4,4.330733,1.595339,5.993961,4.110874,2.079442,0.0,0


In [4]:
# Single 80/20 hold-out split, stratified on the `fall` column so that both partitions
# keep the same proportion of each `fall` value. The seed is fixed for reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields *positional* row indices (0..n-1), so rows must be selected with .iloc.
# Label-based .loc only coincides with this while the index is a default RangeIndex and
# would pick wrong rows (or raise) once rows have been filtered or re-indexed.
# n_splits=1, so the generator produces exactly one (train, test) pair.
train_idx, test_idx = next(split.split(df, df["fall"]))
train = df.iloc[train_idx]
test = df.iloc[test_idx]

# `roof_fall_rate` is the regression target. `fall` is dropped from the features as well;
# NOTE(review): in the displayed rows it is 1 exactly when the rate is > 0, so it looks
# derived from the target -- confirm against the data dictionary.
non_feature_cols = ["roof_fall_rate", "fall"]

X_train = train.drop(columns=non_feature_cols)
y_train = train["roof_fall_rate"]

X_test = test.drop(columns=non_feature_cols)
y_test = test["roof_fall_rate"]

In [5]:
# Numeric preprocessing: a one-step pipeline that standardises each feature.
num_pipe = Pipeline(steps=[("scaler", StandardScaler())])

# The scaling statistics are learned from the training rows only and then reused
# unchanged on the test rows, so no test-set information enters the fit.
num_pipe.fit(X_train)
X_train_prep = num_pipe.transform(X_train)
X_test_prep = num_pipe.transform(X_test)

## CATBOOST MODEL

In [6]:
from catboost import CatBoostRegressor

# Hyperparameters gathered in one mapping so they can be read (and tuned) at a glance.
cat_params = {
    "iterations": 300,
    "learning_rate": 0.05,
    "depth": 4,
    "loss_function": 'RMSE',
    "random_state": 42,
    "verbose": False,
}

# Fit on the scaled training features, then predict the held-out rows.
cat = CatBoostRegressor(**cat_params)
cat.fit(X_train_prep, y_train)

pred = cat.predict(X_test_prep)

# Hold-out metrics. RMSE is taken as the square root of the MSE.
# The metric functions and numpy are already imported in the notebook's import cell.
holdout_scores = [
    ("R2:", r2_score(y_test, pred)),
    ("MAE:", mean_absolute_error(y_test, pred)),
    ("RMSE:", np.sqrt(mean_squared_error(y_test, pred))),
]
for label, score in holdout_scores:
    print(label, round(score, 4))


R2: -0.0476
MAE: 3.267
RMSE: 5.9352


In [7]:
# Metrics collected per fold. scikit-learn's "neg_*" scorers return the *negated* error
# (so that larger is always better), hence the sign flip when reporting RMSE below.
scoring = dict(
    rmse="neg_root_mean_squared_error",
    mae="neg_mean_absolute_error",
    r2="r2",
)

# 5-fold cross-validation on the training partition (`cross_validate` comes from the
# notebook's import cell).
# NOTE(review): an integer `cv` with a regressor means plain KFold without shuffling, and
# the features were scaled before the folds were made -- confirm both are intended.
cv = cross_validate(
    cat,
    X_train_prep,
    y_train,
    cv=5,
    scoring=scoring,
)

fold_rmse = -cv["test_rmse"]  # back to positive RMSE, one value per fold
fold_r2 = cv["test_r2"]

print("\nCV RMSE Mean:", round(fold_rmse.mean(), 4))
print("CV RMSE Std :", round(fold_rmse.std(), 4))
print("CV R2 Mean  :", round(fold_r2.mean(), 4))


CV RMSE Mean: 6.5843
CV RMSE Std : 0.9749
CV R2 Mean  : -4.5927
