In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit, cross_validate
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [2]:
# Predictor columns; these are also the columns that get log-transformed below.
# The spellings ("depth_of_ cover" with a space, "mining_hight") are used
# verbatim as column labels -- presumably they mirror the CSV header, so do
# not "correct" them here without renaming the columns in the data as well.
log_cols = ["CMRR", "PRSUP", "depth_of_ cover", "intersection_diagonal", "mining_hight"]

# Predictors followed by the two outcome columns.
selected_columns = log_cols + ["roof_fall_rate", "fall"]

# Take an explicit copy of the column subset: assigning into the result of
# df[selected_columns] directly can raise pandas' SettingWithCopyWarning and
# leaves it ambiguous whether a view or a copy is being modified.
df = pd.read_csv("original_data.csv")[selected_columns].copy()

# log1p = log(1 + x), applied to the predictor columns only; the outcome
# columns are left on their original scale.
df[log_cols] = np.log1p(df[log_cols])

df.head()


Unnamed: 0,CMRR,PRSUP,depth_of_ cover,intersection_diagonal,mining_hight,roof_fall_rate,fall
0,4.094345,1.843719,5.01728,4.203199,1.94591,0.0,0
1,4.094345,1.843719,5.01728,4.203199,1.94591,0.0,0
2,3.931826,1.595339,5.993961,4.110874,2.079442,0.66,1
3,3.931826,1.931521,5.993961,4.110874,2.079442,1.08,1
4,4.330733,1.595339,5.993961,4.110874,2.079442,0.0,0


In [3]:
# One stratified 80/20 shuffle split; stratifying on "fall" keeps the
# proportion of each "fall" class the same in the train and test parts.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields *positional* row indices, so rows must be selected with
# .iloc. Label-based .loc only gives the same rows while df has a default
# 0..n-1 index and silently picks wrong rows (or raises) otherwise.
train_idx, test_idx = next(split.split(df, df["fall"]))
train = df.iloc[train_idx]
test = df.iloc[test_idx]

# "roof_fall_rate" is the regression target. "fall" is dropped from the
# predictors as well -- NOTE(review): it looks derived from the target
# (non-zero rate <=> fall == 1 in the displayed rows), so keeping it would
# leak the answer; confirm against the data description.
outcome_cols = ["roof_fall_rate", "fall"]

X_train = train.drop(columns=outcome_cols)
y_train = train["roof_fall_rate"]

X_test = test.drop(columns=outcome_cols)
y_test = test["roof_fall_rate"]


In [4]:
# Standardise every predictor. The scaling statistics are learned from the
# training split only and then re-used, unchanged, on the test split.
num_pipe = Pipeline(steps=[("scaler", StandardScaler())])
num_pipe.fit(X_train)

X_train_prep, X_test_prep = (
    num_pipe.transform(features) for features in (X_train, X_test)
)


## LightGBM model

In [5]:
# Dependency: lightgbm. If it is not installed, run `%pip install lightgbm` in its own cell (use %pip, not !pip, so the install targets this kernel's environment).

In [6]:
from lightgbm import LGBMRegressor

# Gradient-boosted regression trees on the standardised predictors.
lgbm = LGBMRegressor(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=3,
    # A tree of depth 3 can have at most 2**3 = 8 leaves, so the previous
    # value of 15 could never be reached; 8 states the effective limit.
    num_leaves=8,
    # NOTE(review): allowing a leaf to hold a single sample on a training set
    # this small makes over-fitting likely -- consider a larger value and
    # choose it by cross-validation.
    min_child_samples=1,
    reg_lambda=1.0,  # L2 regularisation
    reg_alpha=0.5,   # L1 regularisation
    random_state=42
)

lgbm.fit(X_train_prep, y_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83
[LightGBM] [Info] Number of data points in the train set: 87, number of used features: 5
[LightGBM] [Info] Start training from score 2.564598


In [7]:
# Evaluate on the held-out test split. r2_score, mean_absolute_error and
# mean_squared_error are already imported in the notebook's import cell, so
# they are not re-imported here.
pred = lgbm.predict(X_test_prep)

# RMSE is the square root of the mean squared error, i.e. it is expressed in
# the same units as the target.
mse = mean_squared_error(y_test, pred)

# An R2 below zero means the model predicts worse than a constant equal to
# the mean of y_test.
print("R2:", r2_score(y_test, pred))
print("MAE:", mean_absolute_error(y_test, pred))
print("RMSE:", np.sqrt(mse))


R2: -0.21988895670533193
MAE: 3.5339765874553635
RMSE: 6.404829434711992


