In [None]:
import tqdm

import numpy as np
import pandas as pd
import seaborn as sns  
import matplotlib.pyplot as plt

from scipy.stats import pearsonr, probplot

In [None]:
# Key read by create_model() to decide which regressor to build.
MODE_MODEL = "decision-tree"

## Data Loading

In [None]:
# Load the prepared measurement table: semicolon-separated, comma as the decimal
# mark, and day-first dates in the "Tanggal Mikrotest" column.
df = pd.read_csv(
    "../data/ready-20241123.csv",
    sep=";",
    decimal=",",
    parse_dates=["Tanggal Mikrotest"],
    dayfirst=True,
)

# Keep only the rows whose "Hb Gold" value is strictly positive.
has_positive_gold = df["Hb Gold"].gt(0)
df = df.loc[has_positive_gold]
df.head()

In [None]:
# List the distinct values of the "ID Alat" column (Indonesian for "device ID").
df["ID Alat"].unique()

## EDA

In [None]:
# Plot every value from column position 9 onward of the first row as one trace.
first_row_signal = df.iloc[0, 9:].values
plt.figure(figsize=(20, 4))
plt.plot(first_row_signal)

In [None]:
# Work on the first row only, using the values from column position 9 onward.
signal = df.iloc[0, 9:].values
head, tail = signal[:2000], signal[2000:]

# Baseline = mean of the first 2000 samples; the remainder is cut into 30 equal
# parts (np.split raises if it cannot be divided evenly).
baseline = np.mean(head)
segments = np.split(tail, 30)
baseline, len(segments), segments[0].shape

In [None]:
# Subtract the baseline from every segment, then flatten back into one 1-D trace.
corrected_segments = (np.stack(segments) - baseline).ravel()
corrected_segments

In [None]:
# Plot the baseline-corrected trace against its sample index.
sample_index = range(corrected_segments.shape[0])
plt.figure(figsize=(20, 4))
plt.plot(sample_index, corrected_segments)

## Preprocessing

In [None]:
# Features: every column from position 9 onward, with missing values set to 0.
feature_frame = df.iloc[:, 9:].fillna(0)
X = feature_frame.values

# Target: the column at position 6 (presumably "Hb Gold" — TODO confirm).
target_column = df.iloc[:, 6]
y = target_column.values

X.shape, y.shape

In [None]:
# Baseline correction for all rows: subtract each row's mean over its first
# 2000 samples from the remaining samples of that row.
# NOTE(review): the correlation and feature-selection cells index X, not
# X_corrected — confirm that leaving the corrected matrix unused is intended.
row_baseline = X[:, :2000].mean(axis=1, keepdims=True)
X_corrected = X[:, 2000:] - row_baseline
X_corrected.shape

In [None]:
# Pearson correlation between every feature column of X and the target y.
# X_corr collects (column position, r) pairs; the p-value is not used.
X_corr = []
for col_idx in tqdm.trange(X.shape[1]):
    coef, _pvalue = pearsonr(X[:, col_idx], y)
    X_corr.append((col_idx, coef))

# Series of r values indexed by column position (same order as the columns of X).
sr_corr = pd.Series(
    data=[coef for _, coef in X_corr],
    index=[col_idx for col_idx, _ in X_corr],
)
sr_corr.head()

In [None]:
# Display the coefficients from highest to lowest. The sorted copy is only
# shown, not assigned, so sr_corr itself keeps its original column order.
sr_corr.sort_values(ascending=False)

In [None]:
# Rank the feature columns by their Pearson r with the target, strongest
# positive first. The ranking must be applied here: sort_values() returns a new
# Series, so reading sr_corr.index directly yields the columns in their original
# order and the selection would ignore the correlations altogether.
# Columns whose correlation is undefined (NaN, e.g. constant columns) are dropped
# so they cannot end up in the selection.
cols_corr = sr_corr.dropna().sort_values(ascending=False).index.tolist()

# Take three columns from the top of the ranking and three from the bottom
# (the most negative correlations).
# NOTE(review): the [1:4] slice skips the top-ranked column, as the original
# slice did — confirm this is intended rather than [0:3].
cols = [*cols_corr[1:4], *cols_corr[-3:]]
cols

In [None]:
# Keep only the selected feature columns of X (y is left unchanged).
X_sel = np.take(X, cols, axis=1)
X_sel.shape

## Regressor

### Cross-Validation

In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error

from xgboost import XGBRegressor
from lightgbm.sklearn import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [None]:
def create_model():
    """Build a new, unfitted regressor chosen by the module-level MODE_MODEL.

    Supported values of MODE_MODEL:
        "decision-tree"  -> DecisionTreeRegressor()
        "random-forest"  -> RandomForestRegressor(random_state=24)
        "xgboost"        -> XGBRegressor()
        "lightgbm"       -> LGBMRegressor()
        "voting"         -> VotingRegressor of XGBoost and LightGBM, weights 0.7 / 0.3

    Returns:
        A fresh estimator instance on every call.

    Raises:
        ValueError: if MODE_MODEL is not one of the supported keys. Without this
            check the function would fall through and return None, which only
            fails later inside cross_validate() or fit() with an unrelated error.
    """
    # Factories rather than instances, so only the selected model is constructed.
    builders = {
        "decision-tree": lambda: DecisionTreeRegressor(),
        "random-forest": lambda: RandomForestRegressor(random_state=24),
        "xgboost": lambda: XGBRegressor(),
        "lightgbm": lambda: LGBMRegressor(),
        "voting": lambda: VotingRegressor(
            [("xgb", XGBRegressor()), ("lgbm", LGBMRegressor())],
            weights=[0.7, 0.3],
        ),
    }

    if MODE_MODEL not in builders:
        raise ValueError(
            f"Unknown MODE_MODEL {MODE_MODEL!r}; expected one of {sorted(builders)}"
        )
    return builders[MODE_MODEL]()

In [None]:
# Cross-validate a fresh model on the selected features, using cross_validate's
# default splitting. The error metrics use the "neg_" scorers, so larger (closer
# to zero) is better.
cv_metrics = [
    "r2",
    "neg_mean_absolute_error",
    "neg_mean_squared_error",
    "neg_root_mean_squared_error",
]
scores = cross_validate(create_model(), X_sel, y, scoring=cv_metrics)

# One row per fold.
scores_df = pd.DataFrame(scores)
scores_df

In [None]:
# Average every column of the cross-validation table across folds.
# The error metrics come from "neg_" scorers, so they appear as negative numbers.
scores_df.mean()

### Hold-out

In [None]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=0.3, random_state=24)

# Build a fresh estimator here: no earlier cell assigns `reg`, so fitting it
# directly fails with NameError on a clean kernel (Restart & Run All).
reg = create_model()
reg.fit(X_train, y_train)

In [None]:
# Predict on the held-out set; residual = actual minus predicted.
y_pred = reg.predict(X_test)
resid = np.subtract(y_test, y_pred)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(8, 3))

# Left panel: x carries the reference values (y_test) and y the predictions,
# so the axis labels must follow that order.
ax[0].scatter(y_test, y_pred, alpha=0.5)
ax[0].set_title("Actual vs Predicted")
ax[0].set_xlabel("Hb gold")
ax[0].set_ylabel("Hb predicted")

# Right panel: residuals-vs-fitted plot. The residual (gold - predicted) goes on
# the y-axis against the predicted value on the x-axis, and the labels name
# exactly what is plotted.
ax[1].scatter(y_pred, resid, alpha=0.5)
ax[1].set_title("Residuals")
ax[1].set_xlabel("Hb predicted")
ax[1].set_ylabel("Residual (gold - predicted)")

fig.tight_layout()
plt.show()

In [None]:
# Normal probability (Q-Q) plots: predictions on the left, residuals on the right.
fig, ax = plt.subplots(1, 2, figsize=(8, 3))

probplot(y_pred, dist="norm", plot=ax[0])
# Title spelling corrected ("predcited" -> "predicted").
ax[0].set_title("Probability plot of the predicted Hb")

probplot(resid, dist="norm", plot=ax[1])
ax[1].set_title("Probability plot of the residuals of predicted Hb")

fig.tight_layout()
plt.show()

### Evaluate Performance

In [None]:
# Pair each held-out reference value with the model's prediction, ordered by
# increasing reference value.
df_pred = (
    pd.DataFrame({"hb_gold": y_test})
    .assign(hb_pred=reg.predict(X_test))
    .sort_values("hb_gold")
)

# Alternative kept from the original notebook (disabled): evaluate on the full
# data set instead, i.e. hb_gold = y and hb_pred = reg.predict(X[:, cols]).

In [None]:
# Report each regression metric on the hold-out predictions, four decimals each.
gold, pred = df_pred["hb_gold"], df_pred["hb_pred"]
for label, metric in (
    ("R2", r2_score),
    ("MAE", mean_absolute_error),
    ("MSE", mean_squared_error),
    ("RMSE", root_mean_squared_error),
):
    print(f"{label} = {metric(gold, pred):.4f}")

In [None]:
# Pearson r and R2 between the reference and predicted values.
gold_values, pred_values = df_pred["hb_gold"], df_pred["hb_pred"]
r, _pvalue = pearsonr(gold_values, pred_values)
r2 = r2_score(gold_values, pred_values)

print("r", r)
print("R2", r2)

# Both series are drawn against the row position within df_pred.
positions = range(df_pred.shape[0])
fig, ax = plt.subplots()
for column, colour, opacity, legend_label in (
    ("hb_gold", "b", 0.7, "Actual"),
    ("hb_pred", "r", 0.2, "Predicted"),
):
    ax.scatter(positions, df_pred[column], c=colour, alpha=opacity, label=legend_label)
ax.legend()

ax.set_title(f"Actual and predicted Hb values ($r={r:.2f}$)")
ax.set_xlabel("Sample number")
ax.set_ylabel("Hb value")

plt.show()