In [0]:
# ============================================================
# 1. ЗАВАНТАЖЕННЯ ДАНИХ
# ============================================================

# Path of the Delta table that is loaded below.
delta_input_path = "/Volumes/workspace/default/olist_delta_3"

# Read the table through the notebook-provided `spark` session.
olist_df = spark.read.load(delta_input_path, format="delta")

# Sanity check: total row count followed by a five-row preview.
row_count = olist_df.count()
print("Rows:", row_count)
display(olist_df.limit(5))


In [0]:
# ============================================================
# 2. ВИБІР ФІЧ ТА ЦІЛЬОВОЇ ЗМІННОЇ
# ============================================================

from pyspark.sql.functions import col

# Name of the column the regressors are trained to predict.
target_col = "delivery_time_days_calc"

# Columns used as model inputs; this order defines the column order of X.
feature_cols = [
    "num_1", "num_2", "num_3", "num_4", "num_6",
    "delivery_diff_avg", "fast_delivery", "num_1_sq",
    "num2_num3_interaction", "num_mean_12", "log_num_1",
    "purchase_year", "purchase_month", "purchase_day"
]

# Keep only the rows whose target is known.
olist_df = olist_df.filter(col(target_col).isNotNull())

# Move to pandas, bringing over only the columns the models need.
model_cols = feature_cols + [target_col]
pdf = olist_df.select(model_cols).toPandas()

# The Spark filter above only guards the target. Missing values in a feature
# column arrive in pandas as NaN/None, and scikit-learn's LinearRegression
# rejects NaN input, so drop incomplete rows here and report how many went.
# When the data is already complete this is a no-op.
rows_before = len(pdf)
pdf = pdf.dropna(subset=model_cols)
rows_dropped = rows_before - len(pdf)
if rows_dropped:
    print(f"Dropped {rows_dropped} rows with missing feature/target values")

# Feature matrix and target vector as NumPy arrays for scikit-learn.
X = pdf[feature_cols].values
y = pdf[target_col].values

# ============================================================
# 3. РОЗБИТТЯ TRAIN / VALIDATION / TEST = 70% / 15% / 15%
# ============================================================

from sklearn.model_selection import train_test_split

# Step 1: hold out 30% of the rows as a temporary pool (train keeps 70%).
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

# Step 2: cut the temporary pool in half -> 15% validation, 15% test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    random_state=42,
)

# Report the shape of each resulting feature matrix.
for split_label, split_features in (
    ("Train", X_train),
    ("Validation", X_val),
    ("Test", X_test),
):
    print(f"{split_label}:", split_features.shape)


In [0]:
# ============================================================
# 4. МАСШТАБУВАННЯ (лише train — val/test трансформуються)
# ============================================================

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only; validation and test rows never
# contribute to the fitted mean/variance.
scaler = StandardScaler().fit(X_train)

# Apply the train-fitted transformation to all three splits.
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split_features)
    for split_features in (X_train, X_val, X_test)
)


from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Each entry is a tuple: (model name, RMSE, MAE, R2), all measured on the test split.
results = []

# Estimators under comparison. Every stochastic one is seeded for repeatability.
lr = LinearRegression()
dt = DecisionTreeRegressor(max_depth=8, random_state=42)
rf = RandomForestRegressor(
    n_estimators=150,
    max_depth=10,
    random_state=42
)
gbr = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    random_state=42
)

# (display name, estimator, training features, test features).
# Only the linear model receives the standardized features; the tree-based
# models are trained and scored on the raw feature values.
model_runs = [
    ("Linear Regression", lr, X_train_scaled, X_test_scaled),
    ("Decision Tree", dt, X_train, X_test),
    ("Random Forest", rf, X_train, X_test),
    ("Gradient Boosting", gbr, X_train, X_test),
]

# One shared fit -> predict -> score path instead of a copy per model, so the
# metric definitions cannot drift apart between models.
for model_name, model, X_fit, X_eval in model_runs:
    model.fit(X_fit, y_train)
    y_pred = model.predict(X_eval)

    results.append((
        model_name,
        mean_squared_error(y_test, y_pred) ** 0.5,  # RMSE = sqrt(MSE)
        mean_absolute_error(y_test, y_pred),
        r2_score(y_test, y_pred)
    ))


In [0]:
# ============================================================
# 6. ФІНАЛЬНА ТАБЛИЦЯ РЕЗУЛЬТАТІВ
# ============================================================

import pandas as pd

# Column names follow the layout of the tuples collected in `results`.
result_columns = ["Model", "RMSE", "MAE", "R2"]
df_results = pd.DataFrame(data=results, columns=result_columns)
display(df_results)


In [0]:
%pip install shap

In [0]:

# ============================================================
# 7. SHAP — ІНТЕРПРЕТАЦІЯ МОДЕЛІ (RandomForest)
# ============================================================


import shap
# Initialise SHAP's notebook (JavaScript) support.
shap.initjs()

# Explain only the first 2000 test rows rather than the whole test split.
shap_sample_size = 2000
X_test_sample = X_test[:shap_sample_size]

# Tree explainer built on the fitted Random Forest.
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test_sample)

# Summary plot labelled with the feature column names.
shap.summary_plot(
    shap_values,
    X_test_sample,
    feature_names=feature_cols,
)


In [0]:

# ============================================================
# 5.1 ДОДАТКОВИЙ АНАЛІЗ: Метрики на TRAIN / VALIDATION / TEST
# ============================================================

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd

def eval_regression(model, X_tr, y_tr, X_v, y_v, X_te, y_te, name):
    """Score a fitted regressor on the train, validation and test splits.

    Args:
        model: Object exposing ``predict(X)``.
        X_tr, y_tr: Training features and targets.
        X_v, y_v: Validation features and targets.
        X_te, y_te: Test features and targets.
        name: Label stored under the ``"Model"`` key.

    Returns:
        dict: ``"Model"`` followed by ``<Split>_RMSE``, ``<Split>_MAE`` and
        ``<Split>_R2`` for the ``Train``, ``Val`` and ``Test`` splits, in
        that order.
    """
    splits = (
        ("Train", X_tr, y_tr),
        ("Val", X_v, y_v),
        ("Test", X_te, y_te),
    )

    # Run all predictions first, then compute the metrics.
    scored = [
        (prefix, y_true, model.predict(features))
        for prefix, features, y_true in splits
    ]

    row = {"Model": name}
    for prefix, y_true, y_hat in scored:
        row[f"{prefix}_RMSE"] = (mean_squared_error(y_true, y_hat)) ** 0.5
        row[f"{prefix}_MAE"] = mean_absolute_error(y_true, y_hat)
        row[f"{prefix}_R2"] = r2_score(y_true, y_hat)
    return row

# Linear regression was trained on standardized features, so it is scored on
# the scaled version of every split.
metrics_lr = eval_regression(
    lr,
    X_train_scaled, y_train,
    X_val_scaled, y_val,
    X_test_scaled, y_test,
    "Linear Regression",
)

# The tree-based models were trained on raw features and share the unscaled splits.
raw_splits = (X_train, y_train, X_val, y_val, X_test, y_test)

# Decision Tree
metrics_dt = eval_regression(dt, *raw_splits, "Decision Tree")

# Random Forest
metrics_rf = eval_regression(rf, *raw_splits, "Random Forest")

# Gradient Boosting
metrics_gbr = eval_regression(gbr, *raw_splits, "Gradient Boosting")

# One row per model, columns in the key order returned by eval_regression.
split_metric_rows = [metrics_lr, metrics_dt, metrics_rf, metrics_gbr]
df_split_metrics = pd.DataFrame(split_metric_rows)
display(df_split_metrics)



In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import pandas as pd
import shap

# ------------------------------------------------------------
# 1. ACTUAL vs PREDICTED (Random Forest)
# ------------------------------------------------------------

# Random Forest predictions on the test split; the residual cells below reuse them.
y_pred_rf = rf.predict(X_test)

fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(y_test, y_pred_rf, alpha=0.4)

# Dashed red y = x reference line spanning the observed target range.
target_range = [y_test.min(), y_test.max()]
ax.plot(target_range, target_range, 'r--')

ax.set_xlabel("Real Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Actual vs Predicted — Random Forest")
ax.grid(True)
plt.show()


In [0]:

# ------------------------------------------------------------
# 2. РОЗПОДІЛ ПОМИЛОК (RESIDUALS)
# ------------------------------------------------------------

# Residual = actual minus predicted, per test row.
errors = y_test - y_pred_rf

fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(errors, bins=50, kde=True, ax=ax)

# Set the labels after plotting so they replace seaborn's defaults.
ax.set_title("Distribution of Prediction Errors (Residuals)")
ax.set_xlabel("Error")
ax.set_ylabel("Frequency")
plt.show()



In [0]:

# ------------------------------------------------------------
# 3. RESIDUALS vs PREDICTED
# ------------------------------------------------------------

fig, ax = plt.subplots(figsize=(7, 6))
ax.scatter(y_pred_rf, errors, alpha=0.4)

# Zero-residual reference line.
ax.axhline(0, color='red', linestyle='--')

ax.set_xlabel("Predicted")
ax.set_ylabel("Residual")
ax.set_title("Residuals vs Predicted")
ax.grid(True)
plt.show()



In [0]:
# ------------------------------------------------------------
# 4. FEATURE IMPORTANCE (Random Forest)
# ------------------------------------------------------------

importances = rf.feature_importances_

# Feature positions sorted from most to least important.
indices = np.argsort(importances)[::-1]

bar_positions = range(len(feature_cols))
ranked_feature_names = [feature_cols[i] for i in indices]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(bar_positions, importances[indices])

# Label each bar with its feature name, slanted so the names stay readable.
ax.set_xticks(bar_positions)
ax.set_xticklabels(ranked_feature_names, rotation=45, ha='right')
ax.set_title("Feature Importance — Random Forest")
plt.show()



In [0]:
# ------------------------------------------------------------
# 5. LEARNING CURVE (Random Forest)
# ------------------------------------------------------------

from sklearn.model_selection import learning_curve

# 3-fold cross-validated learning curve on the training split, evaluated at
# seven training-set fractions from 10% to 100%.
train_sizes, train_scores, val_scores = learning_curve(
    estimator=rf,
    X=X_train,
    y=y_train,
    cv=3,
    scoring='neg_mean_squared_error',
    train_sizes=np.linspace(0.1, 1.0, 7),
    n_jobs=-1,
)

# Scores are negated MSE: average over folds, flip the sign, take the square root.
mean_train_mse = -train_scores.mean(axis=1)
mean_val_mse = -val_scores.mean(axis=1)
train_rmse = mean_train_mse ** 0.5
val_rmse = mean_val_mse ** 0.5

fig, ax = plt.subplots(figsize=(10, 6))
for curve_values, curve_label in (
    (train_rmse, "Train RMSE"),
    (val_rmse, "Validation RMSE"),
):
    ax.plot(train_sizes, curve_values, 'o-', label=curve_label)

ax.set_xlabel("Training Size")
ax.set_ylabel("RMSE")
ax.set_title("Learning Curve — Random Forest")
ax.legend()
ax.grid(True)
plt.show()



In [0]:
# ------------------------------------------------------------
# 6. SHAP INTERPRETATION (Random Forest)
# ------------------------------------------------------------

# Initialise SHAP's notebook (JavaScript) support.
shap.initjs()

# Explain the first 1500 test rows with the fitted Random Forest.
# NOTE: this rebinds `explainer` and `shap_values`, which the earlier SHAP
# cell also assigns (there for a 2000-row sample).
shap_row_limit = 1500
X_sample = X_test[:shap_row_limit]

explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_sample)

shap.summary_plot(shap_values, X_sample, feature_names=feature_cols)