In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle

In [None]:
# Target column(s) of the model; every other retained column is a predictor.
targets = [
    "DaysOpen"
]

# Predictor columns requested from the CSV, in the order they are fed to the model.
requested_features = [
    "AgeMonthsInsemination",
    "Milk",
    "Fat",
    "Proteins",
    "ThiMax_dayless_5",
    "ThiMax_dayless_30",
    "ThiMax_dayless_60",
    "ThiMax_dayless_90",
    "ThiMax_dayless_120"
]

df = pd.read_csv("df_ready.csv")

# DataFrame.filter(items = ...) keeps only the listed columns that exist in df;
# a listed column that is absent from the file is skipped without an error.
df_filter = df.filter(items = requested_features + targets)

# Derive the predictor names from what was actually retained.
feature_columns = [column for column in df_filter.columns if column not in targets]

feature_matrix = df_filter[feature_columns].values
target_vector = df_filter["DaysOpen"].values

# Hold out 20% of the rows; the fixed seed keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix,
    target_vector,
    test_size = 0.2,
    random_state = 42
)

# Cross-validation splitter: 5 shuffled folds, seeded so that fold membership
# is the same on every run.
kfold = KFold(
    n_splits = 5,
    shuffle = True,
    random_state = 42
)

# Gradient boosting regressor trained with squared-error loss.
# The hyperparameter values are kept exactly as found (presumably the result of
# an earlier search - TODO confirm where they come from).
gbm = GradientBoostingRegressor(
    loss = "squared_error",
    learning_rate = 0.1,
    n_estimators = 65,
    min_samples_leaf = 1,
    max_depth = 11,
    max_leaf_nodes = 1673,
    # Seed the estimator with the same value used for the hold-out split and
    # the folds, so that repeated runs build the same trees and report the
    # same metrics and feature importances.
    random_state = 42
)

# Per-fold metric histories. In this section "test" denotes the validation fold
# that kfold carves out of X_train, not the X_test / y_test hold-out.
train_mae_list = []
test_mae_list = []

train_mse_list = []
test_mse_list = []

train_r2_list = []
test_r2_list = []

# enumerate(..., start = 1) yields the 1-based fold number directly, replacing
# the hand-maintained counter that ran next to an unused enumerate index.
for iteration, (train_index, test_index) in enumerate(kfold.split(X_train), start = 1):

    # Slice the fold once and reuse the slices for fitting and scoring.
    X_fold_train, y_fold_train = X_train[train_index], y_train[train_index]
    X_fold_test, y_fold_test = X_train[test_index], y_train[test_index]

    # The same gbm instance is fitted anew on each fold's training part.
    gbm.fit(
        X_fold_train,
        y_fold_train
    )

    y_train_pred = gbm.predict(X_fold_train)
    y_test_pred = gbm.predict(X_fold_test)

    train_mae = mean_absolute_error(y_fold_train, y_train_pred)
    train_mse = mean_squared_error(y_fold_train, y_train_pred)
    train_r2 = r2_score(y_fold_train, y_train_pred)

    test_mae = mean_absolute_error(y_fold_test, y_test_pred)
    test_mse = mean_squared_error(y_fold_test, y_test_pred)
    test_r2 = r2_score(y_fold_test, y_test_pred)

    train_mae_list.append(train_mae)
    train_mse_list.append(train_mse)
    train_r2_list.append(train_r2)

    test_mae_list.append(test_mae)
    test_mse_list.append(test_mse)
    test_r2_list.append(test_r2)

    print(f"Iteration number: {iteration}\n")

    print(f"MAE on train set: {train_mae}")
    print(f"MSE on train set: {train_mse}")
    print(f"R2 on train set: {train_r2}\n")

    print(f"MAE on test set: {test_mae}")
    print(f"MSE on test set: {test_mse}")
    print(f"R2 on test set: {test_r2}")
    print("----------------------------\n")

# Mean and spread of each metric across the folds, rounded to 3 decimals.
# np.std is called with its defaults here.
print(f"Average train MAE: {round(np.mean(train_mae_list), 3)} +- {round(np.std(train_mae_list), 3)}")
print(f"Average train MSE: {round(np.mean(train_mse_list), 3)} +- {round(np.std(train_mse_list), 3)}")
print(f"Average train R2: {round(np.mean(train_r2_list), 3)} +- {round(np.std(train_r2_list), 3)}\n")

print(f"Average test MAE: {round(np.mean(test_mae_list), 3)} +- {round(np.std(test_mae_list), 3)}")
print(f"Average test MSE: {round(np.mean(test_mse_list), 3)} +- {round(np.std(test_mse_list), 3)}")
print(f"Average test R2: {round(np.mean(test_r2_list), 3)} +- {round(np.std(test_r2_list), 3)}\n")

# Refit on the complete training split before the final evaluation. Without
# this, gbm would still be the model left over from the last cross-validation
# fold, i.e. one trained on only part of X_train, and that partial model is
# what would be scored here and used by any later code that reads gbm.
gbm.fit(X_train, y_train)

y_pred_test_unseen = gbm.predict(X_test)

# Naive benchmark: a constant prediction equal to the mean training target.
# The constant comes from y_train so that no hold-out labels leak into the
# baseline (a constant equal to the mean of y_test always scores R2 = 0 on
# y_test, which says nothing). It is built once and reused for all metrics.
y_pred_naive = np.full(len(y_test), np.mean(y_train))

print("Performance on unseen data:")
print(f"MAE on unseen test set: {mean_absolute_error(y_test, y_pred_test_unseen)}")
print(f"Naive benchmark model MAE: {mean_absolute_error(y_test, y_pred_naive)}\n")

print(f"MSE on unseen test set: {mean_squared_error(y_test, y_pred_test_unseen)}")
print(f"Naive benchmark model MSE: {mean_squared_error(y_test, y_pred_naive)}\n")

print(f"R2 on unseen test set: {r2_score(y_test, y_pred_test_unseen)}")
print(f"Naive benchmark model R2: {r2_score(y_test, y_pred_naive)}\n")

# Predictor column names: every column of df_filter except the targets.
features = df_filter.drop(columns = targets).columns

# Importances as exposed by the fitted model, and the same values scaled by 100.
features_importance = gbm.feature_importances_
features_importance_perc = 100 * features_importance

importance_columns = {
    "Features" : features,
    "Importance" : features_importance,
    "Importance(%)" : features_importance_perc
}

# One row per predictor, highest importance first.
feature_importance_df = pd.DataFrame(importance_columns)
feature_importance_df = feature_importance_df.sort_values(by = "Importance", ascending = False)

feature_importance_df.to_csv("feature_importance.csv", index = False)

# Serialize the fitted estimator to disk with pickle.
with open("DO_predictor.pkl", "wb") as model_file:
    pickle.dump(gbm, model_file)