In [None]:
import pandas as pd
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler
import xgboost

import midihum_model

In [None]:
# Master switch for persistence: when True, later cells pickle the fitted
# scaler to MidihumModel.scaler_path and save the trained model to
# MidihumModel.model_path. Left False so a plain run does not write those files.
save_model = False

In [None]:
# Read the training and validation tables from parquet.
# Note: the validation file is what this notebook calls "test" from here on.
train_df, test_df = map(
    pd.read_parquet,
    ("dfs/train_data.parquet.gzip", "dfs/validate_data.parquet.gzip"),
)
train_df

In [None]:
# Remove the three columns that are not passed on to the model
# (the two MIDI index columns and the name column) from both splits.
non_feature_columns = ["midi_track_index", "midi_event_index", "name"]
train_df = train_df.drop(columns=non_feature_columns)
test_df = test_df.drop(columns=non_feature_columns)

In [None]:
# Ask the project model class which columns are categorical, continuous and
# outputs, then standardise the continuous + output columns. The scaler is
# fitted on the training split only and the same statistics are applied to
# both splits.
cat_names, cont_names, out_names = midihum_model.MidihumModel._get_column_names_from_df(train_df)
scaled_columns = cont_names + out_names
std_scaler = StandardScaler().fit(train_df[scaled_columns])
# Quick sanity peek at the first few fitted means and variances.
print("std_scaler", std_scaler.mean_[:10], std_scaler.var_[:10])
for frame in (train_df, test_df):
    frame[scaled_columns] = std_scaler.transform(frame[scaled_columns])

In [None]:
# Pickle the fitted scaler to the project's scaler path, but only when
# saving has been switched on via save_model.
if save_model:
    with open(midihum_model.MidihumModel.scaler_path, "wb") as scaler_file:
        pickle.dump(std_scaler, scaler_file)

In [None]:
# Cast every categorical feature to the pandas "category" dtype so XGBoost
# (enable_categorical=True) can consume it.
#
# The test split deliberately reuses the dtype inferred from the training
# split instead of inferring its own: XGBoost works on the integer category
# codes, so if each frame built its categories independently the same value
# could receive different codes in train and test whenever their value sets
# differ. Values that occur only in the test split become NaN (missing).
# NOTE(review): inference code elsewhere in the project should apply the same
# training categories — confirm against midihum_model.
for col in cat_names:
    train_df[col] = train_df[col].astype("category")
    test_df[col] = test_df[col].astype(train_df[col].dtype)

In [None]:
# Separate the regression target ("velocity") from the input features for
# both splits, then display the training feature table.
y_train = train_df["velocity"]
X_train = train_df.drop(columns=["velocity"])
y_test = test_df["velocity"]
X_test = test_df.drop(columns=["velocity"])
X_train

In [None]:
# Gradient-boosted tree regressor for the (standardised) velocity target.
# early_stopping_rounds is configured on the estimator rather than passed to
# fit(): the fit() keyword has been deprecated since XGBoost 1.6 and is
# rejected by newer releases, while the constructor form works on 1.6+.
model = xgboost.XGBRegressor(
    booster="gbtree",
    max_depth=8,
    learning_rate=0.05,
    n_estimators=1400,  # upper bound; early stopping usually ends training sooner
    gamma=0.9,
    min_child_weight=12,
    subsample=0.8,
    colsample_bytree=0.9,
    reg_alpha=0.8,
    reg_lambda=0.5,
    n_jobs=16,
    enable_categorical=True,
    early_stopping_rounds=5,
)
# Both splits are tracked so their learning curves can be plotted later. Per
# the XGBoost docs the LAST eval_set entry (the held-out split here) is the
# one monitored for early stopping.
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=True)

In [None]:
from sklearn.metrics import mean_squared_error

# Report RMSE on both splits. The root is taken explicitly with np.sqrt
# because the `squared=False` shortcut of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on every version.
# NOTE(review): the target appears to be standardised by std_scaler above, so
# these values would be in scaled units rather than raw MIDI velocity — confirm.
train_predictions = model.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
print("Train RMSE: %.3f" % train_rmse)

test_predictions = model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
print("Test RMSE: %.3f" % test_rmse)

In [None]:
# Write the trained model to the project's model path, but only when saving
# has been switched on via save_model.
if save_model:
    model_path = midihum_model.MidihumModel.model_path
    model.save_model(model_path)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot the per-round RMSE recorded during fitting. The history is keyed by
# eval_set position: "validation_0" is the first entry passed to fit() (the
# training split) and "validation_1" the second (the held-out split).
results = model.evals_result()
for eval_key, curve_label in (("validation_0", "train"), ("validation_1", "test")):
    plt.plot(results[eval_key]["rmse"], label=curve_label)
plt.legend()
plt.show()

In [None]:
# Scatter (with a fitted regression line) of predicted vs. true velocity on
# the held-out split, using a random subset to keep the figure readable.
plot_df = X_test.copy()
plot_df["velocity"] = y_test
plot_df["prediction"] = test_predictions
# Cap the sample at the number of available rows (DataFrame.sample raises
# ValueError when asked for more rows than exist without replacement) and fix
# the seed so the figure is identical on every run.
sample_size = min(500, len(plot_df))
g = sns.lmplot(
    data=plot_df.sample(n=sample_size, random_state=0),
    x="velocity",
    y="prediction",
    palette="bright",
)
plt.show()
print(plot_df["prediction"])

In [None]:
# Pass the test predictions through the project's rescaling helper together
# with the fitted scaler (presumably undoing the standardisation — confirm in
# midihum_model), then plot the distribution of the resulting values.
velocities = midihum_model.MidihumModel._rescale_predictions(std_scaler, test_predictions)
# NOTE(review): col_idx is not read by any cell visible here; it looks like a
# leftover — confirm nothing else uses it before removing.
col_idx = train_df.columns.get_loc("velocity")
g = sns.displot(velocities)
plt.show()

In [None]:
# Pair every model input column with its importance score and list them from
# most to least important.
# The names must come from X_train, the frame the model was fitted on.
# train_df still contains the "velocity" target, so zipping against its
# columns would attach a score to "velocity" and shift the names of every
# feature that follows it by one position.
importance_map = dict(zip(X_train.columns, model.feature_importances_))
sorted(importance_map.items(), key=lambda item: item[1], reverse=True)