In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost
from scipy.stats import uniform, loguniform
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

import midihum_model

In [None]:
# Load the train/validation parquet files and subsample them for the search.
# A fixed random_state makes the subsample identical on every Restart & Run All;
# without it each run tunes on different rows, so scores are not comparable across
# runs even though the search below is itself seeded with random_state=0.
train_df = pd.read_parquet("dfs/train_data.parquet.gzip").sample(n=500_000, random_state=0)
test_df = pd.read_parquet("dfs/validate_data.parquet.gzip").sample(n=50_000, random_state=0)
train_df

In [None]:
# Remove the columns that must not reach the model; the same list is applied to
# both frames so their schemas stay aligned.
_dropped_columns = ["midi_track_index", "midi_event_index", "name"]
train_df = train_df.drop(columns=_dropped_columns)
test_df = test_df.drop(columns=_dropped_columns)

In [None]:
# Ask the project model class which columns are categorical, continuous and outputs.
cat_names, cont_names, out_names = midihum_model.MidihumModel._get_column_names_from_df(train_df)

# Continuous inputs and the outputs are standardised together.
_scaled_columns = cont_names + out_names

# Fit on the training frame only; the test frame is transformed with the same
# training statistics.
std_scaler = StandardScaler().fit(train_df[_scaled_columns])
print("std_scaler", std_scaler.mean_[:10], std_scaler.var_[:10])

train_df[_scaled_columns] = std_scaler.transform(train_df[_scaled_columns])
test_df[_scaled_columns] = std_scaler.transform(test_df[_scaled_columns])

In [None]:
# Encode categorical features with pandas' category dtype (the XGBRegressor further
# down is created with enable_categorical=True).
for col in cat_names:
    train_df[col] = train_df[col].astype("category")
    # Reuse the training dtype so both frames share one category -> code mapping.
    # Casting each frame independently derives the categories from whichever values
    # happen to occur in that sample, so the same value could receive different
    # integer codes in train and test.
    # Values that never occur in the training sample become NaN in test_df.
    test_df[col] = test_df[col].astype(train_df[col].dtype)

In [None]:
def _split_features_target(frame, target="velocity"):
    """Return ``(features, target_series)`` for ``frame``, leaving ``frame`` untouched."""
    return frame.drop(columns=[target]), frame[target]


# "velocity" is the regression target; every remaining column is a feature.
X_train, y_train = _split_features_target(train_df)
X_test, y_test = _split_features_target(test_df)
X_train

In [None]:
# Base regressor: hyperparameters that are not part of the search are pinned here.
model = xgboost.XGBRegressor(
    booster="gbtree",
    learning_rate=0.05,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.9,
    reg_alpha=0.8,
    reg_lambda=0.5,
    n_jobs=16,
    enable_categorical=True,
)

# Search space. scipy's randint(low, high) excludes `high`, and uniform(loc, scale)
# covers [loc, loc + scale].
distributions = {
    "n_estimators": randint(100, 1500),
    "max_depth": randint(2, 9),
    "min_child_weight": uniform(1, 15),
    # Currently fixed on the base model above; uncomment an entry to search over it.
    # "learning_rate": loguniform(0.01, 1),
    # "gamma": uniform(0, 1),
    # "subsample": uniform(loc=0.5, scale=0.5),
    # "colsample_bytree": uniform(loc=0.5, scale=0.5),
    # "reg_alpha": uniform(0, 1),
    # "reg_lambda": uniform(0, 1),
}

# 25 sampled configurations, seeded so the sampled candidates are repeatable.
clf = RandomizedSearchCV(
    model,
    distributions,
    n_iter=25,
    random_state=0,
    verbose=2,
)
search = clf.fit(X_train, y_train)
search.best_params_

In [None]:
# Lift the column-width limit so the full `params` dict of each candidate is readable.
pd.set_option("display.max_colwidth", None)

# One row per sampled candidate, restricted to the columns worth comparing.
_summary_columns = ["mean_fit_time", "params", "mean_test_score", "rank_test_score"]
pd.DataFrame(search.cv_results_)[_summary_columns]

In [None]:
# Plot the mean CV score against each searched hyperparameter (one regression plot
# per parameter).
# matplotlib/seaborn are imported in the notebook's first (imports) cell.
# The list was previously called `vars`, which shadowed the builtin `vars()` for the
# rest of the kernel session.
searched_params = [
    "n_estimators",
    "max_depth",
    "min_child_weight",
    # Add these back when the matching entry in `distributions` is enabled:
    # "learning_rate",
    # "gamma",
    # "subsample",
    # "colsample_bytree",
    # "reg_alpha",
    # "reg_lambda",
]
# cv_results_ stores each searched parameter under a "param_<name>" key.
param_cols = [f"param_{name}" for name in searched_params]
numeric_cols = param_cols + ["mean_test_score"]

search_df = pd.DataFrame(search.cv_results_)
# Cast to float so seaborn can fit a regression line through the values.
search_df[numeric_cols] = search_df[numeric_cols].astype(float)
for param_col in param_cols:
    sns.lmplot(data=search_df, x=param_col, y="mean_test_score")
plt.show()