In [32]:
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

# Single random seed reused by the DataFrame shuffles, the KFold splitter and
# the ExtraTreesRegressor in the cells below.
seed = 42

## Dataset loading (read generated pkl files)

In [33]:
# Read each generated pickle and shuffle its rows straight away; every shuffle
# uses the same seed, so the resulting row order is reproducible.
BP_train = pd.read_pickle("dataset/BP_train_solution.pkl").sample(frac=1, random_state=seed)
BP_test = pd.read_pickle("dataset/BP_test_solution.pkl").sample(frac=1, random_state=seed)
SC_train = pd.read_pickle("dataset/SC_train_solution.pkl").sample(frac=1, random_state=seed)
SC_test = pd.read_pickle("dataset/SC_test_solution.pkl").sample(frac=1, random_state=seed)

# Stack the BP rows on top of the SC rows and renumber the index from 0.
train = pd.concat([BP_train, SC_train], ignore_index=True)
test = pd.concat([BP_test, SC_test], ignore_index=True)

# 'score' is the column to predict; every other column is kept as a feature.
X_train, Y_train = train.drop(columns=['score']), train['score']
X_test, Y_test = test.drop(columns=['score']), test['score']

# Train and test rows pooled into a single data set (train rows first).
X = pd.concat([X_train, X_test], ignore_index=True)
Y = pd.concat([Y_train, Y_test], ignore_index=True)

## Preprocessing

In [34]:
# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA

train_PCA = PCA(n_components=2)
train_PCA.fit(X_train)
X_train_pca = train_PCA.transform(X_train)

plt.title('PCA train (all features)')
plt.scatter(X_train_pca[:,0], X_train_pca[:,1], alpha=0.5)
plt.show()

test_PCA = PCA(n_components=2)
test_PCA.fit(X_test)
X_test_pca = test_PCA.transform(X_test)
plt.title('PCA test (all features)')
plt.scatter(X_test_pca[:,0], X_test_pca[:,1], c="red", alpha=0.5)
plt.show()

In [None]:
# Nested cross-validation over the pooled data (X, Y): each outer fold tunes an
# ExtraTreesRegressor by grid search on its training part, then scores the
# refitted best model on its held-out part.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

# X is still in concatenation order (BP rows before SC rows inside each split)
# and the outer training indices come back sorted, so an integer `cv=5` would
# build contiguous, unshuffled inner folds dominated by one source. Use an
# explicitly shuffled splitter for the hyper-parameter search instead.
inner_cv = KFold(n_splits=5, shuffle=True, random_state=seed)

params = {
    "n_estimators": [10, 20, 50, 100],
    "min_samples_leaf": [5, 10, 20, 30],
    "max_depth": [5, 10, 20],
}

# Metric names of interest. Only 'neg_mean_absolute_error' is passed to the
# grid search in this cell; the tuple itself is not consumed here.
scoring = (
    'r2',
    'neg_mean_squared_error',
    'neg_mean_absolute_error',
)

# Held-out score of every outer fold, collected for the summary at the end.
outer_scores = []

# Fold-local names are used on purpose: reusing train/test/X_train/X_test/...
# here would overwrite the DataFrames and splits built by the earlier cells.
for train_idx, test_idx in kf.split(X, Y):
    X_fold_train, X_fold_test = X.iloc[train_idx], X.iloc[test_idx]
    Y_fold_train, Y_fold_test = Y.iloc[train_idx], Y.iloc[test_idx]

    et = ExtraTreesRegressor(random_state=seed)
    gs = GridSearchCV(
        et,
        param_grid=params,
        cv=inner_cv,
        scoring='neg_mean_absolute_error',
        n_jobs=4,
        return_train_score=True,
    )
    gs.fit(X_fold_train, Y_fold_train)

    print("Best score: ", gs.best_score_)
    print("Best params: ", gs.best_params_)

    # gs.score evaluates the refitted best estimator with the search metric
    # (negative MAE, so values closer to 0 are better).
    fold_score = gs.score(X_fold_test, Y_fold_test)
    outer_scores.append(fold_score)
    print(f"Score: {fold_score:.4f}")

# Overall nested-CV estimate across the outer folds.
outer_scores = pd.Series(outer_scores, name='neg_mean_absolute_error')
print(f"Nested CV score: {outer_scores.mean():.4f} +/- {outer_scores.std():.4f}")