In [1]:
import time
import warnings
import numpy as np

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, matthews_corrcoef, precision_score, recall_score

# Seed NumPy's global RNG so that anything drawing from it is repeatable when
# the notebook is re-run from a fresh kernel.
np.random.seed(42)

# NOTE(review): with no category/module arguments this silences every warning
# for the rest of the session — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Wall-clock timestamp taken at start-up; it is not read again in the cells
# visible here.
start = time.time()


def scoring(model, x_test, y_test) -> tuple:
    """Evaluate a fitted classifier on a held-out set.

    Parameters
    ----------
    model
        Any fitted object exposing ``predict(x_test)``.
    x_test
        Feature data passed unchanged to ``model.predict``.
    y_test
        True labels aligned with ``x_test``.

    Returns
    -------
    tuple
        ``(precision, recall, f1, matthews_corrcoef)``, each computed with the
        metric function's default settings.
    """
    # Predict once and reuse the result: the four metrics all compare against
    # the same predictions, so there is no reason to run inference four times.
    y_pred = model.predict(x_test)
    return (
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        matthews_corrcoef(y_test, y_pred),
    )


def make_table():
    """Return an empty DataFrame whose columns are the result-table headers."""
    # Header names are kept exactly as the rest of the project spells them.
    headers = (
        "Score",
        "Precision _score",
        "Recall_score",
        "F1_score",
        "Matthews_Corr_Coef",
    )
    return pd.DataFrame(columns=list(headers))


# Load only the weather features and the label used by the model.
df = pd.read_csv(
    "/home/david/Documents/ARU/AvalancheProject/demo/load/balanced_cleaned.csv",
    usecols=[
        "Max Temp (°C)",
        "Total Rain (mm)",
        "Total Snow (cm)",
        "Total Precip (mm)",
        "Snow on Grnd (cm)",
        "avalanche",
    ],
)

# Select the label by name, not by position: read_csv returns the requested
# columns in the order they appear in the file, not the order listed in
# `usecols`, so "the last column" is only the label if it is also last in the CSV.
target_column = "avalanche"
X = df.drop(columns=[target_column])
y = df[target_column]


# No random_state is passed, so the split draws from NumPy's global RNG and is
# repeatable only because np.random.seed(42) has already run in this cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Default pipeline layout; the step names are the keys that the grid-search
# parameters refer to.
# NOTE(review): chaining MinMaxScaler into StandardScaler looks redundant —
# the second rescaling should undo the effect of the first; confirm intent.
estimators = [
    ("minmaxscaler", MinMaxScaler()),
    ("standardscaler", StandardScaler()),
    ("reduce_dim", PCA()),
    ("clf", RandomForestClassifier()),
]

pipe = Pipeline(estimators)

In [2]:
# Display the (still unfitted) pipeline.
pipe

In [4]:
# Search space for the grid search. Keys that match a pipeline step name swap
# the whole step ("passthrough" disables it); `clf__*` keys set parameters on
# the "clf" step.
param_grid = {
    "minmaxscaler": ["passthrough", MinMaxScaler()],
    "standardscaler": ["passthrough", StandardScaler()],
    "reduce_dim": ["passthrough", PCA(2), PCA(4)],
    "clf__n_estimators": [10, 30, 50, 100],
    "clf__max_features": ["sqrt", 0.25, 0.5, 0.75, 1.0],
    "clf__max_depth": [4, 5, 6, 7, 8],
}

In [5]:
from sklearn.model_selection import ParameterGrid

# Expand the grid to count how many parameter combinations it contains.
pg = ParameterGrid(param_grid)
len(pg)

1200

In [8]:
# Inspect the grid as stored on the ParameterGrid object.
pg.param_grid

[{'minmaxscaler': ['passthrough', MinMaxScaler()],
  'standardscaler': ['passthrough', StandardScaler()],
  'reduce_dim': ['passthrough', PCA(n_components=2), PCA(n_components=4)],
  'clf__n_estimators': [10, 30, 50, 100],
  'clf__max_features': ['sqrt', 0.25, 0.5, 0.75, 1.0],
  'clf__max_depth': [4, 5, 6, 7, 8]}]

In [15]:
# Exhaustive search over `param_grid`, selecting the candidate with the best
# cross-validated recall.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="recall",
)

In [16]:
# Run the search on the training split only; the test split stays held out.
grid_search.fit(X_train, y_train)

In [17]:
# Show the pipeline configuration selected by the search.
grid_search.best_estimator_

In [18]:
# Evaluate the selected pipeline on the held-out test split.
ps, rs, fs, mcc = scoring(grid_search.best_estimator_, X_test, y_test)

In [19]:
# Display (precision, recall, F1, Matthews correlation coefficient), in that order.
ps, rs, fs, mcc

(0.7575757575757576,
 0.8333333333333334,
 0.7936507936507938,
 0.5431054447620561)

In [20]:
# Score of the selected pipeline on the test split. NOTE(review): presumably
# this is the classifier's default mean accuracy, i.e. not the recall metric
# the search optimised — confirm before comparing the two numbers.
grid_search.best_estimator_.score(X_test, y_test)

0.7719298245614035