In [1]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from dataset_preparation import unpack_labels
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np

In [34]:
# Load the (features, labels) pair for each split in one pass.
# NOTE: joblib.load is pickle-based, so it can execute arbitrary code while
# deserialising; only point it at files from a trusted source.
(train_x, train_y), (dev_x, dev_y), (eval_x, eval_y) = (
    joblib.load(split_path)
    for split_path in (
        'ewt/train_ewt_two.sav',
        'ewt/dev_ewt_two.sav',
        'ewt/eval_ewt_two.sav',
    )
)

In [35]:
# Print, in train / dev / eval order, whether each feature table holds at
# least one missing value.
for split_features in (train_x, dev_x, eval_x):
    print(split_features.isnull().values.any())

True
True
False


In [36]:
# Run every split's labels through the project helper `unpack_labels`
# (imported from dataset_preparation; its exact output format is not visible
# here -- TODO confirm).
# The names are rebound in place, so re-running this cell without reloading
# the data would apply `unpack_labels` a second time.
train_y, dev_y, eval_y = map(unpack_labels, (train_y, dev_y, eval_y))

In [37]:
# Single-step preprocessing pipeline: fill missing feature values with the
# per-column median.
pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])

# The medians are learned from the training split only and then reused on the
# dev split, so no dev statistics leak into preprocessing.
# NOTE(review): eval_x is not passed through the pipeline in this cell; the
# null check above printed False for it, but confirm it is transformed before
# any model fitted on these arrays is asked to score it.
train_x = pipeline.fit_transform(train_x)
dev_x = pipeline.transform(dev_x)

In [17]:
# Hyper-parameter search space for the random forest. Keys holding a single
# candidate pin that setting for every combination tried.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_features=['sqrt', 'log2'],
    class_weight=['balanced'],
    random_state=[42],
)

In [18]:
# Tune the random forest with a 5-fold cross-validated grid search over
# `param_grid`.
#
# Candidates are ranked by macro-averaged F1 instead of plain accuracy. The
# class supports in the dev report range from 21 to 12998 samples, and on data
# that skewed accuracy is dominated by the largest classes: a model that
# ignores the rare seizure types can still win the search. Macro F1 weights
# every class equally, which matches the intent of class_weight='balanced'.
# (Assumes the training split is imbalanced in a similar way -- TODO confirm.)
#
# The base estimator is seeded so the search stays reproducible even if
# `random_state` is ever dropped from `param_grid`; a value supplied by the
# grid still takes precedence.
forest_model = RandomForestClassifier(random_state=42)
grid_search_md = GridSearchCV(
    forest_model,
    param_grid,
    cv=5,
    scoring='f1_macro',
)
grid_search_md.fit(train_x, train_y)

In [19]:
# Show the winning hyper-parameter combination, then keep a handle on the
# estimator the search refitted with those settings.
print(grid_search_md.best_params_)
final_model = grid_search_md.best_estimator_

{'class_weight': 'balanced', 'max_features': 'log2', 'n_estimators': 200, 'random_state': 42}


In [21]:
# Score the tuned model on the dev split.
pred = final_model.predict(dev_x)

# Per-class precision / recall / F1 to four decimals. zero_division=0 reports
# 0 (without a warning) for classes whose metric would otherwise be undefined.
report = classification_report(
    y_true=dev_y,
    y_pred=pred,
    digits=4,
    zero_division=0,
)

In [22]:
# Display the dev-set classification report built in the previous cell.
print(report)

              precision    recall  f1-score   support

        absz     0.7309    0.7309    0.7309       327
        bckg     0.2908    1.0000    0.4506      6953
        cpsz     0.0000    0.0000    0.0000        34
        fnsz     0.0000    0.0000    0.0000      3999
        gnsz     1.0000    0.0051    0.0101     12998
        tcsz     0.0000    0.0000    0.0000        47
        tnsz     0.0000    0.0000    0.0000        21

    accuracy                         0.2977     24379
   macro avg     0.2888    0.2480    0.1702     24379
weighted avg     0.6259    0.2977    0.1437     24379



In [38]:
# Baseline forest trained directly, without a search, using the same settings
# the grid search printed as its best parameters.
# NOTE(review): the report for this model differs from the grid-search model's
# report even though the settings and seed match. Together with the
# out-of-order execution counts this suggests the search was fitted on a
# different version of train_x -- verify with Restart & Run All.
no_opt_model = RandomForestClassifier(
    class_weight='balanced',
    max_features='log2',
    n_estimators=200,
    random_state=42,
)
no_opt_model.fit(train_x, train_y)

In [39]:
# Evaluate the directly-trained baseline on the dev split and print its
# per-class metrics (four decimals; undefined metrics are reported as 0).
pred = no_opt_model.predict(dev_x)
report = classification_report(
    y_true=dev_y,
    y_pred=pred,
    digits=4,
    zero_division=0,
)
print(report)

              precision    recall  f1-score   support

        absz     0.8445    0.7309    0.7836       327
        bckg     0.2886    1.0000    0.4479      6953
        cpsz     0.0000    0.0000    0.0000        34
        fnsz     0.0000    0.0000    0.0000      3999
        gnsz     0.0000    0.0000    0.0000     12998
        tcsz     0.0000    0.0000    0.0000        47
        tnsz     0.0000    0.0000    0.0000        21

    accuracy                         0.2950     24379
   macro avg     0.1619    0.2473    0.1759     24379
weighted avg     0.0936    0.2950    0.1382     24379

