In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import os

import sys; sys.path.append("../../")
from modules.utils import load_yaml, load_pkl, make_directory, save_pkl, save_yaml

# Read Data

Note: in the submission, we cannot use `test.csv`. This has no labels and it is for submission only. So we need to use validation data as the test

--> This should be OK since the data size seems large enough

In [7]:
TRAIN_CONFIG_PATH = '../../config/train_config.yaml'
DATA_PATH = '../../data/01_split/'

config = load_yaml(TRAIN_CONFIG_PATH)
LABEL_ENCODING = config['LABEL_ENCODING']

train_df = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
valid_df = pd.read_csv(os.path.join(DATA_PATH, 'valid.csv'))

../../config/train_config.yaml


## Split into train and test

As said, we will use val and test as the same 

In [15]:
train_X, train_y = train_df.loc[:,train_df.columns!='leaktype'], train_df['leaktype']
valid_X, valid_y = valid_df.loc[:,train_df.columns!='leaktype'], valid_df['leaktype']

train_y = train_y.replace(LABEL_ENCODING)
valid_y = valid_y.replace(LABEL_ENCODING)

# Same testing and validation
test_X, test_y = valid_X, valid_y

# Train Model

In [9]:
rf = RandomForestClassifier(max_features = 500,n_estimators = 600,max_depth = 3,min_samples_split = 2,max_leaf_nodes = 30, random_state=123)
rf.fit(train_X, train_y)

RandomForestClassifier(max_depth=3, max_features=500, max_leaf_nodes=30,
                       n_estimators=600, random_state=123)

In [44]:
# !pip install pycaret

In [41]:
# XG

import xgboost

xgb = xgboost.XGBClassifier(verbose=1)
xgb.fit(train_X, train_y)

Parameters: { "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




KeyboardInterrupt: 

# Prediction

In [20]:
pred_y = rf.predict(test_X)

# Metrics evaluation

We need to get a few metrics right

In [25]:
encoding_to_label = {v: k for k, v in LABEL_ENCODING.items()}

{0: 'other', 1: 'noise', 2: 'normal', 3: 'in', 4: 'out'}


In [36]:
from sklearn.metrics import accuracy_score, f1_score, classification_report, multilabel_confusion_matrix, plot_roc_curve


accuracy = accuracy_score(test_y, pred_y)
print(f'Mean accuracy score: {accuracy:.3}')

labels = [key for key in LABEL_ENCODING.keys()]

f1 = f1_score(test_y, pred_y, average='macro')
print('f1 score :', f1)

print(classification_report(test_y, pred_y, target_names=labels))

Mean accuracy score: 0.681
f1 score : 0.3531721582130049
              precision    recall  f1-score   support

       other       0.47      0.58      0.52      1415
       noise       0.55      0.29      0.38      1011
      normal       0.78      0.97      0.86      3578
          in       0.00      0.00      0.00       369
         out       0.00      0.00      0.00       347

    accuracy                           0.68      6720
   macro avg       0.36      0.37      0.35      6720
weighted avg       0.60      0.68      0.63      6720



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
multilabel_confusion_matrix(test_y, pred_y)

array([[[4367,  938],
        [ 588,  827]],

       [[5470,  239],
        [ 716,  295]],

       [[2176,  966],
        [ 123, 3455]],

       [[6351,    0],
        [ 369,    0]],

       [[6373,    0],
        [ 347,    0]]])