In [14]:
import sys
sys.path.append('..')
from src import vectorize
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

In [15]:
# Display names handed to classification_report via `target_names`;
# position i in this list labels the i-th class in sorted label order.
target_names = [
    'PERIOD',
    'QMARK',
    'EXPOINT',
]

In [10]:
# Build the feature matrix X and label array y from the tokenized corpus.
# NOTE(review): one_hot_y=False presumably yields integer class ids rather
# than one-hot rows — confirm in src/vectorize.py.
X, y = vectorize.tokens_to_bag_of_words(
    '../data/processed/merged_tok.txt',
    one_hot_y=False,
)

# Partition into train / dev / test splits.
# NOTE(review): the two 0.05 arguments look like the dev and test fractions
# (leaving ~90% for training) — verify against train_dev_test_split.
train, dev, test = vectorize.train_dev_test_split(X, y, 0.05, 0.05)

In [11]:
# Each split is a (features, labels) pair; unpack both in one step.
(x_train, y_train), (x_dev, y_dev) = train, dev

# Flatten the label arrays with .ravel() before they are passed to
# clf.fit and the metric functions below.
y_train, y_dev = y_train.ravel(), y_dev.ravel()

In [26]:
# Random forest with 100 trees; random_state is pinned so repeated fits on
# the same data give the same model.
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=229,
)
# Left as the cell's last expression so the fitted estimator's repr is shown.
clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=229, verbose=0, warm_start=False)

In [27]:
# Predicted class labels for the dev split from the fitted forest.
y_pred = clf.predict(X=x_dev)

In [28]:
# Per-class precision / recall / F1 of the forest on the dev split.
print(
    classification_report(
        y_true=y_dev,
        y_pred=y_pred,
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

      PERIOD       0.86      0.99      0.92      1942
       QMARK       0.71      0.27      0.39       198
     EXPOINT       0.65      0.22      0.33       230

   micro avg       0.85      0.85      0.85      2370
   macro avg       0.74      0.49      0.55      2370
weighted avg       0.83      0.85      0.82      2370



In [29]:
# Rows are true classes, columns are predicted classes (scikit-learn convention).
confusion_matrix(y_true=y_dev, y_pred=y_pred)

array([[1914,   10,   18],
       [ 135,   54,    9],
       [ 167,   12,   51]])

In [30]:
# Baseline: predict label 0 for every dev example and score it the same way
# as the model, to put the forest's numbers in context.
print(
    classification_report(
        y_true=y_dev,
        y_pred=np.zeros_like(y_dev),
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

      PERIOD       0.82      1.00      0.90      1942
       QMARK       0.00      0.00      0.00       198
     EXPOINT       0.00      0.00      0.00       230

   micro avg       0.82      0.82      0.82      2370
   macro avg       0.27      0.33      0.30      2370
weighted avg       0.67      0.82      0.74      2370



  'precision', 'predicted', average, warn_for)


In [31]:
# Confusion matrix of the all-zeros baseline: every prediction lands in column 0.
confusion_matrix(y_true=y_dev, y_pred=np.zeros_like(y_dev))

array([[1942,    0,    0],
       [ 198,    0,    0],
       [ 230,    0,    0]])

In [32]:
# Build a list of guesses with exactly the same class distribution as y_dev
# (to be shuffled into a "stratified random" baseline). The per-class counts
# are derived from y_dev instead of being hard-coded from a printed report,
# so the list always has len(y_dev) entries even if the data or split changes.
# np.unique returns labels in sorted order, so the result is all of the
# smallest label first, then the next, and so on; .tolist() converts the
# entries to plain Python scalars so `guesses` stays an ordinary list.
_dev_labels, _dev_counts = np.unique(y_dev, return_counts=True)
guesses = np.repeat(_dev_labels, _dev_counts).tolist()

In [33]:
from random import shuffle

In [34]:
# Shuffle the guesses in place with a dedicated, seeded generator so the
# random baseline (and every metric computed from it) is reproducible on
# Restart & Run All. The seed matches the classifier's random_state (229).
# A private Random instance is used so the global `random` state is untouched.
from random import Random

Random(229).shuffle(guesses)

In [35]:
# Sanity check: display the first 100 shuffled guesses.
guesses[:100]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 2,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 2,
 0,
 2,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [36]:
# Score the shuffled, distribution-matched guesses against the dev labels.
print(
    classification_report(
        y_true=y_dev,
        y_pred=guesses,
        target_names=target_names,
    )
)
# Last expression of the cell, so the matrix is displayed below the report.
confusion_matrix(y_true=y_dev, y_pred=guesses)

              precision    recall  f1-score   support

      PERIOD       0.82      0.82      0.82      1942
       QMARK       0.10      0.10      0.10       198
     EXPOINT       0.09      0.09      0.09       230

   micro avg       0.69      0.69      0.69      2370
   macro avg       0.34      0.34      0.34      2370
weighted avg       0.69      0.69      0.69      2370



array([[1589,  155,  198],
       [ 166,   20,   12],
       [ 187,   23,   20]])

In [37]:
# All-zeros baseline report again (the identical call appears in an earlier
# cell); printed here right after the random-guess baseline.
print(
    classification_report(
        y_true=y_dev,
        y_pred=np.zeros_like(y_dev),
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

      PERIOD       0.82      1.00      0.90      1942
       QMARK       0.00      0.00      0.00       198
     EXPOINT       0.00      0.00      0.00       230

   micro avg       0.82      0.82      0.82      2370
   macro avg       0.27      0.33      0.30      2370
weighted avg       0.67      0.82      0.74      2370



  'precision', 'predicted', average, warn_for)
