In [1]:
import sklearn_crfsuite
import pickle
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn_crfsuite import metrics
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from helper.dataset_reader import read_tsv
from helper.features import feature_extraction_basic, feature_extraction_added

# import warnings filter
from warnings import simplefilter

# Suppress every FutureWarning for the rest of the session so that they do not
# clutter the cell outputs below.
simplefilter('ignore', FutureWarning)

In [5]:
# Location of the tagged dataset (TSV), relative to this notebook.
DATASET_PATH = '../raw dataset/all-tagged-090422-merged.tsv'

# Parse the file with the project's TSV reader.
data = read_tsv(DATASET_PATH)

In [6]:
# data[0] is iterated as (tokens, tags) pairs. Walk it once, then split the
# pairs into two parallel lists: X[i] is one sequence's tokens, y[i] its tags.
pairs = [(tokens, tags) for tokens, tags in data[0]]
X = [tokens for tokens, _ in pairs]
y = [tags for _, tags in pairs]

In [8]:
# Split settings: 30% of the sequences are held out for testing, and the fixed
# seed makes the split repeatable across runs.
TEST_FRACTION = 0.30
SPLIT_SEED = 0

# X_train_ / X_test_ still hold raw token sequences at this point.
X_train_, X_test_, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [9]:
# Hyper-parameters of the CRF tagger, gathered in one place.
crf_params = {
    # Optimization algorithm used to fit the model parameters.
    'algorithm': 'lbfgs',
    # Coefficient for Lasso (L1) regularization.
    'c1': 0.1,
    # Coefficient for Ridge (L2) regularization.
    'c2': 0.1,
    # Upper bound on the number of optimizer iterations.
    'max_iterations': 100,
    # Also generate transition features that never occur in the training data.
    'all_possible_transitions': True,
}

crf = sklearn_crfsuite.CRF(**crf_params)

In [10]:
# Convert each training sentence into the feature sequence that is fed to the CRF.
# NOTE(review): the second argument (5) is passed straight to
# feature_extraction_basic; its meaning is defined in helper.features — confirm.
X_train = [feature_extraction_basic(tokens, 5) for tokens in X_train_]

try:
    crf.fit(X_train, y_train)
except AttributeError as err:
    # sklearn_crfsuite's CRF is known to trip over a missing `keep_tempfiles`
    # attribute (presumably a scikit-learn version incompatibility — confirm).
    # Tolerate only that specific failure; any other AttributeError points at
    # a real problem and must not be hidden, otherwise the cells below would
    # silently run against an unfitted model.
    if 'keep_tempfiles' not in str(err):
        raise

In [11]:
# Tags to include in the evaluation report, listed in the order in which the
# report rows should appear.
labels = [
    'ID',
    'JV',
    'EN',
    'NE',
    'O',
    'MIX-ID-EN',
    'MIX-ID-JV',
    'MIX-JV-EN',
]

In [12]:
# Featurize the held-out sentences with the same extractor and the same second
# argument (5) that the training sentences were given.
X_test = [feature_extraction_basic(tokens, 5) for tokens in X_test_]

# Predict one tag sequence per test sentence.
y_pred = crf.predict(X_test)

# Print a per-label precision/recall/F1 report computed over the flattened
# sequences, restricted to the tags listed in `labels`.
print('Testing Data Performance')
report = metrics.flat_classification_report(y_test, y_pred, labels=labels)
print(report)

Testing Data Performance
              precision    recall  f1-score   support

          ID       0.88      0.97      0.92     14891
          JV       0.85      0.85      0.85      4294
          EN       0.93      0.68      0.79      2076
          NE       0.90      0.38      0.54       900
           O       0.97      0.94      0.96      5909
   MIX-ID-EN       0.99      0.85      0.91       771
   MIX-ID-JV       0.98      0.82      0.89       256
   MIX-JV-EN       0.95      0.82      0.88       227

    accuracy                           0.90     29324
   macro avg       0.93      0.79      0.84     29324
weighted avg       0.90      0.90      0.90     29324



In [14]:
# Ask eli5 to display the weights of the fitted CRF model.
# NOTE(review): as recorded in this cell's output, the call currently fails
# with "AttributeError: 'CRF' object has no attribute 'keep_tempfiles'".
# Presumably this is an sklearn_crfsuite / scikit-learn version mismatch —
# confirm, then pin compatible versions so the notebook survives
# Restart & Run All.
# NOTE(review): this import sits outside the notebook's top import cell;
# consider moving it there once the cell works.
import eli5
eli5.show_weights(crf)

AttributeError: 'CRF' object has no attribute 'keep_tempfiles'