In [1]:
import sklearn_crfsuite

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from helper.dataset_reader import read_tsv
from helper.features import *

# import warnings filter
from warnings import simplefilter

# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [2]:
data = read_tsv('../dataset/all-merged-data-140422.tsv')

In [3]:
X = []
y = []

for tokens, tags in data[0]:
    X.append(tokens)
    y.append(tags)

In [4]:
X_train_, X_test_, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [5]:
X_train = []
for token in X_train_:
    X_train.append(feature_extraction(tokens, 5))

In [6]:
X_train[0]

[{'n_gram_0': 'Mau',
  'is_alpha': True,
  'is_numeric': False,
  'is_capital': False,
  'contains_alpha': True,
  'contains_numeric': False,
  'contains_aphostrope': False},
 {'n_gram_0': 'diblock',
  'is_alpha': True,
  'is_numeric': False,
  'is_capital': False,
  'contains_alpha': True,
  'contains_numeric': False,
  'contains_aphostrope': False,
  'n_gram_1': 'ibloc'},
 {'n_gram_0': 'tapi',
  'is_alpha': True,
  'is_numeric': False,
  'is_capital': False,
  'contains_alpha': True,
  'contains_numeric': False,
  'contains_aphostrope': False},
 {'n_gram_0': 'masih',
  'is_alpha': True,
  'is_numeric': False,
  'is_capital': False,
  'contains_alpha': True,
  'contains_numeric': False,
  'contains_aphostrope': False},
 {'n_gram_0': 'temen',
  'is_alpha': True,
  'is_numeric': False,
  'is_capital': False,
  'contains_alpha': True,
  'contains_numeric': False,
  'contains_aphostrope': False},
 {'n_gram_0': 'tapi',
  'is_alpha': True,
  'is_numeric': False,
  'is_capital': False,
  'co

In [7]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',  # for gradient descent for optimization and getting model parameters
    c1=0.1, # Coefficient for Lasso (L1) regularization
    c2=0.1, # Coefficient for Ridge (L2) regularization
    max_iterations=100, # The maximum number of iterations for optimization algorithms, iteration for the gradient descent optimization
    all_possible_transitions=True   # Specify whether CRFsuite generates transition features that do not even occur in the training data
)

In [8]:
X_train = []
for tokens in X_train_:
    X_train.append(feature_extraction(tokens, 5))

#try:
crf.fit(X_train, y_train)
#except AttributeError:
#    pass

CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [9]:
labels = ['ID', 'JV', 'EN', 'O', 'MIX-ID-EN', 'MIX-ID-JV', 'MIX-JV-EN']

In [10]:
X_test = []
for tokens in X_test_:
    X_test.append(feature_extraction(tokens, 5))

y_pred = crf.predict(X_test)
print('Testing Data Performance')
flat_y = [item for y_ in y_test for item in y_]
flat_y_pred = [item for y_pred_ in y_pred for item in y_pred_]
print(classification_report(flat_y, flat_y_pred, labels=labels))

Testing Data Performance
              precision    recall  f1-score   support

          ID       0.89      0.97      0.93     22317
          JV       0.86      0.83      0.85      4915
          EN       0.92      0.82      0.87      5005
           O       0.97      0.87      0.92     10161
   MIX-ID-EN       0.98      0.83      0.90       976
   MIX-ID-JV       0.98      0.83      0.90       289
   MIX-JV-EN       0.96      0.82      0.88       274

    accuracy                           0.91     43937
   macro avg       0.94      0.85      0.89     43937
weighted avg       0.91      0.91      0.91     43937



In [11]:
import eli5
eli5.show_weights(crf, horizontal_layout=False)


From \ To,EN,ID,JV,MIX-ID-EN,MIX-ID-JV,MIX-JV-EN,O
EN,1.547,-0.218,-1.051,-0.498,-2.497,-1.573,1.257
ID,-0.013,0.816,-1.166,0.297,-0.834,-1.672,1.328
JV,-0.856,-1.163,1.21,-2.617,-0.09,0.295,0.989
MIX-ID-EN,-0.34,0.328,-1.748,0.12,-1.721,-2.371,1.068
MIX-ID-JV,-1.242,-0.472,0.507,-2.907,-0.686,-0.33,0.685
MIX-JV-EN,-1.058,-1.398,0.408,-1.859,-1.174,-0.812,0.618
O,2.157,2.269,2.029,1.581,1.206,0.651,3.943

Weight?,Feature
+6.462,n_gram_0:hp
+6.205,n_gram_0:hape
+6.166,n_gram_0:error
+6.029,n_gram_0:well
+5.824,n_gram_0:chat
+5.646,n_gram_0:RT
+5.634,n_gram_0:event
+5.604,n_gram_0:dm
+5.592,n_gram_0:nder
+5.530,n_gram_0:switch

Weight?,Feature
+6.199,n_gram_0:dan
+5.767,n_gram_0:di
+5.764,n_gram_0:kan
+5.540,n_gram_0:ini
+5.227,n_gram_0:ada
+5.210,n_gram_0:juga
+5.187,n_gram_0:buat
+5.163,n_gram_0:itu
+5.147,n_gram_0:sama
+5.089,n_gram_0:tenang

Weight?,Feature
+6.513,n_gram_0:e
+6.180,n_gram_0:iki
+5.976,n_gram_0:nek
+5.850,n_gram_0:wis
+5.765,n_gram_0:wae
+5.614,n_gram_0:karo
+5.346,n_gram_0:ae
+5.262,n_gram_0:ono
+5.235,n_gram_0:iso
+5.189,n_gram_0:ra

Weight?,Feature
+7.265,n_gram_0:dicopy
+6.564,n_gram_0:ngepin
+6.541,n_gram_0:dicut
+6.484,n_gram_0:dilist
+6.437,n_gram_0:ngeadd
+6.412,n_gram_0:dikick
+6.167,n_gram_0:ngelag
+6.107,n_gram_0:dimute
+6.066,n_gram_0:diskip
+6.031,n_gram_0:diacc

Weight?,Feature
+7.257,n_gram_0:soale
+7.165,n_gram_0:jadine
+6.963,n_gram_0:minume
+6.930,n_gram_0:uange
+6.824,n_gram_0:beline
+6.486,n_gram_0:carine
+6.355,n_gram_0:bajune
+6.318,n_gram_0:haruse
+6.200,n_gram_0:hapuse
+5.838,n_gram_0:Haruse

Weight?,Feature
+8.207,n_gram_0:hpne
+7.065,n_gram_0:nyepam
+6.878,n_gram_0:dicopy
+6.673,n_gram_0:kesave
+6.239,n_gram_0:dikick
+6.235,n_gram_0:Hpne
+6.121,n_gram_0:usere
+5.770,n_gram_0:discan
+5.281,n_gram_0:HPne
+5.253,n_gram_0:ditake

Weight?,Feature
+7.797,n_gram_0:wkwk
+7.094,n_gram_0:tokped
+7.087,n_gram_0:wkwkwk
+6.570,n_gram_0:.
+6.235,n_gram_0:haha
+6.084,n_gram_0:ㅤ
+6.082,"n_gram_0:,"
+5.840,n_gram_0:shopee
+5.804,n_gram_0:ig
+5.527,n_gram_0:wkwkw
