In [1]:
import sklearn_crfsuite

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from helper.dataset_reader import read_tsv
from helper.features import feature_extraction_basic

# import warnings filter
from warnings import simplefilter

# Ignore every FutureWarning raised from this point on, which keeps that
# category of message out of the cell outputs. The filter is installed
# after the imports above have already run, so it does not affect warnings
# emitted while those modules were being imported.
simplefilter(action='ignore', category=FutureWarning)

In [2]:
!pip install -U 'scikit-learn<0.24'

You should consider upgrading via the '/Users/ahmadfathanhidayatullah/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
# Load the dataset from the TSV file through the project's read_tsv helper.
# NOTE(review): the following cell iterates data[0] as (tokens, tags) pairs;
# what any further elements of `data` hold is not visible from this notebook.
data = read_tsv('../raw dataset/all-tagged-090422-merged.tsv')

In [4]:
# Separate the (tokens, tags) pairs found in data[0] into two parallel
# lists: X collects the token sequences, y the matching tag sequences.
X, y = [], []

for tokens, tags in data[0]:
    X += [tokens]
    y += [tags]

In [5]:
# Hold out 30% of the sequences for testing; the fixed random_state makes
# the split repeatable from run to run.
X_train_, X_test_, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

In [6]:
# Configure the CRF sequence tagger.
crf = sklearn_crfsuite.CRF(
    # Training algorithm: L-BFGS.
    algorithm='lbfgs',
    # Regularisation strengths: c1 weights the L1 (Lasso) penalty,
    # c2 weights the L2 (Ridge) penalty.
    c1=0.1,
    c2=0.1,
    # Upper bound on the number of optimiser iterations.
    max_iterations=100,
    # Also generate transition features for label pairs that never occur
    # in the training data.
    all_possible_transitions=True,
)

In [7]:
# Extract features for every training sequence.
# NOTE(review): the second argument (5) is handed to the project helper
# unchanged and the evaluation cell passes the same value -- presumably an
# n-gram/window size; confirm in helper.features.
X_train = [feature_extraction_basic(tokens, 5) for tokens in X_train_]

# Fit the CRF on the training features. The call is left as the last
# expression of the cell so the fitted estimator is displayed.
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [8]:
# Tag set passed to classification_report; the order here is the row order
# of the printed report.
labels = [
    'ID', 'JV', 'EN',
    'NE', 'O',
    'MIX-ID-EN', 'MIX-ID-JV', 'MIX-JV-EN',
]

In [9]:
# Extract features for the held-out sequences with the same second argument
# (5) that the training cell uses.
X_test = [feature_extraction_basic(tokens, 5) for tokens in X_test_]

# Predict one tag sequence per test sequence.
y_pred = crf.predict(X_test)
print('Testing Data Performance')

# classification_report is given flat label lists, so flatten the gold and
# the predicted tag sequences tag by tag.
flat_y = [tag for gold_seq in y_test for tag in gold_seq]
flat_y_pred = [tag for pred_seq in y_pred for tag in pred_seq]
print(classification_report(flat_y, flat_y_pred, labels=labels))

Testing Data Performance
              precision    recall  f1-score   support

          ID       0.88      0.97      0.92     14891
          JV       0.85      0.85      0.85      4294
          EN       0.93      0.68      0.79      2076
          NE       0.90      0.38      0.54       900
           O       0.97      0.94      0.96      5909
   MIX-ID-EN       0.99      0.85      0.91       771
   MIX-ID-JV       0.98      0.82      0.89       256
   MIX-JV-EN       0.95      0.82      0.88       227

    accuracy                           0.90     29324
   macro avg       0.93      0.79      0.84     29324
weighted avg       0.90      0.90      0.90     29324



In [15]:
# Display the weights learned by the fitted CRF with eli5; the rendered
# output contains a label-to-label transition table and the highest-weighted
# features per label.
# NOTE(review): this import sits in a late cell; notebook convention is to
# keep all imports together in the first cell.
import eli5
eli5.show_weights(crf)


From \ To,EN,ID,JV,MIX-ID-EN,MIX-ID-JV,MIX-JV-EN,NE,O
EN,1.23,0.038,-0.609,-0.388,-2.0,-1.511,0.099,1.608
ID,0.093,0.95,-0.917,0.596,-0.491,-1.561,0.229,1.744
JV,-0.439,-0.836,1.45,-2.1,0.236,0.56,0.017,1.61
MIX-ID-EN,-0.611,0.19,-2.145,-0.365,-1.838,-2.555,-0.611,1.307
MIX-ID-JV,-1.44,-0.551,0.348,-2.802,-0.523,-0.652,-0.417,0.724
MIX-JV-EN,-0.634,-1.004,0.748,-2.431,-0.793,-0.308,0.04,1.349
NE,-0.521,-0.032,-0.279,-0.863,-1.013,-2.218,1.594,1.686
O,1.925,2.342,2.152,1.683,1.457,1.064,1.857,4.494

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7
+6.331,n_gram_0:hp,,,,,,
+6.176,n_gram_0:error,,,,,,
+6.158,n_gram_0:hape,,,,,,
+5.945,n_gram_0:well,,,,,,
+5.796,n_gram_0:nder,,,,,,
+5.767,n_gram_0:event,,,,,,
+5.754,n_gram_0:chat,,,,,,
+5.591,n_gram_0:break,,,,,,
+5.557,n_gram_0:dm,,,,,,
+5.549,n_gram_0:RT,,,,,,

Weight?,Feature
+6.331,n_gram_0:hp
+6.176,n_gram_0:error
+6.158,n_gram_0:hape
+5.945,n_gram_0:well
+5.796,n_gram_0:nder
+5.767,n_gram_0:event
+5.754,n_gram_0:chat
+5.591,n_gram_0:break
+5.557,n_gram_0:dm
+5.549,n_gram_0:RT

Weight?,Feature
+5.861,n_gram_0:dan
+5.685,n_gram_0:kan
+5.338,n_gram_0:film
+5.205,n_gram_0:ini
+4.970,n_gram_0:jadul
+4.937,n_gram_0:ada
+4.919,n_gram_0:juga
+4.878,n_gram_0:aja
+4.853,n_gram_0:pernah
+4.816,n_gram_0:yg

Weight?,Feature
+6.417,n_gram_0:e
+5.893,n_gram_0:iki
+5.835,n_gram_0:sing
+5.712,n_gram_0:nek
+5.699,n_gram_0:wis
+5.489,n_gram_0:wae
+5.482,n_gram_0:yo
+5.455,n_gram_0:karo
+5.256,n_gram_0:ae
+5.190,n_gram_0:iso

Weight?,Feature
+7.092,n_gram_0:dicopy
+6.794,n_gram_0:dikick
+6.729,n_gram_0:dicut
+6.495,n_gram_0:dilist
+6.373,n_gram_0:ngeadd
+6.110,n_gram_0:ngelag
+6.072,n_gram_0:accnya
+6.013,n_gram_0:dimute
+5.892,n_gram_0:diskip
+5.823,n_gram_0:disave

Weight?,Feature
+7.359,n_gram_0:beline
+7.230,n_gram_0:jadine
+6.949,n_gram_0:minume
+6.801,n_gram_0:uange
+6.730,n_gram_0:soale
+6.668,n_gram_0:hapuse
+6.577,n_gram_0:bajune
+6.559,n_gram_0:carine
+4.823,n_gram_0:Jadine
+4.566,n_gram_0:Beline

Weight?,Feature
+8.063,n_gram_0:hpne
+7.131,n_gram_0:nyepam
+6.837,n_gram_0:Hpne
+6.792,n_gram_0:kesave
+6.565,n_gram_0:dicopy
+6.242,n_gram_0:usere
+5.972,n_gram_0:HPne
+5.823,n_gram_0:discan
+5.730,n_gram_0:dikick
+5.168,n_gram_0:ditake

Weight?,Feature
+7.265,n_gram_0:tokped
+6.068,n_gram_0:shopee
+6.002,n_gram_0:ig
+5.648,n_gram_0:wa
+5.506,n_gram_0:twt
+5.355,n_gram_0:Tokped
+5.152,n_gram_0:exo-l
+5.066,n_gram_0:xl
+5.015,n_gram_0:XL
+4.966,n_gram_0:F1

Weight?,Feature
+8.980,n_gram_0:wkwk
+8.541,n_gram_0:wkwkwk
+7.558,n_gram_0:ㅤ
+6.903,n_gram_0:wkwkw
+6.852,n_gram_0:haha
+6.557,n_gram_0:hehe
+6.189,n_gram_0:.
+6.105,"n_gram_0:,"
+5.647,n_gram_0:hahaha
+5.477,n_gram_1:ahaha
