In [1]:
import sklearn_crfsuite
import pandas as pd

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from helper.dataset_reader import read_tsv
from helper.features import *

# import warnings filter
from warnings import simplefilter

# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [2]:
data = read_tsv('../dataset/all-merged-data-140422.tsv')

In [3]:
dt, all_words, all_tags = data

In [4]:
# Convert to dataframe [Tweet | Tags]
df_tweet_tags = pd.DataFrame(dt, columns=['Tweets','Tags'])
# df_tweet_tags.head()

In [5]:
token_tag_pair = []

for index, row in df_tweet_tags.iterrows():
    pair = list(zip(row['Tweets'], row['Tags']))
    token_tag_pair.append(pair)

token_tag_pair

[[('Tumben', 'ID'),
  ('xl', 'O'),
  ('banter', 'ID'),
  ('go', 'JV'),
  ('download', 'EN'),
  (',', 'O'),
  (',', 'O'),
  ('download', 'EN'),
  ('video', 'ID'),
  ('tom', 'O'),
  ('jerry', 'O'),
  ('ya', 'ID'),
  ('masuk', 'ID'),
  ('kie', 'JV'),
  ('https://t.co/SmrXmut7wk', 'O')],
 [('@myXL', 'O'),
  ('@myXLCare', 'O'),
  ('knp', 'ID'),
  ('xl', 'O'),
  ('skr', 'ID'),
  ('jd', 'ID'),
  ('susah', 'ID'),
  ('signal', 'EN'),
  ('ya', 'ID'),
  (',', 'O'),
  ('dan', 'ID'),
  ('download', 'EN'),
  ('paling', 'ID'),
  ('banter', 'ID'),
  ('70kbps', 'EN'),
  ('.', 'O'),
  ('Sangat', 'ID'),
  ('disayangkan', 'ID')],
 [('Lak', 'JV'),
  ('download', 'EN'),
  ('nggawe', 'JV'),
  ('cl', 'O'),
  ('iku', 'JV'),
  ('subuh', 'ID'),
  ('baru', 'ID'),
  ('banter', 'ID'),
  ('.', 'O'),
  ('Lak', 'JV'),
  ('sore-bengi', 'JV'),
  ('lemot', 'JV'),
  ("''", 'O'),
  ('@kecepoood', 'O'),
  (':', 'O'),
  ('XL', 'O'),
  ('labil', 'ID'),
  ('donlod', 'EN'),
  ('munggah', 'JV'),
  ('mudun', 'JV'),
  ('😩', 'O'),


In [6]:
def sent2features(sent):
    return [token2features(sent, i) for i in range(len(sent))]


def sent2labels(sent):
    return [label for token, label in sent]


def sent2tokens(sent):
    return [token for token, label in sent]

In [7]:
X = [sent2features(s) for s in token_tag_pair]
y = [sent2labels(s) for s in token_tag_pair]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=0)

In [8]:
X_train[20]

[{'token.tag': 'O',
  'n_gram_0': '@sudjiwotedjo',
  'token.lower': '@sudjiwotedjo',
  'token.prefix_2': '@s',
  'token.prefix_3': '@su',
  'token.suffix_2': 'jo',
  'token.suffix_3': 'djo',
  'token.length': 13,
  'token.is_alpha': False,
  'token.is_numeric': False,
  'token.is_capital': False,
  'token.is_title': False,
  'token.startswith_symbols': True,
  'token.contains_numeric': False,
  'token.contains_capital': False,
  'token.contains_quotes': False,
  'token.contains_hyphen': False,
  'BOS': True,
  '+1:token.lower': 'mbah',
  '+1:token.is_alpha': True,
  '+1:token.is_numeric': False,
  '+1:token.is_capital': False,
  '+1:token.is_title': True,
  '+1:token.startswith_symbols': False,
  '+1:token.contains_numeric': False,
  '+1:token.contains_capital': True,
  '+1:token.contains_quotes': False,
  '+1:token.contains_hyphen': False,
  '+1:tag': 'JV'},
 {'token.tag': 'JV',
  'n_gram_0': 'Mba',
  'token.lower': 'mbah',
  'token.prefix_2': 'Mb',
  'token.prefix_3': 'Mba',
  'token

In [9]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',  # for gradient descent for optimization and getting model parameters
    c1=0.1, # Coefficient for Lasso (L1) regularization
    c2=0.1, # Coefficient for Ridge (L2) regularization
    max_iterations=100, # The maximum number of iterations for optimization algorithms, iteration for the gradient descent optimization
    all_possible_transitions=True   # Specify whether CRFsuite generates transition features that do not even occur in the training data
)

In [10]:
%%time
crf.fit(X_train, y_train)

CPU times: user 12.8 s, sys: 73.6 ms, total: 12.9 s
Wall time: 12.9 s


CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [11]:
labels = ['ID', 'JV', 'EN', 'O', 'MIX-ID-EN', 'MIX-ID-JV', 'MIX-JV-EN']

In [12]:
y_pred = crf.predict(X_test)

In [13]:
flat_y = [item for y_ in y_test for item in y_]
flat_y_pred = [item for y_pred_ in y_pred for item in y_pred_]
print(classification_report(flat_y, flat_y_pred, labels=labels))

              precision    recall  f1-score   support

          ID       1.00      1.00      1.00     26887
          JV       1.00      1.00      1.00      5865
          EN       1.00      1.00      1.00      6185
           O       1.00      1.00      1.00     12090
   MIX-ID-EN       1.00      1.00      1.00      1173
   MIX-ID-JV       1.00      1.00      1.00       354
   MIX-JV-EN       1.00      1.00      1.00       330

    accuracy                           1.00     52884
   macro avg       1.00      1.00      1.00     52884
weighted avg       1.00      1.00      1.00     52884



In [14]:
import eli5
eli5.show_weights(crf)

From \ To,EN,ID,JV,MIX-ID-EN,MIX-ID-JV,MIX-JV-EN,O
EN,0.663,0.002,-0.143,0.0,-0.132,0.0,0.189
ID,0.033,0.407,-0.335,0.231,-0.069,-0.176,0.132
JV,-0.125,-0.286,0.58,-0.339,0.261,0.468,0.023
MIX-ID-EN,0.0,0.093,-0.319,0.0,0.0,0.0,0.0
MIX-ID-JV,-0.047,-0.012,0.308,0.0,0.0,0.0,-0.062
MIX-JV-EN,0.0,-0.132,0.338,0.0,0.0,0.0,0.0
O,0.11,0.157,0.026,-0.022,-0.0,-0.102,0.661

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6
+9.530,token.tag:EN,,,,,
+0.556,token.is_alpha,,,,,
+0.281,+1:tag:EN,,,,,
+0.245,-1:tag:EN,,,,,
+0.094,token.length,,,,,
-0.015,token.suffix_3:nya,,,,,
-0.030,+1:tag:JV,,,,,
-0.054,-1:tag:JV,,,,,
-0.071,token.suffix_2:ya,,,,,
-0.253,token.prefix_2:di,,,,,

Weight?,Feature
9.53,token.tag:EN
0.556,token.is_alpha
0.281,+1:tag:EN
0.245,-1:tag:EN
0.094,token.length
-0.015,token.suffix_3:nya
-0.03,+1:tag:JV
-0.054,-1:tag:JV
-0.071,token.suffix_2:ya
-0.253,token.prefix_2:di

Weight?,Feature
10.979,token.tag:ID
0.569,token.is_alpha
0.089,token.length
0.065,token.suffix_2:an
0.034,+1:tag:ID
0.011,+1:token.is_alpha
-0.039,token.prefix_2:di
-0.138,token.suffix_2:ne

Weight?,Feature
9.41,token.tag:JV
0.892,token.is_alpha
0.175,-1:tag:JV
0.162,+1:tag:JV
0.086,token.suffix_2:ng
0.028,token.length
0.027,+1:token.is_alpha
0.016,+1:tag:O
0.003,-1:tag:O
0.0,-1:token.is_alpha

Weight?,Feature
7.623,token.tag:MIX-ID-EN
1.077,token.prefix_2:di
0.532,token.suffix_3:nya
0.47,token.suffix_2:ya
0.378,n_gram_0:nge
0.378,token.prefix_3:nge
0.321,token.prefix_2:ng
0.243,token.length
0.017,+1:tag:ID
-0.048,+1:tag:O

Weight?,Feature
+7.179,token.tag:MIX-ID-JV
+0.782,token.suffix_2:ne
+0.436,token.suffix_2:ke
+0.229,token.suffix_3:ale
+0.223,token.length
+0.162,token.suffix_2:le
+0.117,token.suffix_3:ane
+0.110,token.prefix_2:pe
+0.106,token.suffix_3:ine
+0.059,-1:tag:JV

Weight?,Feature
+6.953,token.tag:MIX-JV-EN
+0.569,token.prefix_2:di
+0.320,token.suffix_2:ke
+0.273,token.suffix_2:ne
+0.223,token.lower:hpne
+0.217,token.length
+0.207,-1:tag:JV
+0.150,token.suffix_3:pne
+0.134,token.prefix_2:ng
+0.133,+1:tag:JV

Weight?,Feature
+9.959,token.tag:O
+0.614,-1:token.is_alpha
+0.493,EOS
+0.371,token.startswith_symbols
+0.316,BOS
+0.273,+1:token.contains_capital
+0.265,token.contains_capital
+0.254,+1:token.is_title
+0.229,+1:token.is_alpha
+0.159,"token.suffix_3:,"
