In [1]:
import sklearn_crfsuite
import pandas as pd

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from helper.dataset_reader import read_tsv
from helper.features import *

# import warnings filter
from warnings import simplefilter

# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Load the merged, token-level tagged tweet dataset from disk.
# NOTE(review): the unpacking in a later cell suggests read_tsv returns a
# 3-tuple (per-tweet pairs, all_words, all_tags) — confirm against helper.dataset_reader.
data = read_tsv('../dataset/all-merged-data-140422.tsv')

In [10]:
# NOTE(review): this displays the entire loaded structure; consider previewing
# only a small slice (e.g. data[0][:3]) to keep the notebook output readable.
data

([[['Tumben',
    'xl',
    'banter',
    'go',
    'download',
    ',',
    ',',
    'download',
    'video',
    'tom',
    'jerry',
    'ya',
    'masuk',
    'kie',
    'https://t.co/SmrXmut7wk'],
   ['ID',
    'O',
    'JV',
    'JV',
    'EN',
    'O',
    'O',
    'EN',
    'ID',
    'O',
    'O',
    'ID',
    'ID',
    'JV',
    'O']],
  [['@myXL',
    '@myXLCare',
    'knp',
    'xl',
    'skr',
    'jd',
    'susah',
    'signal',
    'ya',
    ',',
    'dan',
    'download',
    'paling',
    'banter',
    '70kbps',
    '.',
    'Sangat',
    'disayangkan'],
   ['O',
    'O',
    'ID',
    'O',
    'ID',
    'ID',
    'ID',
    'EN',
    'ID',
    'O',
    'ID',
    'EN',
    'ID',
    'ID',
    'EN',
    'O',
    'ID',
    'ID']],
  [['Lak',
    'download',
    'nggawe',
    'cl',
    'iku',
    'subuh',
    'baru',
    'banter',
    '.',
    'Lak',
    'sore-bengi',
    'lemot',
    "''",
    '@kecepoood',
    ':',
    'XL',
    'labil',
    'donlod',
    'munggah',
    '

In [7]:
# Unpack the loader result. `dt` holds the per-tweet [tokens, tags] pairs that
# the following cells turn into a DataFrame.
# NOTE(review): all_words / all_tags are presumably flattened token and tag
# collections — confirm against helper.dataset_reader.
dt, all_words, all_tags = data

In [4]:
# Build a two-column frame with one row per tweet: its token list ('Tweets')
# and the parallel list of language tags ('Tags').
df_tweet_tags = pd.DataFrame(dt, columns=['Tweets','Tags'])
# Uncomment to preview the first rows:
# df_tweet_tags.head()

In [5]:
# Pair every token with its language tag, producing one list of
# (token, tag) tuples per tweet. The two columns are zipped directly rather
# than walked with DataFrame.iterrows(), which materialises a Series for every
# row and is needlessly slow for a plain row-wise zip. The result is identical.
token_tag_pair = [
    list(zip(tokens, tags))
    for tokens, tags in zip(df_tweet_tags['Tweets'], df_tweet_tags['Tags'])
]

token_tag_pair

[[('Tumben', 'ID'),
  ('xl', 'O'),
  ('banter', 'ID'),
  ('go', 'JV'),
  ('download', 'EN'),
  (',', 'O'),
  (',', 'O'),
  ('download', 'EN'),
  ('video', 'ID'),
  ('tom', 'O'),
  ('jerry', 'O'),
  ('ya', 'ID'),
  ('masuk', 'ID'),
  ('kie', 'JV'),
  ('https://t.co/SmrXmut7wk', 'O')],
 [('@myXL', 'O'),
  ('@myXLCare', 'O'),
  ('knp', 'ID'),
  ('xl', 'O'),
  ('skr', 'ID'),
  ('jd', 'ID'),
  ('susah', 'ID'),
  ('signal', 'EN'),
  ('ya', 'ID'),
  (',', 'O'),
  ('dan', 'ID'),
  ('download', 'EN'),
  ('paling', 'ID'),
  ('banter', 'ID'),
  ('70kbps', 'EN'),
  ('.', 'O'),
  ('Sangat', 'ID'),
  ('disayangkan', 'ID')],
 [('Lak', 'JV'),
  ('download', 'EN'),
  ('nggawe', 'JV'),
  ('cl', 'O'),
  ('iku', 'JV'),
  ('subuh', 'ID'),
  ('baru', 'ID'),
  ('banter', 'ID'),
  ('.', 'O'),
  ('Lak', 'JV'),
  ('sore-bengi', 'JV'),
  ('lemot', 'JV'),
  ("''", 'O'),
  ('@kecepoood', 'O'),
  (':', 'O'),
  ('XL', 'O'),
  ('labil', 'ID'),
  ('donlod', 'EN'),
  ('munggah', 'JV'),
  ('mudun', 'JV'),
  ('😩', 'O'),


In [6]:
def sent2features(sent):
    """Build the feature representation of every position in one sentence.

    The whole sentence is handed to ``token2features`` together with each
    index in turn, and the results are collected in order, so the returned
    list has exactly one entry per element of ``sent``.
    """
    features = []
    for position in range(len(sent)):
        features.append(token2features(sent, position))
    return features


def sent2labels(sent):
    """Return the tag of every (token, tag) pair in ``sent``, in order."""
    tags = []
    for _token, tag in sent:
        tags.append(tag)
    return tags


def sent2tokens(sent):
    """Return the token of every (token, tag) pair in ``sent``, in order."""
    tokens = []
    for word, _tag in sent:
        tokens.append(word)
    return tokens

In [7]:
# Featurise every sentence and collect its gold tags; X and y stay aligned
# because both are derived from token_tag_pair in the same order.
X = list(map(sent2features, token_tag_pair))
y = list(map(sent2labels, token_tag_pair))

# Hold out 40% of the sentences for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.40,
    random_state=0,
)

In [8]:
# Inspect the per-token feature dicts generated for one training sentence.
X_train[20]

[{'n_gram_0': '@sudjiwotedjo',
  'token.lower': '@sudjiwotedjo',
  'token.prefix_2': '@s',
  'token.prefix_3': '@su',
  'token.suffix_2': 'jo',
  'token.suffix_3': 'djo',
  'token.length': 13,
  'token.is_alpha': False,
  'token.is_numeric': False,
  'token.is_capital': False,
  'token.is_title': False,
  'token.startswith_symbols': True,
  'token.contains_numeric': False,
  'token.contains_capital': False,
  'token.contains_quotes': False,
  'token.contains_hyphen': False,
  'BOS': True,
  '+1:token.lower': 'mbah',
  '+1:token.is_alpha': True,
  '+1:token.is_numeric': False,
  '+1:token.is_capital': False,
  '+1:token.is_title': True,
  '+1:token.startswith_symbols': False,
  '+1:token.contains_numeric': False,
  '+1:token.contains_capital': True,
  '+1:token.contains_quotes': False,
  '+1:token.contains_hyphen': False,
  '+1:tag': 'JV'},
 {'n_gram_0': 'Mba',
  'token.lower': 'mbah',
  'token.prefix_2': 'Mb',
  'token.prefix_3': 'Mba',
  'token.suffix_2': 'ah',
  'token.suffix_3': 'ba

In [9]:
# Configure the linear-chain CRF tagger; training happens in the next cell.
crf = sklearn_crfsuite.CRF(
    # Training algorithm: gradient descent using the L-BFGS method.
    algorithm='lbfgs',
    # Coefficient for L1 (Lasso) regularization.
    c1=0.1,
    # Coefficient for L2 (Ridge) regularization.
    c2=0.1,
    # Upper bound on the number of optimizer iterations.
    max_iterations=100,
    # Also generate transition features for tag pairs that never occur in the
    # training data.
    all_possible_transitions=True
)

In [10]:
%%time
# Train the CRF on the training split; the %%time cell magic reports the
# CPU and wall-clock time of this cell.
crf.fit(X_train, y_train)

CPU times: user 13.8 s, sys: 284 ms, total: 14.1 s
Wall time: 14.2 s


CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [None]:
# Tag set passed to classification_report below, which fixes the rows of the report.
# NOTE(review): ID / JV / EN are presumably Indonesian / Javanese / English and
# the MIX-* tags mixed-language tokens — confirm against the dataset description.
labels = ['ID', 'JV', 'EN', 'O', 'MIX-ID-EN', 'MIX-ID-JV', 'MIX-JV-EN']

In [None]:
# Predict a tag sequence for every held-out sentence.
y_pred = crf.predict(X_test)

In [None]:
# The per-sentence tag sequences are flattened into token-level lists before
# being handed to classification_report, so gold and predicted tags line up
# one-to-one.
flat_y = [gold_tag for gold_seq in y_test for gold_tag in gold_seq]
flat_y_pred = [pred_tag for pred_seq in y_pred for pred_tag in pred_seq]
print(
    classification_report(
        flat_y,
        flat_y_pred,
        labels=labels,
    )
)

In [None]:
# NOTE(review): the other imports are grouped in the first cell; consider
# moving this one there so dependencies are visible up front.
import eli5
# Display the weights learned by the fitted CRF model.
eli5.show_weights(crf)

In [8]:
from scipy.stats import expon

# With the default moments, expon.stats returns (mean, variance); for
# scale=0.05 that is (0.05, 0.0025), matching the recorded output.
# NOTE(review): this looks like a scratch check (possibly of a prior for a
# hyperparameter search) and nothing visible in this notebook uses it — confirm it is still needed.
expon.stats(scale=0.05)

(array(0.05), array(0.0025))