In [1]:
from helper.dataset_reader import read_tsv
from helper.data_transformer import *
from langid_crf import *
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn_crfsuite import metrics

In [2]:
langid = LanguageIdentifier()

In [3]:
## Hyperparameter Optimization
raw_data = read_tsv('../dataset/comlid-data-140422-v1.tsv')
data = to_token_tag_list(raw_data)
X = [langid.sent2features(s) for s in data]
y = [langid.sent2tags(s) for s in data]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [6]:
data

[[('Tumben', 'ID'),
  ('xl', 'O'),
  ('banter', 'JV'),
  ('go', 'JV'),
  ('download', 'EN'),
  (',', 'O'),
  (',', 'O'),
  ('download', 'EN'),
  ('video', 'ID'),
  ('tom', 'O'),
  ('jerry', 'O'),
  ('ya', 'ID'),
  ('masuk', 'ID'),
  ('kie', 'JV'),
  ('https://t.co/SmrXmut7wk', 'O')],
 [('@myXL', 'O'),
  ('@myXLCare', 'O'),
  ('knp', 'ID'),
  ('xl', 'O'),
  ('skr', 'ID'),
  ('jd', 'ID'),
  ('susah', 'ID'),
  ('signal', 'EN'),
  ('ya', 'ID'),
  (',', 'O'),
  ('dan', 'ID'),
  ('download', 'EN'),
  ('paling', 'ID'),
  ('banter', 'ID'),
  ('70kbps', 'EN'),
  ('.', 'O'),
  ('Sangat', 'ID'),
  ('disayangkan', 'ID')],
 [('Lak', 'JV'),
  ('download', 'EN'),
  ('nggawe', 'JV'),
  ('cl', 'O'),
  ('iku', 'JV'),
  ('subuh', 'ID'),
  ('baru', 'ID'),
  ('banter', 'ID'),
  ('.', 'O'),
  ('Lak', 'JV'),
  ('sore-bengi', 'JV'),
  ('lemot', 'JV'),
  ("''", 'O'),
  ('@kecepoood', 'O'),
  (':', 'O'),
  ('XL', 'O'),
  ('labil', 'ID'),
  ('donlod', 'EN'),
  ('munggah', 'JV'),
  ('mudun', 'JV'),
  ('😩', 'O'),


In [5]:
X[0]

[{'token.lower': 'tumben',
  'n_gram_0': 'Tumbe',
  'token_BOS': True,
  'token_EOS': False,
  'prev_tag': '',
  'next_tag': 'O',
  'prev_2tag': '',
  'next_2tag': 'JV',
  'token.prefix_2': 'Tu',
  'token.prefix_3': 'Tum',
  'token.suffix_2': 'en',
  'token.suffix_3': 'ben',
  'token.length': 6,
  'token.is_alpha': True,
  'token.is_numeric': False,
  'token.is_capital': False,
  'token.is_title': True,
  'token.startswith_symbols': False,
  'token.contains_numeric': False,
  'token.contains_capital': True,
  'token.contains_quotes': False,
  'token.contains_hyphen': False},
 {'token.lower': 'xl',
  'n_gram_0': 'xl',
  'token_BOS': False,
  'token_EOS': False,
  'prev_tag': 'ID',
  'next_tag': 'JV',
  'prev_2tag': '',
  'next_2tag': 'JV',
  'token.prefix_2': 'xl',
  'token.prefix_3': 'xl',
  'token.suffix_2': 'xl',
  'token.suffix_3': 'xl',
  'token.length': 2,
  'token.is_alpha': True,
  'token.is_numeric': False,
  'token.is_capital': False,
  'token.is_title': False,
  'token.starts

In [4]:
labels = ['ID', 'JV', 'EN', 'O', 'MIX-ID-EN', 'MIX-ID-JV', 'MIX-JV-EN']
c1 = 0.1
c2 = 0.1

In [5]:
model = sklearn_crfsuite.CRF(
    algorithm = 'lbfgs',  # for gradient descent for optimization and getting model parameters
    c1 = c1, # Coefficient for Lasso (L1) regularization
    c2 = c2, # Coefficient for Ridge (L2) regularization
    max_iterations = 100, # The maximum number of iterations for optimization algorithms, iteration for the gradient descent optimization
    all_possible_transitions = True   # Specify whether CRFsuite generates transition features that do not even occur in the training data
)

In [6]:
params_space = {
	'c1': scipy.stats.expon(scale=0.5),
	'c2': scipy.stats.expon(scale=0.05),
}

f1_scorer = make_scorer(metrics.flat_f1_score,
						average='weighted',
						labels=labels)

rs = RandomizedSearchCV(model,
						params_space,  # pass the dictionary of parameters that we need to optimize
						cv=10,  # Determines the cross-validation splitting strategy
						verbose=1,  # Controls the verbosity: the higher, the more messages
						n_jobs=-1,  # Number of jobs to run in parallel, -1 means using all processors
						n_iter=50,  # Number of parameter settings that are sampled
						scoring=f1_scorer)

rs.fit(X_train, y_train)

print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))
# print("%0.2f f1-score with a standard deviation of %0.2f" % (rs.mean(), rs.std()))

Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 22.4min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 25.6min finished


best params: {'c1': 0.04077752338043632, 'c2': 0.08790533946499102}
best CV score: 0.9436329221668028
model size: 2.66M


In [7]:
import pandas as pd

df = pd.DataFrame(rs.cv_results_)

In [8]:
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_c1,param_c2,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,27.795088,3.20271,0.400441,0.066981,0.057467,0.03833,"{'c1': 0.05746704336330737, 'c2': 0.0383300727...",0.940264,0.947402,0.94018,0.94191,0.946112,0.941649,0.945951,0.944533,0.940392,0.943738,0.943213,0.002559,8
1,30.522127,5.748189,0.322312,0.050677,1.095192,0.016795,"{'c1': 1.0951923023373458, 'c2': 0.01679510496...",0.93511,0.940154,0.934281,0.933985,0.934357,0.934554,0.934207,0.937848,0.928321,0.937965,0.935078,0.003014,43
2,21.107664,1.916771,0.242401,0.013735,0.040778,0.087905,"{'c1': 0.04077752338043632, 'c2': 0.0879053394...",0.940824,0.94908,0.941429,0.942718,0.945306,0.942071,0.945696,0.944647,0.940724,0.943833,0.943633,0.002496,1
3,20.215668,0.966361,0.264646,0.027802,1.330395,0.192169,"{'c1': 1.3303947806401835, 'c2': 0.19216907238...",0.930084,0.93856,0.931403,0.930595,0.930258,0.931106,0.930744,0.935017,0.92461,0.933622,0.9316,0.003452,48
4,22.48921,1.071706,0.264843,0.04304,0.292447,0.03534,"{'c1': 0.2924472590944702, 'c2': 0.03533972635...",0.941417,0.945894,0.941438,0.940799,0.943089,0.941786,0.943471,0.944845,0.939228,0.942833,0.94248,0.001866,20
5,23.865717,1.636573,0.281131,0.043622,0.857985,0.023325,"{'c1': 0.8579848412315326, 'c2': 0.02332469443...",0.937148,0.942521,0.934636,0.937287,0.936784,0.935409,0.936829,0.939064,0.931915,0.939322,0.937091,0.002724,39
6,25.052143,1.048329,0.325926,0.102895,0.094597,0.043051,"{'c1': 0.09459667978310277, 'c2': 0.0430513439...",0.941208,0.947653,0.940867,0.941571,0.944768,0.940846,0.944759,0.944556,0.940509,0.94385,0.943059,0.002272,12
7,21.481659,1.451751,0.267725,0.036077,0.186375,0.020467,"{'c1': 0.18637541474202052, 'c2': 0.0204669720...",0.941089,0.946117,0.941688,0.942403,0.944101,0.941355,0.943901,0.944089,0.939869,0.943534,0.942815,0.001764,15
8,19.591672,0.943082,0.243744,0.048618,1.570079,0.054463,"{'c1': 1.5700791399327965, 'c2': 0.05446348421...",0.92987,0.93756,0.929333,0.929322,0.929457,0.930778,0.928667,0.933131,0.924046,0.933079,0.930524,0.003358,49
9,22.932439,1.190206,0.320708,0.118771,0.571419,0.044105,"{'c1': 0.5714189322481824, 'c2': 0.04410488010...",0.939478,0.944619,0.936987,0.939125,0.938154,0.939128,0.940337,0.941624,0.935727,0.941188,0.939637,0.002382,30


In [9]:
df.to_excel('cv_results.xlsx')