In [1]:
from helper.dataset_reader import read_tsv
from helper.data_transformer import *
from langid_crf import *
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn_crfsuite import metrics

In [2]:
langid = LanguageIdentifier()

In [3]:
## Hyperparameter Optimization
raw_data = read_tsv('../dataset/ijelid-100722.tsv')
data = to_token_tag_list(raw_data)
X = [langid.sent2features(s) for s in data]
y = [langid.sent2tags(s) for s in data]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [6]:
labels = ['ID', 'JV', 'EN', 'O', 'MIX-ID-EN', 'MIX-ID-JV', 'MIX-JV-EN']
c1 = 0.1
c2 = 0.1

In [7]:
model = sklearn_crfsuite.CRF(
    algorithm = 'lbfgs',  # for gradient descent for optimization and getting model parameters
    c1 = c1, # Coefficient for Lasso (L1) regularization
    c2 = c2, # Coefficient for Ridge (L2) regularization
    max_iterations = 100, # The maximum number of iterations for optimization algorithms, iteration for the gradient descent optimization
    all_possible_transitions = True   # Specify whether CRFsuite generates transition features that do not even occur in the training data
)

In [8]:
params_space = {
	'c1': scipy.stats.expon(scale=0.5),
	'c2': scipy.stats.expon(scale=0.05),
}

f1_scorer = make_scorer(metrics.flat_f1_score,
						average='weighted',
						labels=labels)

rs = RandomizedSearchCV(model,
						params_space,  # pass the dictionary of parameters that we need to optimize
						cv=10,  # Determines the cross-validation splitting strategy
						verbose=1,  # Controls the verbosity: the higher, the more messages
						n_jobs=-1,  # Number of jobs to run in parallel, -1 means using all processors
						n_iter=50,  # Number of parameter settings that are sampled
						scoring=f1_scorer)

rs.fit(X_train, y_train)

print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))
# print("%0.2f f1-score with a standard deviation of %0.2f" % (rs.mean(), rs.std()))

Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 22.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 25.6min finished


best params: {'c1': 0.008092215462248482, 'c2': 0.021144788156748907}
best CV score: 0.946026316998886
model size: 3.85M


In [9]:
import pandas as pd

df = pd.DataFrame(rs.cv_results_)

In [10]:
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_c1,param_c2,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,25.550396,1.141419,0.426498,0.058011,0.749675,0.029044,"{'c1': 0.7496749751322848, 'c2': 0.02904357692...",0.940388,0.945723,0.939722,0.940962,0.940075,0.939715,0.941105,0.943041,0.937071,0.941666,0.940947,0.002165,32
1,23.818856,0.809306,0.35385,0.026362,0.069537,0.060549,"{'c1': 0.0695365902530624, 'c2': 0.06054943737...",0.94315,0.950115,0.943948,0.943404,0.948327,0.943103,0.947581,0.947071,0.943904,0.945827,0.945643,0.002379,5
2,24.462843,0.317205,0.326629,0.023132,0.445772,0.004319,"{'c1': 0.44577240001344776, 'c2': 0.0043194838...",0.942133,0.948011,0.942067,0.942574,0.944443,0.942461,0.943225,0.945601,0.942974,0.944806,0.943829,0.001802,22
3,23.855268,0.435397,0.369512,0.067438,1.309619,0.006458,"{'c1': 1.3096191238370996, 'c2': 0.00645811990...",0.935448,0.941478,0.936833,0.936453,0.935362,0.935564,0.936741,0.938955,0.931462,0.938215,0.936651,0.002497,42
4,24.138287,0.645799,0.347217,0.036774,1.752258,0.011371,"{'c1': 1.7522576423760565, 'c2': 0.01137063794...",0.932261,0.939027,0.934157,0.931361,0.934232,0.932572,0.932928,0.936653,0.928212,0.934961,0.933637,0.002805,47
5,23.591,0.42481,0.319803,0.032954,1.199857,0.135701,"{'c1': 1.1998574796085226, 'c2': 0.13570098052...",0.935067,0.941262,0.936817,0.935159,0.935458,0.935509,0.935471,0.938223,0.930919,0.938547,0.936243,0.002596,43
6,24.174649,0.479168,0.348576,0.027265,0.105803,0.000387,"{'c1': 0.10580349000653846, 'c2': 0.0003866935...",0.942056,0.948126,0.942574,0.942051,0.946229,0.942163,0.946289,0.947941,0.942047,0.945808,0.944528,0.00245,17
7,24.983633,0.817954,0.40038,0.049788,1.050735,0.03983,"{'c1': 1.0507346992223308, 'c2': 0.03983035912...",0.937432,0.94314,0.936684,0.93738,0.938078,0.936739,0.938265,0.940285,0.934347,0.939811,0.938216,0.002278,39
8,24.631219,0.689542,0.330648,0.041639,0.55596,0.004441,"{'c1': 0.5559601423108158, 'c2': 0.00444063614...",0.941209,0.947081,0.941376,0.941086,0.942544,0.941204,0.941904,0.944295,0.940813,0.943377,0.942489,0.001865,26
9,24.037446,0.63746,0.321166,0.035749,0.753057,0.022527,"{'c1': 0.7530567280461249, 'c2': 0.02252686092...",0.940396,0.945862,0.938927,0.940167,0.940053,0.939284,0.941569,0.942831,0.937414,0.941892,0.94084,0.002234,33


In [11]:
df.to_excel('cv_results.xlsx')