In [1]:
import csv

import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_classification_report

In [96]:
dataTrain = []
with open("dataTrain.tsv", encoding='utf-8') as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    sentence = []
    for row in rd:
        if not row:
            dataTrain.append(sentence)
            sentence = []
        else:
            sentence.append(row)

dataTest = []
with open("dataTest.tsv", encoding='utf-8') as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    sentence = []
    for row in rd:
        if not row:
            dataTest.append(sentence)
            sentence = []
        else:
            sentence.append(row)
dataTrain = list(filter(None, dataTrain))            
dataTest = list(filter(None, dataTest))  

In [97]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [98]:
X_train = [sent2features(s) for s in dataTrain]
y_train = [sent2labels(s) for s in dataTrain]

X_test = [sent2features(s) for s in dataTest]
y_test = [sent2labels(s) for s in dataTest]

In [99]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [100]:
y_pred = crf.predict(X_test)
labels = list(crf.classes_)
labels.remove('O') # remove 'O' label from evaluation
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0])) # group B and I results
print(flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

              precision    recall  f1-score   support

       B-BRA      0.577     0.326     0.417        46
       I-BRA      0.400     0.222     0.286         9
       B-PRO      0.500     0.095     0.160        21
       I-PRO      0.355     0.241     0.287       112
 B-PRO|B-BRA      0.696     0.506     0.586        77
 I-PRO|B-BRA      1.000     0.148     0.258        27
 B-PRO|B-TYP      0.000     0.000     0.000        11
 I-PRO|B-TYP      0.651     0.412     0.505        68
 I-PRO|I-BRA      0.667     0.214     0.324        28
 I-PRO|I-TYP      0.548     0.378     0.447        90
       B-TYP      0.000     0.000     0.000         9
       I-TYP      0.000     0.000     0.000         6

   micro avg      0.547     0.312     0.397       504
   macro avg      0.450     0.212     0.273       504
weighted avg      0.542     0.312     0.383       504



In [101]:
dataTest_labels = []
for sentence in dataTest:
    aa = [token[1] for token in sentence]
    dataTest_labels.append(aa)
    
dataTest_token = []
for sentence in dataTest:
    aa = [token[0] for token in sentence]
    dataTest_token.append(aa)

#print(dataTest_labels[0])
#print(dataTest_token[0])

In [104]:
bra_data = []
bra_guess = []
typ_data = []
typ_guess = []
pro_data = []
pro_guess = []
o_data = []
o_guess = []
for i in range(len(dataTest)):
    bra_data_row = []
    bra_guess_row = []
    typ_data_row = []
    typ_guess_row = []
    pro_data_row = []
    pro_guess_row = []
    o_data_row = []
    o_guess_row = []
    sen_pro_data = ''
    sen_pro_guess = ''
    sen_typ_data = ''
    sen_typ_guess = ''
    sen_bra_data = ''
    sen_bra_guess = ''
    for j in range(len(dataTest[i])):        
        if dataTest_labels[i][j] == 'O':
            o_data_row.append(dataTest_token[i][j])
            if sen_pro_data:
                pro_data_row.append(sen_pro_data)
                sen_pro_data = ''
            elif sen_bra_data:
                bra_data_row.append(sen_bra_data)
                sen_bra_data = ''
            elif sen_typ_data:
                typ_data_row.append(sen_typ_data)
                sen_typ_data = ''
        else:
            if '|' in dataTest_labels[i][j]:
                if sen_pro_data and dataTest_labels[i][j][0] == 'B':
                    pro_data_row.append(sen_pro_data)
                    sen_pro_data = ''
                    sen_pro_data += dataTest_token[i][j]
                elif dataTest_labels[i][j][0] == 'B':
                    sen_pro_data += dataTest_token[i][j]
                elif dataTest_labels[i][j][0] == 'I':
                    sen_pro_data += ' ' + dataTest_token[i][j]
                if dataTest_labels[i][j][8:] == 'BRA':
                    if sen_bra_data and dataTest_labels[i][j][6:7] == 'B':
                        bra_data_row.append(sen_bra_data)
                        sen_bra_data = ''
                        sen_bra_data += dataTest_token[i][j]
                    elif dataTest_labels[i][j][6:7] == 'B':
                        sen_bra_data += dataTest_token[i][j]
                    elif dataTest_labels[i][j][6:7] == 'I':
                        sen_bra_data += ' ' + dataTest_token[i][j]
                elif dataTest_labels[i][j][8:] == 'TYP':
                    if sen_typ_data and dataTest_labels[i][j][6:7] == 'B':
                        typ_data_row.append(sen_typ_data)
                        sen_typ_data = ''
                        sen_typ_data += dataTest_token[i][j]
                    elif dataTest_labels[i][j][6:7] == 'B':
                        sen_typ_data += dataTest_token[i][j]
                    elif dataTest_labels[i][j][6:7] == 'I':
                        sen_typ_data += ' ' + dataTest_token[i][j]
            elif dataTest_labels[i][j][2:5] == 'PRO':
                if sen_pro_data and dataTest_labels[i][j][0] == 'B':
                    pro_data_row.append(sen_pro_data)
                    sen_pro_data = ''
                    sen_pro_data += dataTest_token[i][j]
                elif dataTest_labels[i][j][0] == 'B':
                    sen_pro_data += dataTest_token[i][j]
                elif dataTest_labels[i][j][0] == 'I':
                    sen_pro_data += ' ' + dataTest_token[i][j]
            elif dataTest_labels[i][j][2:5] == 'BRA':
                if sen_bra_data and dataTest_labels[i][j][0] == 'B':
                    bra_data_row.append(sen_bra_data)
                    sen_bra_data = ''
                    sen_bra_data += dataTest_token[i][j]
                elif dataTest_labels[i][j][0] == 'B':
                    sen_bra_data += dataTest_token[i][j]
                elif dataTest_labels[i][j][0] == 'I':
                    sen_bra_data += ' ' + dataTest_token[i][j]
            elif dataTest_labels[i][j][2:5] == 'TYP':
                if sen_typ_data and dataTest_labels[i][j][0] == 'B':
                    typ_data_row.append(sen_typ_data)
                    sen_typ_data = ''
                    sen_typ_data += dataTest_token[i][j]
                elif dataTest_labels[i][j][0] == 'B':
                    sen_typ_data += dataTest_token[i][j]
                elif dataTest_labels[i][j][0] == 'I':
                    sen_typ_data += ' ' + dataTest_token[i][j]
        if y_pred[i][j] == 'O':
            o_guess_row.append(dataTest_token[i][j])
            if sen_pro_guess:
                pro_guess_row.append(sen_pro_guess)
                sen_pro_guess = ''
            elif sen_bra_guess:
                bra_guess_row.append(sen_bra_guess)
                sen_bra_guess = ''
            elif sen_typ_guess:
                typ_guess_row.append(sen_typ_guess)
                sen_typ_guess = ''
        else:
            if '|' in y_pred[i][j]:
                if sen_pro_guess and y_pred[i][j][0] == 'B':
                    pro_guess_row.append(sen_pro_guess)
                    sen_pro_guess = ''
                    sen_pro_guess += dataTest_token[i][j]
                elif y_pred[i][j][0] == 'B':
                    sen_pro_guess += dataTest_token[i][j]
                elif y_pred[i][j][0] == 'I':
                    sen_pro_guess += ' ' + dataTest_token[i][j]
                if y_pred[i][j][8:] == 'BRA':
                    if sen_bra_guess and y_pred[i][j][6:7] == 'B':
                        bra_guess_row.append(sen_bra_guess)
                        sen_bra_guess = ''
                        sen_bra_guess += dataTest_token[i][j]
                    elif y_pred[i][j][6:7] == 'B':
                        sen_bra_guess += dataTest_token[i][j]
                    elif y_pred[i][j][6:7] == 'I':
                        sen_bra_guess += ' ' + dataTest_token[i][j]
                elif y_pred[i][j][8:] == 'TYP':
                    if sen_typ_guess and y_pred[i][j][6:7] == 'B':
                        typ_guess_row.append(sen_typ_guess)
                        sen_typ_guess = ''
                        sen_typ_guess += dataTest_token[i][j]
                    elif y_pred[i][j][6:7] == 'B':
                        sen_typ_guess += dataTest_token[i][j]
                    elif y_pred[i][j][6:7] == 'I':
                        sen_typ_guess += ' ' + dataTest_token[i][j]
            elif y_pred[i][j][2:5] == 'PRO':
                if sen_pro_guess and y_pred[i][j][0] == 'B':
                    pro_guess_row.append(sen_pro_guess)
                    sen_pro_guess = ''
                    sen_pro_guess += dataTest_token[i][j]
                elif y_pred[i][j][0] == 'B':
                    sen_pro_guess += dataTest_token[i][j]
                elif y_pred[i][j][0] == 'I':
                    sen_pro_guess += ' ' + dataTest_token[i][j]
            elif y_pred[i][j][2:5] == 'BRA':
                if sen_bra_guess and y_pred[i][j][0] == 'B':
                    bra_guess_row.append(sen_bra_guess)
                    sen_bra_guess = ''
                    sen_bra_guess += dataTest_token[i][j]
                elif y_pred[i][j][0] == 'B':
                    sen_bra_guess += dataTest_token[i][j]
                elif y_pred[i][j][0] == 'I':
                    sen_bra_guess += ' ' + dataTest_token[i][j]
            elif y_pred[i][j][2:5] == 'TYP':
                if sen_typ_guess and y_pred[i][j][0] == 'B':
                    typ_guess_row.append(sen_typ_guess)
                    sen_typ_guess = ''
                    sen_typ_guess += dataTest_token[i][j]
                elif y_pred[i][j][0] == 'B':
                    sen_typ_guess += dataTest_token[i][j]
                elif y_pred[i][j][0] == 'I':
                    sen_typ_guess += ' ' + dataTest_token[i][j]
    if sen_pro_data:
        pro_data_row.append(sen_pro_data)
    elif sen_bra_data:
        bra_data_row.append(sen_bra_data)
    elif sen_typ_data:
        typ_data_row.append(sen_typ_data)
    if sen_pro_guess:
        pro_guess_row.append(sen_pro_guess)
    elif sen_bra_guess:
        bra_guess_row.append(sen_bra_guess)
    elif sen_typ_guess:
        typ_guess_row.append(sen_typ_guess)
    o_data.append(o_data_row)
    o_guess.append(o_guess_row)
    pro_data.append(pro_data_row)
    pro_guess.append(pro_guess_row)
    bra_data.append(bra_data_row)
    bra_guess.append(bra_guess_row)
    typ_data.append(typ_data_row)
    typ_guess.append(typ_guess_row)
'''
for i in range(len(bra_data)):
    print('PRO')
    print(pro_data[i])
    print(pro_guess[i])
    print('BRA')
    print(bra_data[i])
    print(bra_guess[i])
    print('TYP')
    print(typ_data[i])
    print(typ_guess[i])
    print('---------')
'''
#none is the actual number from data test, r is number of guess right, g is number of guess
bra = 0
bra_r = 0
bra_g = 0
typ = 0
typ_r = 0
typ_g = 0
pro = 0
pro_r = 0
pro_g = 0
o = 0
o_r = 0
o_g = 0
for i in range(len(pro_data)):
    for j in range(len(pro_data[i])):
        pro += 1
    for j in range(len(pro_guess[i])):
        pro_g += 1
        if pro_guess[i][j] in pro_data[i]:
            pro_r += 1
            pro_data[i].remove(pro_guess[i][j])
    for j in range(len(bra_data[i])):
        bra += 1
    for j in range(len(bra_guess[i])):
        bra_g += 1
        if bra_guess[i][j] in bra_data[i]:
            bra_r += 1
            bra_data[i].remove(bra_guess[i][j])
    for j in range(len(typ_data[i])):
        typ += 1
    for j in range(len(typ_guess[i])):
        typ_g += 1
        if typ_guess[i][j] in typ_data[i]:
            typ_r += 1
            typ_data[i].remove(typ_guess[i][j])
    for j in range(len(o_data[i])):
        o += 1
    for j in range(len(o_guess[i])):
        o_g += 1
        if o_guess[i][j] in o_data[i]:
            o_r += 1
            o_data[i].remove(o_guess[i][j])

print('PRO')
precision_pro = pro_r/pro_g
recall_pro = pro_r/pro
f1_pro = 2 * ((precision_pro * recall_pro)/(precision_pro + recall_pro))
print('Precision : ' + str(precision_pro))
print('Recall : ' + str(recall_pro))
print('F1 : ' + str(f1_pro))
print('BRA')
precision_bra = bra_r/bra_g
recall_bra = bra_r/bra
f1_bra = 2 * ((precision_bra * recall_bra)/(precision_bra + recall_bra))
print('Precision : ' + str(precision_bra))
print('Recall : ' + str(recall_bra))
print('F1 : ' + str(f1_bra))
print('TYP')
precision_typ = typ_r/typ_g
recall_typ = typ_r/typ
f1_typ = 2 * ((precision_typ * recall_typ)/(precision_typ + recall_typ))
print('Precision : ' + str(precision_typ))
print('Recall : ' + str(recall_typ))
print('F1 : ' + str(f1_typ))
print('O')
precision_o = o_r/o_g
recall_o = o_r/o
f1_o = 2 * ((precision_o * recall_o)/(precision_o + recall_o))
print('Precision : ' + str(precision_o))
print('Recall : ' + str(recall_o))
print('F1 : ' + str(f1_o))
print('Overall Without O')
total = pro + typ + bra
precision = ((pro * precision_pro) + (bra * precision_bra) + (typ * precision_typ)) / total
recall = ((pro * recall_pro) + (bra * recall_bra) + (typ * recall_typ)) / total
f1 = ((pro * f1_pro) + (bra * f1_bra) + (typ * f1_typ)) / total
print('Precision : ' + str(precision))
print('Recall : ' + str(recall))
print('F1 : ' + str(f1))
print('Overall With O')
total = pro + typ + bra + o
precision = ((pro * precision_pro) + (bra * precision_bra) + (typ * precision_typ) + (o * precision_o)) / total
recall = ((pro * recall_pro) + (bra * recall_bra) + (typ * recall_typ) + (o * recall_o)) / total
f1 = ((pro * f1_pro) + (bra * f1_bra) + (typ * f1_typ) + (o * f1_o)) / total
print('Precision : ' + str(precision))
print('Recall : ' + str(recall))
print('F1 : ' + str(f1))

PRO
Precision : 0.65
Recall : 0.3577981651376147
F1 : 0.4615384615384616
BRA
Precision : 0.7123287671232876
Recall : 0.42276422764227645
F1 : 0.5306122448979592
TYP
Precision : 0.5555555555555556
Recall : 0.25862068965517243
F1 : 0.3529411764705883
O
Precision : 0.8934603724556085
Recall : 0.9861376673040153
F1 : 0.9375142013178824
Overall Without O
Precision : 0.6575471054427124
Recall : 0.36551724137931035
F1 : 0.4691158160877083
Overall With O
Precision : 0.8647387740367422
Recall : 0.9105793450881612
F1 : 0.8804883693629073


In [89]:
print(pro_guess)

[[], [], [], [], ['shampoo ginvera green tea scalp protection'], [], [], [], ['Shampoo phyto', 'ginvera green tea'], [], [], [], [], [], [], [], [], ['Bumble & bumble shampoo'], [], [], [], ['Bumble & Bumble Scalp Rebalancing 3 sets (shampoo, conditioner, and serum)'], [], [], [], [], [], [], [], [], [], [], [], ['dove yg warna biru tua yg intense damage therapy'], [], [], [], [], ['burt bees mango', 'bodyhop rainforest', 'loccitane aromachologie'], [], [], ['shampoo dr MooGoo'], [], [], ['loreal ever strong'], [], [], [], [], [], [], [], [], [], ['Rainforest TBS'], [], [], [], [], ['shampoo emeron', 'shampoo lifebouy'], [], [], ['emeron kuning'], [], [], ['shampoo lifebuoy'], ['emeron kuning', 'smpo bayem mustika ratu'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['ghd classic IV styler', 'ghd gold styler'], [], [], ['shampoo yang Baim Volumizing'], [], ['shamppo yang Baim Prevention'], [], [], [], [], [], [], [], [], [], [], [], [], [], ['shampoo mane n tail', 'condi