In [113]:
import csv
import nltk
import unicodedata
import numpy as np
from weighted_levenshtein import lev, osa, dam_lev
from string import ascii_lowercase
from copy import deepcopy
import json

import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite.metrics import sequence_accuracy_score

In [3]:
brand = []
with open('brand.csv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    for row in reader:
        brand.append(unicodedata.normalize('NFKD', row[0]).encode('ascii','ignore'))
#print(brand)

brand_abb = []
with open('brand_singkatan.csv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    for row in reader:
        brand_abb.append(unicodedata.normalize('NFKD', row[0]).encode('ascii','ignore'))
#print(brand_abb)

In [30]:
alfha = 0.4

insert_costs = np.full(128, 100, dtype=np.float64)
insert_costs[ord('-')] = 10
insert_costs[ord(' ')] = 10

delete_costs = np.full(128, 100, dtype=np.float64)
delete_costs[ord('-')] = 10
delete_costs[ord(' ')] = 10

substitute_costs = np.full((128,128), 50, dtype=np.float64)
for c in ascii_lowercase:
    substitute_costs[ord(c), ord(c.capitalize())] = 10
    substitute_costs[ord(c), ord(c)] = 0
    substitute_costs[ord(c.capitalize()), ord(c)] = 10
    substitute_costs[ord(c.capitalize()), ord(c.capitalize())] = 0
substitute_costs[ord('-'), ord(' ')] = 10
substitute_costs[ord(' '), ord('-')] = 10
for i in range(10):
    for j in range(10):
        if i == j:
            substitute_costs[ord(str(i)), ord(str(j))] = 0
            substitute_costs[ord(str(j)), ord(str(i))] = 0
        else:
            substitute_costs[ord(str(i)), ord(str(j))] = 10
            substitute_costs[ord(str(j)), ord(str(i))] = 10

def edit_distance_normalized_cost(word, target):
    cost = lev(word, target, insert_costs=insert_costs, delete_costs=delete_costs, substitute_costs=substitute_costs)
    return (cost + alfha) / len(target)

def check_under_threshold(cost, threshold):
    if cost <= threshold:
        return True
    else:
        return False

def check_edit_distance_brand(sentence, pos):
    threshold = 15
    words = sentence.split()
    candidate = []
    candidate.append(words[pos])
    if pos >= 0:
        if pos < (len(words) - 1):
            candidate.append(words[pos] + " " + words[pos + 1])
        #if pos < (len(words) - 2):
        #      candidate.append(words[pos] + " " + words[pos + 1] + " " + words[pos + 2])
    if (pos - 1) >= 0:
        candidate.append(words[pos - 1] + " " + words[pos])
        if pos < (len(words) - 1):
            candidate.append(words[pos - 1] + " " + words[pos] + " " + words[pos + 1])
        #if pos < (len(words) - 2):
        #    candidate.append(words[pos - 1] + " " + words[pos] + " " + words[pos + 1] + " " + words[pos + 2])
    #if (pos - 2) >= 0:
    #    candidate.append(words[pos - 2] + " " + words[pos - 1] + " " + words[pos])
        #if pos < (len(words) - 1):
        #    candidate.append(words[pos - 2] + " " + words[pos - 1] + " " + words[pos] + " " + words[pos + 1])
        #if pos < (len(words) - 2):
        #    candidate.append(words[pos - 2] + " " + words[pos - 1] + " " + words[pos] + " " + words[pos + 1] + " " + words[pos + 2])
    candidate.sort(key = lambda s: len(s))
    exist = False
    for c in candidate:
        for b in brand:
            zzzz = unicodedata.normalize('NFKD', c).encode('ascii','ignore')
            if check_under_threshold(edit_distance_normalized_cost(zzzz,b),threshold):
                exist = True
                break
    return exist
#print(check_edit_distance_brand('Acquarella',0))

def check_edit_distance_brand_abb(word):
    threshold = 5
    exist = False
    for b in brand_abb:
        zzzz = unicodedata.normalize('NFKD', word).encode('ascii','ignore')
        if check_under_threshold(edit_distance_normalized_cost(zzzz,b),threshold) :
            exist = True
            break
    return exist
#print(check_edit_distance_brand_abb('Bb'))

In [31]:
unlabeled = []
with open("unlabeled.txt", encoding='utf-8') as fd:
    for line in fd:
        sentence = line[:-2]
        tokens = nltk.tokenize.word_tokenize(sentence)
        unlabeled.append(tokens)
#print(unlabeled[2])

In [32]:
dataTrain = []
with open("dataTrain.tsv", encoding='utf-8') as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    sentence = []
    for row in rd:
        if not row:
            dataTrain.append(sentence)
            sentence = []
        else:
            sentence.append(row)

dataTest = []
with open("dataTest.tsv", encoding='utf-8') as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    sentence = []
    for row in rd:
        if not row:
            dataTest.append(sentence)
            sentence = []
        else:
            sentence.append(row)
dataTrain = list(filter(None, dataTrain))            
dataTest = list(filter(None, dataTest))  
#print(dataTrain[0])

In [33]:
def word2features(sent, i):
    word = sent[i][0]
    #postag = sent[i][1]
    sentence = ''
    for w in sent:
        sentence += w[0] + " "
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word.onList': check_edit_distance_brand(sentence,i),
        'word.onListAbb': check_edit_distance_brand_abb(word)
        #'postag': postag,
        #'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        #postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            #'-1:onList': check_edit_distance_brand(sentence,(i - 1)),
            #'-1:postag': postag1,
            #'-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        #postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            #'+1:postag': postag1,
            #'+1:postag[:2]': postag1[:2],
            #'+1:onList': check_edit_distance_brand(sentence,(i + 1)),
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [34]:
X_train = [sent2features(s) for s in dataTrain]
y_train = [sent2labels(s) for s in dataTrain]

X_test = [sent2features(s) for s in dataTest]
y_test = [sent2labels(s) for s in dataTest]

In [35]:
X_unlabeled = [sent2features(s) for s in unlabeled]

In [116]:
'''
a = [1,2,3]
if a:
    print(a)
while(a):
    a.pop()
print(a)
print(X_unlabeled[0])
print(y_train[0])
i = 3
while(i > 0):
    i -= 3
    print('boo')
'''
#print(X_train[0][0])
#print(X_unlabeled[0][0])
json.dump(X_train, open("X_train.txt",'w'))
json.dump(y_train, open("y_train.txt",'w'))
json.dump(X_test, open("X_test.txt",'w'))
json.dump(y_test, open("y_test.txt",'w'))
json.dump(X_unlabeled, open("X_unlabeled.txt",'w'))
#d2 = json.load(open("X_train.txt"))
#print(d2[0][0])

In [117]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [37]:
y_pred = crf.predict(X_test)
labels = list(crf.classes_)
labels.remove('O') # remove 'O' label from evaluation
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0])) # group B and I results
print(flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

              precision    recall  f1-score   support

       B-BRA      0.641     0.543     0.588        46
       I-BRA      0.333     0.444     0.381         9
       B-PRO      0.500     0.190     0.276        21
       I-PRO      0.361     0.312     0.335       112
 B-PRO|B-BRA      0.779     0.688     0.731        77
 I-PRO|B-BRA      0.769     0.370     0.500        27
 B-PRO|B-TYP      0.000     0.000     0.000        11
 I-PRO|B-TYP      0.600     0.441     0.508        68
 I-PRO|I-BRA      0.750     0.536     0.625        28
 I-PRO|I-TYP      0.500     0.333     0.400        90
       B-TYP      0.000     0.000     0.000         9
       I-TYP      0.000     0.000     0.000         6

   micro avg      0.558     0.409     0.472       504
   macro avg      0.436     0.322     0.362       504
weighted avg      0.538     0.409     0.460       504



In [None]:
'''
for i in range(len(y_test)):
    print(y_test[i])
    print(y_pred[i])
    print(sequence_accuracy_score(y_test[i], y_pred[i]))
'''
X_train_sup = deepcopy(X_train)
y_train_sup = deepcopy(y_train)
X_unlabeled_sup = deepcopy(X_unlabeled)
unlabeled_sup = deepcopy(unlabeled)
upper = 0.95
lower = 0.9
w = 0
label_for_unlabeled = []
while(X_unlabeled_sup):#or w < 1 (default: X_unlabeled_sup)
    print(str(w) + '-' + str(len(X_unlabeled_sup)) + '- lower:' + str(lower) + ' - upper:' + str(upper))
    found = False
    num = []
    w += 1
    for z in range(len(X_unlabeled_sup)): #X_test/X_unlabeled
        a = crf.predict_marginals_single(X_unlabeled_sup[z])
        b = crf.predict_single(X_unlabeled_sup[z]) #same, but not used becaused it's already predicted before, it's in y_pred
        #b = y_pred[z]
        total = 0
        size = len(a)
        for i in range(size):
            total += a[i][b[i]]
        confidence = total / size
        #print(confidence)
        if confidence < upper and confidence > lower:
            found = True
            row = []
            row.append(unlabeled_sup[z])
            row.append(b)
            label_for_unlabeled.append(row)
            X_train_sup.append(X_unlabeled_sup[z])
            y_train_sup.append(b)
            num.append(z)
    if found == True:
        num.sort(reverse=True)
        for i in num:
            X_unlabeled_sup.pop(i)
            unlabeled_sup.pop(i)
        crf.fit(X_train_sup, y_train_sup)
    else:
        if lower > 0.6:
            lower -= 0.01
        else:
            lower -= 0.05
        if upper < 0.99:
            if upper >= 0.98:
                upper += 0.001
            else:
                upper += 0.005
        elif lower < 0.4:
            upper += 0.001
y_pred = crf.predict(X_test)
labels = list(crf.classes_)
labels.remove('O') # remove 'O' label from evaluation
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0])) # group B and I results
print(flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))
#print(label_for_unlabeled)

0-4488- lower:0.9 - upper:0.95
1-3819- lower:0.9 - upper:0.95
2-3495- lower:0.9 - upper:0.95
3-3375- lower:0.9 - upper:0.95
4-3300- lower:0.9 - upper:0.95
5-3263- lower:0.9 - upper:0.95
6-3238- lower:0.9 - upper:0.95
7-3224- lower:0.9 - upper:0.95
8-3215- lower:0.9 - upper:0.95
9-3209- lower:0.9 - upper:0.95
10-3204- lower:0.9 - upper:0.95
11-3202- lower:0.9 - upper:0.95
12-3202- lower:0.89 - upper:0.955
13-3200- lower:0.89 - upper:0.955
14-3199- lower:0.89 - upper:0.955
15-3198- lower:0.89 - upper:0.955
16-3198- lower:0.88 - upper:0.96
17-3192- lower:0.88 - upper:0.96
18-3187- lower:0.88 - upper:0.96
19-3182- lower:0.88 - upper:0.96
20-3175- lower:0.88 - upper:0.96
21-3168- lower:0.88 - upper:0.96
22-3164- lower:0.88 - upper:0.96
23-3161- lower:0.88 - upper:0.96
24-3160- lower:0.88 - upper:0.96
25-3157- lower:0.88 - upper:0.96
26-3156- lower:0.88 - upper:0.96
27-3151- lower:0.88 - upper:0.96
28-3148- lower:0.88 - upper:0.96
29-3147- lower:0.88 - upper:0.96
30-3147- lower:0.87 - upper:

In [None]:
dataTest_labels = []
for sentence in dataTest:
    aa = [token[1] for token in sentence]
    dataTest_labels.append(aa)
    
dataTest_token = []
for sentence in dataTest:
    aa = [token[0] for token in sentence]
    dataTest_token.append(aa)

#print(dataTest_labels[0])
#print(dataTest_token[0])

In [None]:
bra_data = []
bra_guess = []
typ_data = []
typ_guess = []
pro_data = []
pro_guess = []
o_data = []
o_guess = []
for i in range(len(dataTest)):
    bra_data_row = []
    bra_guess_row = []
    typ_data_row = []
    typ_guess_row = []
    pro_data_row = []
    pro_guess_row = []
    o_data_row = []
    o_guess_row = []
    sen_pro_data = ''
    sen_pro_guess = ''
    sen_typ_data = ''
    sen_typ_guess = ''
    sen_bra_data = ''
    sen_bra_guess = ''
    for j in range(len(dataTest[i])):        
        if dataTest_labels[i][j] == 'O':
            o_data_row.append(dataTest_token[i][j])
            if sen_pro_data:
                pro_data_row.append(sen_pro_data)
                sen_pro_data = ''
            elif sen_bra_data:
                bra_data_row.append(sen_bra_data)
                sen_bra_data = ''
            elif sen_typ_data:
                typ_data_row.append(sen_typ_data)
                sen_typ_data = ''
        else:
            if '|' in dataTest_labels[i][j]:
                if sen_pro_data and dataTest_labels[i][j][0] == 'B':
                    pro_data_row.append(sen_pro_data)
                    sen_pro_data = ''
                    sen_pro_data += dataTest_token[i][j]
                elif dataTest_labels[i][j][0] == 'B':
                    sen_pro_data += dataTest_token[i][j]
                elif dataTest_labels[i][j][0] == 'I':
                    sen_pro_data += ' ' + dataTest_token[i][j]
                if dataTest_labels[i][j][8:] == 'BRA':
                    if sen_bra_data and dataTest_labels[i][j][6:7] == 'B':
                        bra_data_row.append(sen_bra_data)
                        sen_bra_data = ''
                        sen_bra_data += dataTest_token[i][j]
                    elif dataTest_labels[i][j][6:7] == 'B':
                        sen_bra_data += dataTest_token[i][j]
                    elif dataTest_labels[i][j][6:7] == 'I':
                        sen_bra_data += ' ' + dataTest_token[i][j]
                elif dataTest_labels[i][j][8:] == 'TYP':
                    if sen_typ_data and dataTest_labels[i][j][6:7] == 'B':
                        typ_data_row.append(sen_typ_data)
                        sen_typ_data = ''
                        sen_typ_data += dataTest_token[i][j]
                    elif dataTest_labels[i][j][6:7] == 'B':
                        sen_typ_data += dataTest_token[i][j]
                    elif dataTest_labels[i][j][6:7] == 'I':
                        sen_typ_data += ' ' + dataTest_token[i][j]
            elif dataTest_labels[i][j][2:5] == 'PRO':
                if sen_pro_data and dataTest_labels[i][j][0] == 'B':
                    pro_data_row.append(sen_pro_data)
                    sen_pro_data = ''
                    sen_pro_data += dataTest_token[i][j]
                elif dataTest_labels[i][j][0] == 'B':
                    sen_pro_data += dataTest_token[i][j]
                elif dataTest_labels[i][j][0] == 'I':
                    sen_pro_data += ' ' + dataTest_token[i][j]
            elif dataTest_labels[i][j][2:5] == 'BRA':
                if sen_bra_data and dataTest_labels[i][j][0] == 'B':
                    bra_data_row.append(sen_bra_data)
                    sen_bra_data = ''
                    sen_bra_data += dataTest_token[i][j]
                elif dataTest_labels[i][j][0] == 'B':
                    sen_bra_data += dataTest_token[i][j]
                elif dataTest_labels[i][j][0] == 'I':
                    sen_bra_data += ' ' + dataTest_token[i][j]
            elif dataTest_labels[i][j][2:5] == 'TYP':
                if sen_typ_data and dataTest_labels[i][j][0] == 'B':
                    typ_data_row.append(sen_typ_data)
                    sen_typ_data = ''
                    sen_typ_data += dataTest_token[i][j]
                elif dataTest_labels[i][j][0] == 'B':
                    sen_typ_data += dataTest_token[i][j]
                elif dataTest_labels[i][j][0] == 'I':
                    sen_typ_data += ' ' + dataTest_token[i][j]
        if y_pred[i][j] == 'O':
            o_guess_row.append(dataTest_token[i][j])
            if sen_pro_guess:
                pro_guess_row.append(sen_pro_guess)
                sen_pro_guess = ''
            elif sen_bra_guess:
                bra_guess_row.append(sen_bra_guess)
                sen_bra_guess = ''
            elif sen_typ_guess:
                typ_guess_row.append(sen_typ_guess)
                sen_typ_guess = ''
        else:
            if '|' in y_pred[i][j]:
                if sen_pro_guess and y_pred[i][j][0] == 'B':
                    pro_guess_row.append(sen_pro_guess)
                    sen_pro_guess = ''
                    sen_pro_guess += dataTest_token[i][j]
                elif y_pred[i][j][0] == 'B':
                    sen_pro_guess += dataTest_token[i][j]
                elif y_pred[i][j][0] == 'I':
                    sen_pro_guess += ' ' + dataTest_token[i][j]
                if y_pred[i][j][8:] == 'BRA':
                    if sen_bra_guess and y_pred[i][j][6:7] == 'B':
                        bra_guess_row.append(sen_bra_guess)
                        sen_bra_guess = ''
                        sen_bra_guess += dataTest_token[i][j]
                    elif y_pred[i][j][6:7] == 'B':
                        sen_bra_guess += dataTest_token[i][j]
                    elif y_pred[i][j][6:7] == 'I':
                        sen_bra_guess += ' ' + dataTest_token[i][j]
                elif y_pred[i][j][8:] == 'TYP':
                    if sen_typ_guess and y_pred[i][j][6:7] == 'B':
                        typ_guess_row.append(sen_typ_guess)
                        sen_typ_guess = ''
                        sen_typ_guess += dataTest_token[i][j]
                    elif y_pred[i][j][6:7] == 'B':
                        sen_typ_guess += dataTest_token[i][j]
                    elif y_pred[i][j][6:7] == 'I':
                        sen_typ_guess += ' ' + dataTest_token[i][j]
            elif y_pred[i][j][2:5] == 'PRO':
                if sen_pro_guess and y_pred[i][j][0] == 'B':
                    pro_guess_row.append(sen_pro_guess)
                    sen_pro_guess = ''
                    sen_pro_guess += dataTest_token[i][j]
                elif y_pred[i][j][0] == 'B':
                    sen_pro_guess += dataTest_token[i][j]
                elif y_pred[i][j][0] == 'I':
                    sen_pro_guess += ' ' + dataTest_token[i][j]
            elif y_pred[i][j][2:5] == 'BRA':
                if sen_bra_guess and y_pred[i][j][0] == 'B':
                    bra_guess_row.append(sen_bra_guess)
                    sen_bra_guess = ''
                    sen_bra_guess += dataTest_token[i][j]
                elif y_pred[i][j][0] == 'B':
                    sen_bra_guess += dataTest_token[i][j]
                elif y_pred[i][j][0] == 'I':
                    sen_bra_guess += ' ' + dataTest_token[i][j]
            elif y_pred[i][j][2:5] == 'TYP':
                if sen_typ_guess and y_pred[i][j][0] == 'B':
                    typ_guess_row.append(sen_typ_guess)
                    sen_typ_guess = ''
                    sen_typ_guess += dataTest_token[i][j]
                elif y_pred[i][j][0] == 'B':
                    sen_typ_guess += dataTest_token[i][j]
                elif y_pred[i][j][0] == 'I':
                    sen_typ_guess += ' ' + dataTest_token[i][j]
    if sen_pro_data:
        pro_data_row.append(sen_pro_data)
    elif sen_bra_data:
        bra_data_row.append(sen_bra_data)
    elif sen_typ_data:
        typ_data_row.append(sen_typ_data)
    if sen_pro_guess:
        pro_guess_row.append(sen_pro_guess)
    elif sen_bra_guess:
        bra_guess_row.append(sen_bra_guess)
    elif sen_typ_guess:
        typ_guess_row.append(sen_typ_guess)
    o_data.append(o_data_row)
    o_guess.append(o_guess_row)
    pro_data.append(pro_data_row)
    pro_guess.append(pro_guess_row)
    bra_data.append(bra_data_row)
    bra_guess.append(bra_guess_row)
    typ_data.append(typ_data_row)
    typ_guess.append(typ_guess_row)
'''
for i in range(len(bra_data)):
    print('PRO')
    print(pro_data[i])
    print(pro_guess[i])
    print('BRA')
    print(bra_data[i])
    print(bra_guess[i])
    print('TYP')
    print(typ_data[i])
    print(typ_guess[i])
    print('---------')
'''
#none is the actual number from data test, r is number of guess right, g is number of guess
bra = 0
bra_r = 0
bra_g = 0
typ = 0
typ_r = 0
typ_g = 0
pro = 0
pro_r = 0
pro_g = 0
o = 0
o_r = 0
o_g = 0
for i in range(len(pro_data)):
    for j in range(len(pro_data[i])):
        pro += 1
    for j in range(len(pro_guess[i])):
        pro_g += 1
        if pro_guess[i][j] in pro_data[i]:
            pro_r += 1
            pro_data[i].remove(pro_guess[i][j])
    for j in range(len(bra_data[i])):
        bra += 1
    for j in range(len(bra_guess[i])):
        bra_g += 1
        if bra_guess[i][j] in bra_data[i]:
            bra_r += 1
            bra_data[i].remove(bra_guess[i][j])
    for j in range(len(typ_data[i])):
        typ += 1
    for j in range(len(typ_guess[i])):
        typ_g += 1
        if typ_guess[i][j] in typ_data[i]:
            typ_r += 1
            typ_data[i].remove(typ_guess[i][j])
    for j in range(len(o_data[i])):
        o += 1
    for j in range(len(o_guess[i])):
        o_g += 1
        if o_guess[i][j] in o_data[i]:
            o_r += 1
            o_data[i].remove(o_guess[i][j])

print('PRO')
precision_pro = pro_r/pro_g
recall_pro = pro_r/pro
f1_pro = 2 * ((precision_pro * recall_pro)/(precision_pro + recall_pro))
print('Precision : ' + str(precision_pro))
print('Recall : ' + str(recall_pro))
print('F1 : ' + str(f1_pro))
print('BRA')
precision_bra = bra_r/bra_g
recall_bra = bra_r/bra
f1_bra = 2 * ((precision_bra * recall_bra)/(precision_bra + recall_bra))
print('Precision : ' + str(precision_bra))
print('Recall : ' + str(recall_bra))
print('F1 : ' + str(f1_bra))
print('TYP')
precision_typ = typ_r/typ_g
recall_typ = typ_r/typ
f1_typ = 2 * ((precision_typ * recall_typ)/(precision_typ + recall_typ))
print('Precision : ' + str(precision_typ))
print('Recall : ' + str(recall_typ))
print('F1 : ' + str(f1_typ))
print('O')
precision_o = o_r/o_g
recall_o = o_r/o
f1_o = 2 * ((precision_o * recall_o)/(precision_o + recall_o))
print('Precision : ' + str(precision_o))
print('Recall : ' + str(recall_o))
print('F1 : ' + str(f1_o))
print('Overall Without O')
total = pro + typ + bra
precision = ((pro * precision_pro) + (bra * precision_bra) + (typ * precision_typ)) / total
recall = ((pro * recall_pro) + (bra * recall_bra) + (typ * recall_typ)) / total
f1 = ((pro * f1_pro) + (bra * f1_bra) + (typ * f1_typ)) / total
print('Precision : ' + str(precision))
print('Recall : ' + str(recall))
print('F1 : ' + str(f1))
print('Overall With O')
total = pro + typ + bra + o
precision = ((pro * precision_pro) + (bra * precision_bra) + (typ * precision_typ) + (o * precision_o)) / total
recall = ((pro * recall_pro) + (bra * recall_bra) + (typ * recall_typ) + (o * recall_o)) / total
f1 = ((pro * f1_pro) + (bra * f1_bra) + (typ * f1_typ) + (o * f1_o)) / total
print('Precision : ' + str(precision))
print('Recall : ' + str(recall))
print('F1 : ' + str(f1))

In [None]:
f = open('labeled_automatically (4).tsv','w', encoding='utf-8') 
for i in range(len(label_for_unlabeled)):
    for j in range(len(label_for_unlabeled[i][0])):
        f.write(label_for_unlabeled[i][0][j])
        f.write('\t')
        f.write(label_for_unlabeled[i][1][j])
        f.write('\n')
        #print(label_for_unlabeled[i][j])
        #print(label_for_unlabeled[i + 1][j])
    f.write('\n')
f.write('\n')
f.close()