In [1]:
import csv
import nltk
import unicodedata
import numpy as np
from weighted_levenshtein import lev, osa, dam_lev
from string import ascii_lowercase
from copy import deepcopy
import json
import re
from collections import Counter
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite.metrics import sequence_accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers

In [None]:
#read brand and brand abbreviation for the edit distance
brand = []
with open('brand.csv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    for row in reader:
        brand.append(unicodedata.normalize('NFKD', row[0]).encode('ascii','ignore'))
#print(brand)

brand_abb = []
with open('brand_singkatan.csv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    for row in reader:
        brand_abb.append(unicodedata.normalize('NFKD', row[0]).encode('ascii','ignore'))
#print(brand_abb)

In [None]:
#the edit distance function
alfha = 0.4

insert_costs = np.full(128, 100, dtype=np.float64)
insert_costs[ord('-')] = 10
insert_costs[ord(' ')] = 10

delete_costs = np.full(128, 100, dtype=np.float64)
delete_costs[ord('-')] = 10
delete_costs[ord(' ')] = 10

substitute_costs = np.full((128,128), 50, dtype=np.float64)
for c in ascii_lowercase:
    substitute_costs[ord(c), ord(c.capitalize())] = 10
    substitute_costs[ord(c), ord(c)] = 0
    substitute_costs[ord(c.capitalize()), ord(c)] = 10
    substitute_costs[ord(c.capitalize()), ord(c.capitalize())] = 0
substitute_costs[ord('-'), ord(' ')] = 10
substitute_costs[ord(' '), ord('-')] = 10
for i in range(10):
    for j in range(10):
        if i == j:
            substitute_costs[ord(str(i)), ord(str(j))] = 0
            substitute_costs[ord(str(j)), ord(str(i))] = 0
        else:
            substitute_costs[ord(str(i)), ord(str(j))] = 10
            substitute_costs[ord(str(j)), ord(str(i))] = 10

def edit_distance_normalized_cost(word, target):
    cost = lev(word, target, insert_costs=insert_costs, delete_costs=delete_costs, substitute_costs=substitute_costs)
    return (cost + alfha) / len(target)

def check_under_threshold(cost, threshold):
    if cost <= threshold:
        return True
    else:
        return False

def check_edit_distance_brand(sentence, pos):
    threshold = 15
    words = sentence.split()
    candidate = []
    candidate.append(words[pos])
    if pos >= 0:
        if pos < (len(words) - 1):
            candidate.append(words[pos] + " " + words[pos + 1])
        #if pos < (len(words) - 2):
        #      candidate.append(words[pos] + " " + words[pos + 1] + " " + words[pos + 2])
    if (pos - 1) >= 0:
        candidate.append(words[pos - 1] + " " + words[pos])
        if pos < (len(words) - 1):
            candidate.append(words[pos - 1] + " " + words[pos] + " " + words[pos + 1])
        #if pos < (len(words) - 2):
        #    candidate.append(words[pos - 1] + " " + words[pos] + " " + words[pos + 1] + " " + words[pos + 2])
    #if (pos - 2) >= 0:
    #    candidate.append(words[pos - 2] + " " + words[pos - 1] + " " + words[pos])
        #if pos < (len(words) - 1):
        #    candidate.append(words[pos - 2] + " " + words[pos - 1] + " " + words[pos] + " " + words[pos + 1])
        #if pos < (len(words) - 2):
        #    candidate.append(words[pos - 2] + " " + words[pos - 1] + " " + words[pos] + " " + words[pos + 1] + " " + words[pos + 2])
    candidate.sort(key = lambda s: len(s))
    exist = False
    for c in candidate:
        for b in brand:
            zzzz = unicodedata.normalize('NFKD', c).encode('ascii','ignore')
            if check_under_threshold(edit_distance_normalized_cost(zzzz,b),threshold):
                exist = True
                break
    return exist
#print(check_edit_distance_brand('Acquarella',0))

def check_edit_distance_brand_abb(word):
    threshold = 5
    exist = False
    for b in brand_abb:
        zzzz = unicodedata.normalize('NFKD', word).encode('ascii','ignore')
        if check_under_threshold(edit_distance_normalized_cost(zzzz,b),threshold) :
            exist = True
            break
    return exist
#print(check_edit_distance_brand_abb('Bb'))

common_token_before = ['pake','pakai','ama','sama','yang','yg','si','dari','dr','merk','pakek','pk','coba','merek','brand','by']
def check_edit_distance_common(word):
    threshold = 5
    exist = False
    for b in common_token_before:
        zzzz = unicodedata.normalize('NFKD', word).encode('ascii','ignore')
        if check_under_threshold(edit_distance_normalized_cost(zzzz,b),threshold) :
            exist = True
            break
    return exist
#print(check_edit_distance_common('dr'))
def list_indication(word):
    if bool(re.match(r"[0-9].", word)):
        return True
    elif word == '-' or word == ',' or word == 'dan' or word == '&' or word == 'and' or word == '*' or word =='+':
        return True
    else:
        return False
    
thing_in_pro = ['shampoo', 'conditioner', 'volumizer cream', 'cat rambut', 'serum', 'lipstik', 'lipstick', 'l/p', 'bb cream', 'eyeliner', 'mascara', 'foundation', 'foundi', 'brush', 'brushes', 'pressed powder', 'eyeshadow cream', 'liquid liner', 'palette', 'bronzer', 'blush', 'eyeshadow', 'liquid lipstick', 'concealer', 'lipbalm', 'cheek stain', 'compact powder', 'powder', 'lip stain', 'lip butter', 'lip velvet', 'baby oil', 'overnight serum', 'toner', 'face wash', 'lotion', 'uv milk', 'cleansing oil', 'cleanser', 'cream', 'remover', 'essence', 'moist', 'moisturizer', 'facial wash', 'facial foam', 'mask sheet', 'brightening foam', 'make up remover', 'sabun', 'night cream', 'pelembab', 'lulur', 'skin food', 'sunblock', 'facial mask', 'peel off', 'remover', 'gel eyeliner', 'aminexil', 'curling oil', 'sisir', 'hair color', 'hairspray', 'swatch', 'e/s', 'e/l', 'l/s', 'eyelash curler', 'lip glaze', 'fluidline', 'bedak', 'deodorants', 'pensil alis', 'petroleum jelly', 'milk cleanser', 'nail polish', 'cleansing foam', 'cleansing milk', 'lipnicure', 'lip liner', 'lip velvet', 'dupe', 'loose powder', 'bbc', 'bb', 'tinted moisturizer','TM', 'eye cream', 'spray', 'face spray', 'facial mist', 'air mawar', 'lip filler', 'cleansing balm', 'oil', 'c/o', 'clay mask', 'sunscreen', 'moisturizing lotion', 'lip scrub', 'skin conditioner', 'lip therapy', 'mousse', 'seri','line','range', 'erase paste', 'tweezer', 'blush on', 'baby powder', 'bath soap', 'hydrosol', 'mask', 'wipes', 'masker']
def check_edit_distance_thing(sentence, pos):
    threshold = 10
    words = sentence.split()
    candidate = []
    candidate.append(words[pos])
    if pos >= 0:
        if pos < (len(words) - 1):
            candidate.append(words[pos] + " " + words[pos + 1])
    if (pos - 1) >= 0:
        candidate.append(words[pos - 1] + " " + words[pos])
        if pos < (len(words) - 1):
            candidate.append(words[pos - 1] + " " + words[pos] + " " + words[pos + 1])
    candidate.sort(key = lambda s: len(s))
    exist = False
    for c in candidate:
        for b in thing_in_pro:
            zzzz = unicodedata.normalize('NFKD', c).encode('ascii','ignore')
            if check_under_threshold(edit_distance_normalized_cost(zzzz,b),threshold):
                exist = True
                break
    return exist
#print(check_edit_distance_thing('shampoo batangan',0))

In [None]:
#read unlabeled data and tokenize it
unlabeled = []
with open("unlabeled.txt", encoding='utf-8') as fd:
    for line in fd:
        sentence = line
        tokens = nltk.tokenize.word_tokenize(sentence)
        unlabeled.append(tokens)
#print(unlabeled[2])
#json.dump(unlabeled, open("unlabeled_tokenized.txt",'w'))

In [2]:
#read dataTrain and dataTest
dataTrain = []
with open("dataTrain_!O.tsv", encoding='utf-8') as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    sentence = []
    for row in rd:
        if not row:
            dataTrain.append(sentence)
            sentence = []
        else:
            sentence.append(row)

dataTest = []
with open("dataTest_!O.tsv", encoding='utf-8') as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    sentence = []
    for row in rd:
        if not row:
            dataTest.append(sentence)
            sentence = []
        else:
            sentence.append(row)
dataTrain = list(filter(None, dataTrain))            
dataTest = list(filter(None, dataTest))  
#print(dataTrain[0])

In [3]:
#feature extraction
def word2features(sent, i):
    word = sent[i][0]
    #postag = sent[i][1]
    sentence = ''
    for w in sent:
        sentence += w[0] + " "
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        #'word.onThingList': check_edit_distance_thing(sentence,i),
        'word.indicateList': list_indication(word),
        #'word.onCommonList': check_edit_distance_common(word),
        #'word.onList': check_edit_distance_brand(sentence,i), #on list brand
        #'word.onListAbb': check_edit_distance_brand_abb(word) #on list brand abbreviation
        #'postag': postag,
        #'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        #postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            #'-1:word.onThingList': check_edit_distance_thing(sentence,(i - 1)),
            #'-1:word.indicateList': list_indication(word1),
            #'-1:word.onCommonList': check_edit_distance_common(word1),
            #'-1:onList': check_edit_distance_brand(sentence,(i - 1)),
            #'-1:postag': postag1,
            #'-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    '''
    if i > 1:
        word1 = sent[i-2][0]
        #postag1 = sent[i-1][1]
        features.update({
            '-2:word.lower()': word1.lower(),
            '-2:word.istitle()': word1.istitle(),
            '-2:word.isupper()': word1.isupper(),
        })
    if i < len(sent)-2:
        word1 = sent[i+2][0]
        #postag1 = sent[i+1][1]
        features.update({
            '+2:word.lower()': word1.lower(),
            '+2:word.istitle()': word1.istitle(),
            '+2:word.isupper()': word1.isupper(),
        })
    '''
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        #postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            #'+1:word.onThingList': check_edit_distance_thing(sentence,(i + 1)),
            #'+1:word.indicateList': list_indication(word1),
            #'+1:word.onCommonList': check_edit_distance_common(word1),
            #'+1:postag': postag1,
            #'+1:postag[:2]': postag1[:2],
            #'+1:onList': check_edit_distance_brand(sentence,(i + 1)),
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [None]:
#extract features and label from data test and train
X_train = [sent2features(s) for s in dataTrain]
y_train = [sent2labels(s) for s in dataTrain]

X_test = [sent2features(s) for s in dataTest]
y_test = [sent2labels(s) for s in dataTest]
#print(X_train[0][1])

In [None]:
#add previous extracted features
X_train_read = json.load(open("var/2/X_train_!O.txt"))
for i in range(len(X_train)):
    for j in range(len(X_train[i])):
        X_train[i][j]['word.onList'] = X_train_read[i][j]['word.onList']
        X_train[i][j]['word.onListAbb'] = X_train_read[i][j]['word.onListAbb']
        X_train[i][j]['word.onThingList'] = X_train_read[i][j]['word.onThingList']
        X_train[i][j]['word.onCommonList'] = X_train_read[i][j]['word.onCommonList']
X_test_read = json.load(open("var/2/X_test_!O.txt"))
for i in range(len(X_test)):
    for j in range(len(X_test[i])):
        X_test[i][j]['word.onList'] = X_test_read[i][j]['word.onList']
        X_test[i][j]['word.onListAbb'] = X_test_read[i][j]['word.onListAbb']
        X_test[i][j]['word.onThingList'] = X_test_read[i][j]['word.onThingList']
        X_test[i][j]['word.onCommonList'] = X_test_read[i][j]['word.onCommonList']
#print(X_train[0][1])

In [4]:
#load saved feature and label if don't need (or want) to re-extraxt the feature and label
X_train = json.load(open("var/2/X_train_!O.txt"))
#y_train = json.load(open("var/2/y_train_!O_updated.txt"))
y_train = [sent2labels(s) for s in dataTrain]
X_test = json.load(open("var/2/X_test_!O.txt"))
#y_test = json.load(open("var/2/y_test_!O_updated.txt"))
y_test = [sent2labels(s) for s in dataTest]
X_unlabeled = json.load(open("var/2/X_unlabeled_new.txt"))
unlabeled = json.load(open("var/2/unlabeled_tokenized_new.txt"))

In [6]:
for i in range(len(X_train)):
    for j in range(len(X_train[i])):
        X_train[i][j].pop('word.onList')
        X_train[i][j].pop('word.onListAbb')
for i in range(len(X_test)):
    for j in range(len(X_test[i])):
        X_test[i][j].pop('word.onList')
        X_test[i][j].pop('word.onListAbb')

In [8]:
#add joint features & neighbour's features
for i in range(len(X_train)):
    for j in range(len(X_train[i])):
        if j > 0:
            X_train[i][j]['jointFeatures'] = []
            #if X_train[i][j - 1]['word.onThingList'] == True and (X_train[i][j]['word.onList'] == True or X_train[i][j]['word.onListAbb'] == True):
            #    X_train[i][j]['jointFeatures'].append('Thing+Brand')
            #if X_train[i][j]['word.onThingList'] == True and (X_train[i][j - 1]['word.onList'] == True or X_train[i][j - 1]['word.onListAbb'] == True):
            #    X_train[i][j]['jointFeatures'].append('Brand+Thing')
            #if X_train[i][j - 1]['word.onCommonList'] == True and (X_train[i][j]['word.onList'] == True or X_train[i][j]['word.onListAbb'] == True):
            #    X_train[i][j]['jointFeatures'].append('Common+Brand')
            #if X_train[i][j]['word.onCommonList'] == True and (X_train[i][j - 1]['word.onList'] == True or X_train[i][j - 1]['word.onListAbb'] == True):
            #    X_train[i][j]['jointFeatures'].append('Brand+Common')
            #if X_train[i][j - 1]['word.indicateList'] == True and (X_train[i][j]['word.onList'] == True or X_train[i][j]['word.onListAbb'] == True):
            #    X_train[i][j]['jointFeatures'].append('Indication+Brand')
            #if X_train[i][j]['word.indicateList'] == True and (X_train[i][j - 1]['word.onList'] == True or X_train[i][j - 1]['word.onListAbb'] == True):
            #    X_train[i][j]['jointFeatures'].append('Brand+Indication')
            if X_train[i][j - 1]['word.onThingList'] == True and X_train[i][j]['word.indicateList'] == True:
                X_train[i][j]['jointFeatures'].append('Thing+Indication')
            if X_train[i][j]['word.onThingList'] == True and X_train[i][j - 1]['word.indicateList'] == True:
                X_train[i][j]['jointFeatures'].append('Indication+Thing')
            if X_train[i][j - 1]['word.onThingList'] == True and X_train[i][j]['word.onCommonList'] == True:
                X_train[i][j]['jointFeatures'].append('Thing+Common')
            if X_train[i][j]['word.onThingList'] == True and X_train[i][j - 1]['word.onCommonList'] == True:
                X_train[i][j]['jointFeatures'].append('Common+Thing')
            if X_train[i][j - 1]['word.indicateList'] == True and X_train[i][j]['word.onCommonList'] == True:
                X_train[i][j]['jointFeatures'].append('Indication+Common')
            if X_train[i][j]['word.indicateList'] == True and X_train[i][j - 1]['word.onCommonList'] == True:
                X_train[i][j]['jointFeatures'].append('Common+Indication')
            '''
            if X_train[i][j - 1]['word.onCommonList'] == True and X_train[i][j]['word.isupper()'] == True:
                X_train[i][j]['jointFeatures'].append('Common+Upper')
            if X_train[i][j - 1]['word.onCommonList'] == True and X_train[i][j]['word.istitle()'] == True:
                X_train[i][j]['jointFeatures'].append('Common+Title')
            if X_train[i][j - 1]['word.indicateList'] == True and X_train[i][j]['word.isupper()'] == True:
                X_train[i][j]['jointFeatures'].append('Indication+Upper')
            if X_train[i][j - 1]['word.indicateList'] == True and X_train[i][j]['word.istitle()'] == True:
                X_train[i][j]['jointFeatures'].append('Indication+Title')
            if X_train[i][j - 1]['word.isupper()'] == True and (X_train[i][j]['word.onList'] == True or X_train[i][j]['word.onListAbb'] == True):
                X_train[i][j]['jointFeatures'].append('Upper+Brand')
            if X_train[i][j]['word.isupper()'] == True and (X_train[i][j - 1]['word.onList'] == True or X_train[i][j - 1]['word.onListAbb'] == True):
                X_train[i][j]['jointFeatures'].append('Brand+Upper')
            if X_train[i][j - 1]['word.istitle()'] == True and (X_train[i][j]['word.onList'] == True or X_train[i][j]['word.onListAbb'] == True):
                X_train[i][j]['jointFeatures'].append('Title+Brand')
            if X_train[i][j]['word.istitle()'] == True and (X_train[i][j - 1]['word.onList'] == True or X_train[i][j - 1]['word.onListAbb'] == True):
                X_train[i][j]['jointFeatures'].append('Brand+Title')
            if X_train[i][j - 1]['word.isupper()'] == True and X_train[i][j]['word.onThingList'] == True:
                X_train[i][j]['jointFeatures'].append('Upper+Thing')
            if X_train[i][j]['word.isupper()'] == True and X_train[i][j - 1]['word.onThingList'] == True:
                X_train[i][j]['jointFeatures'].append('Thing+Upper')
            if X_train[i][j - 1]['word.istitle()'] == True and X_train[i][j]['word.onThingList'] == True:
                X_train[i][j]['jointFeatures'].append('Title+Thing')
            if X_train[i][j]['word.istitle()'] == True and X_train[i][j - 1]['word.onThingList'] == True:
                X_train[i][j]['jointFeatures'].append('Thing+Title')
            '''
        if j >= 0:
            if j < (len(X_train[i]) - 1):
                #if (X_train[i][j + 1]['word.onList'] == True or X_train[i][j + 1]['word.onListAbb'] == True):
                #    X_train[i][j]['+1:neigbourFeatures'] = 'Brand'
                if X_train[i][j + 1]['word.onThingList'] == True:
                    X_train[i][j]['+1:neigbourFeatures'] = 'Thing'
                elif X_train[i][j + 1]['word.indicateList'] == True:
                    X_train[i][j]['+1:neigbourFeatures'] = 'Indicate'
                elif X_train[i][j + 1]['word.onCommonList'] == True:
                    X_train[i][j]['+1:neigbourFeatures'] = 'Common'
            if j < (len(X_train[i]) - 2):
                #if (X_train[i][j + 2]['word.onList'] == True or X_train[i][j + 2]['word.onListAbb'] == True):
                #    X_train[i][j]['+2:neigbourFeatures'] = 'Brand'
                if X_train[i][j + 2]['word.onThingList'] == True:
                    X_train[i][j]['+2:neigbourFeatures'] = 'Thing'
                elif X_train[i][j + 2]['word.indicateList'] == True:
                    X_train[i][j]['+2:neigbourFeatures'] = 'Indicate'
                elif X_train[i][j + 2]['word.onCommonList'] == True:
                    X_train[i][j]['+2:neigbourFeatures'] = 'Common'
        if (j - 1) >= 0:
            #if (X_train[i][j - 1]['word.onList'] == True or X_train[i][j - 1]['word.onListAbb'] == True):
            #    X_train[i][j]['-1:neigbourFeatures'] = 'Brand'
            if X_train[i][j - 1]['word.onThingList'] == True:
                X_train[i][j]['-1:neigbourFeatures'] = 'Thing'
            elif X_train[i][j - 1]['word.indicateList'] == True:
                X_train[i][j]['-1:neigbourFeatures'] = 'Indicate'
            elif X_train[i][j - 1]['word.onCommonList'] == True:
                X_train[i][j]['-1:neigbourFeatures'] = 'Common'
        if (j - 2) >= 0:
            #if (X_train[i][j - 2]['word.onList'] == True or X_train[i][j - 2]['word.onListAbb'] == True):
            #    X_train[i][j]['-2:neigbourFeatures'] = 'Brand'
            if X_train[i][j - 2]['word.onThingList'] == True:
                X_train[i][j]['-2:neigbourFeatures'] = 'Thing'
            elif X_train[i][j - 2]['word.indicateList'] == True:
                X_train[i][j]['-2:neigbourFeatures'] = 'Indicate'
            elif X_train[i][j - 2]['word.onCommonList'] == True:
                X_train[i][j]['-2:neigbourFeatures'] = 'Common'
for i in range(len(X_test)):
    for j in range(len(X_test[i])):        
        if j > 0:
            X_test[i][j]['jointFeatures'] = []
            #if X_test[i][j - 1]['word.onThingList'] == True and (X_test[i][j]['word.onList'] == True or X_test[i][j]['word.onListAbb'] == True):
            #    X_test[i][j]['jointFeatures'].append('Thing+Brand')
            #if X_test[i][j]['word.onThingList'] == True and (X_test[i][j - 1]['word.onList'] == True or X_test[i][j - 1]['word.onListAbb'] == True):
            #    X_test[i][j]['jointFeatures'].append('Brand+Thing')
            #if X_test[i][j - 1]['word.onCommonList'] == True and (X_test[i][j]['word.onList'] == True or X_test[i][j]['word.onListAbb'] == True):
            #    X_test[i][j]['jointFeatures'].append('Common+Brand')
            #if X_test[i][j]['word.onCommonList'] == True and (X_test[i][j - 1]['word.onList'] == True or X_test[i][j - 1]['word.onListAbb'] == True):
            #    X_test[i][j]['jointFeatures'].append('Brand+Common')
            #if X_test[i][j - 1]['word.indicateList'] == True and (X_test[i][j]['word.onList'] == True or X_test[i][j]['word.onListAbb'] == True):
            #    X_test[i][j]['jointFeatures'].append('Indication+Brand')
            #if X_test[i][j]['word.indicateList'] == True and (X_test[i][j - 1]['word.onList'] == True or X_test[i][j - 1]['word.onListAbb'] == True):
            #    X_test[i][j]['jointFeatures'].append('Brand+Indication')
            if X_test[i][j - 1]['word.onThingList'] == True and X_test[i][j]['word.indicateList'] == True:
                X_test[i][j]['jointFeatures'].append('Thing+Indication')
            if X_test[i][j]['word.onThingList'] == True and X_test[i][j - 1]['word.indicateList'] == True:
                X_test[i][j]['jointFeatures'].append('Indication+Thing')
            if X_test[i][j - 1]['word.onThingList'] == True and X_test[i][j]['word.onCommonList'] == True:
                X_test[i][j]['jointFeatures'].append('Thing+Common')
            if X_test[i][j]['word.onThingList'] == True and X_test[i][j - 1]['word.onCommonList'] == True:
                X_test[i][j]['jointFeatures'].append('Common+Thing')
            if X_test[i][j - 1]['word.indicateList'] == True and X_test[i][j]['word.onCommonList'] == True:
                X_test[i][j]['jointFeatures'].append('Indication+Common')
            if X_test[i][j]['word.indicateList'] == True and X_test[i][j - 1]['word.onCommonList'] == True:
                X_test[i][j]['jointFeatures'].append('Common+Indication')    
            '''
            if X_test[i][j - 1]['word.onCommonList'] == True and X_test[i][j]['word.isupper()'] == True:
                X_test[i][j]['jointFeatures'].append('Common+Upper')
            if X_test[i][j - 1]['word.onCommonList'] == True and X_test[i][j]['word.istitle()'] == True:
                X_test[i][j]['jointFeatures'].append('Common+Title')
            if X_test[i][j - 1]['word.indicateList'] == True and X_test[i][j]['word.isupper()'] == True:
                X_test[i][j]['jointFeatures'].append('Indication+Upper')
            if X_test[i][j - 1]['word.indicateList'] == True and X_test[i][j]['word.istitle()'] == True:
                X_test[i][j]['jointFeatures'].append('Indication+Title')
            if X_test[i][j - 1]['word.isupper()'] == True and (X_test[i][j]['word.onList'] == True or X_test[i][j]['word.onListAbb'] == True):
                X_test[i][j]['jointFeatures'].append('Upper+Brand')
            if X_test[i][j]['word.isupper()'] == True and (X_test[i][j - 1]['word.onList'] == True or X_test[i][j - 1]['word.onListAbb'] == True):
                X_test[i][j]['jointFeatures'].append('Brand+Upper')
            if X_test[i][j - 1]['word.istitle()'] == True and (X_test[i][j]['word.onList'] == True or X_test[i][j]['word.onListAbb'] == True):
                X_test[i][j]['jointFeatures'].append('Title+Brand')
            if X_test[i][j]['word.istitle()'] == True and (X_test[i][j - 1]['word.onList'] == True or X_test[i][j - 1]['word.onListAbb'] == True):
                X_test[i][j]['jointFeatures'].append('Brand+Title')
            if X_test[i][j - 1]['word.isupper()'] == True and X_test[i][j]['word.onThingList'] == True:
                X_test[i][j]['jointFeatures'].append('Upper+Thing')
            if X_test[i][j]['word.isupper()'] == True and X_test[i][j - 1]['word.onThingList'] == True:
                X_test[i][j]['jointFeatures'].append('Thing+Upper')
            if X_test[i][j - 1]['word.istitle()'] == True and X_test[i][j]['word.onThingList'] == True:
                X_test[i][j]['jointFeatures'].append('Title+Thing')
            if X_test[i][j]['word.istitle()'] == True and X_test[i][j - 1]['word.onThingList'] == True:
                X_test[i][j]['jointFeatures'].append('Thing+Title')
            '''
        if j >= 0:
            if j < (len(X_test[i]) - 1):
                #if (X_test[i][j + 1]['word.onList'] == True or X_test[i][j + 1]['word.onListAbb'] == True):
                #    X_test[i][j]['+1:neigbourFeatures'] = 'Brand'
                if X_test[i][j + 1]['word.onThingList'] == True:
                    X_test[i][j]['+1:neigbourFeatures'] = 'Thing'
                elif X_test[i][j + 1]['word.indicateList'] == True:
                    X_test[i][j]['+1:neigbourFeatures'] = 'Indicate'
                elif X_test[i][j + 1]['word.onCommonList'] == True:
                    X_test[i][j]['+1:neigbourFeatures'] = 'Common'
            if j < (len(X_test[i]) - 2):
                #if (X_test[i][j + 2]['word.onList'] == True or X_test[i][j + 2]['word.onListAbb'] == True):
                #    X_test[i][j]['+2:neigbourFeatures'] = 'Brand'
                if X_test[i][j + 2]['word.onThingList'] == True:
                    X_test[i][j]['+2:neigbourFeatures'] = 'Thing'
                elif X_test[i][j + 2]['word.indicateList'] == True:
                    X_test[i][j]['+2:neigbourFeatures'] = 'Indicate'
                elif X_test[i][j + 2]['word.onCommonList'] == True:
                    X_test[i][j]['+2:neigbourFeatures'] = 'Common'
        if (j - 1) >= 0:
            #if (X_test[i][j - 1]['word.onList'] == True or X_test[i][j - 1]['word.onListAbb'] == True):
            #    X_test[i][j]['-1:neigbourFeatures'] = 'Brand'
            if X_test[i][j - 1]['word.onThingList'] == True:
                X_test[i][j]['-1:neigbourFeatures'] = 'Thing'
            elif X_test[i][j - 1]['word.indicateList'] == True:
                X_test[i][j]['-1:neigbourFeatures'] = 'Indicate'
            elif X_test[i][j - 1]['word.onCommonList'] == True:
                X_test[i][j]['-1:neigbourFeatures'] = 'Common'
                
        if (j - 2) >= 0:
            #if (X_test[i][j - 2]['word.onList'] == True or X_test[i][j - 2]['word.onListAbb'] == True):
            #    X_test[i][j]['-2:neigbourFeatures'] = 'Brand'
            if X_test[i][j - 2]['word.onThingList'] == True:
                X_test[i][j]['-2:neigbourFeatures'] = 'Thing'
            elif X_test[i][j - 2]['word.indicateList'] == True:
                X_test[i][j]['-2:neigbourFeatures'] = 'Indicate'
            elif X_test[i][j - 2]['word.onCommonList'] == True:
                X_test[i][j]['-2:neigbourFeatures'] = 'Common'

In [None]:
#extract features from unlabeled data
X_unlabeled = [sent2features(s) for s in unlabeled]
json.dump(X_unlabeled, open("X_unlabeled.txt",'w'))

In [4]:
#extract newly added features
#read & extract from og (original from first experiment, cuz the extracted features are from there too) file
#unlabeled = json.load(open("unlabeled_tokenized_og.txt"))
#X_unlabeled = [sent2features(s) for s in unlabeled]
#re-add the brand list feature
'''
unlabeled = json.load(open("var/1/unlabeled_tokenized.txt"))
X_unlabeled = [sent2features(s) for s in unlabeled]
X_unlabeled_read = json.load(open("var/1/X_unlabeled.txt"))
for i in range(len(X_unlabeled)):
    for j in range(len(X_unlabeled[i])):
        X_unlabeled[i][j]['word.onList'] = X_unlabeled_read[i][j]['word.onList']
        X_unlabeled[i][j]['word.onListAbb'] = X_unlabeled_read[i][j]['word.onListAbb']
        X_unlabeled[i][j]['word.onThingList'] = X_unlabeled_read[i][j]['word.onThingList']
        X_unlabeled[i][j]['word.onCommonList'] = X_unlabeled_read[i][j]['word.onCommonList']
'''
#add joint features & neighbour's features
for i in range(len(X_unlabeled)):
    for j in range(len(X_unlabeled[i])):
        if j > 0:
            X_unlabeled[i][j]['jointFeatures'] = []
            if X_unlabeled[i][j - 1]['word.onThingList'] == True and (X_unlabeled[i][j]['word.onList'] == True or X_unlabeled[i][j]['word.onListAbb'] == True):
                X_unlabeled[i][j]['jointFeatures'].append('Thing+Brand')
            if X_unlabeled[i][j]['word.onThingList'] == True and (X_unlabeled[i][j - 1]['word.onList'] == True or X_unlabeled[i][j - 1]['word.onListAbb'] == True):
                X_unlabeled[i][j]['jointFeatures'].append('Brand+Thing')
            if X_unlabeled[i][j - 1]['word.onCommonList'] == True and (X_unlabeled[i][j]['word.onList'] == True or X_unlabeled[i][j]['word.onListAbb'] == True):
                X_unlabeled[i][j]['jointFeatures'].append('Common+Brand')
            if X_unlabeled[i][j]['word.onCommonList'] == True and (X_unlabeled[i][j - 1]['word.onList'] == True or X_unlabeled[i][j - 1]['word.onListAbb'] == True):
                X_unlabeled[i][j]['jointFeatures'].append('Brand+Common')
            if X_unlabeled[i][j - 1]['word.indicateList'] == True and (X_unlabeled[i][j]['word.onList'] == True or X_unlabeled[i][j]['word.onListAbb'] == True):
                X_unlabeled[i][j]['jointFeatures'].append('Indication+Brand')
            if X_unlabeled[i][j]['word.indicateList'] == True and (X_unlabeled[i][j - 1]['word.onList'] == True or X_unlabeled[i][j - 1]['word.onListAbb'] == True):
                X_unlabeled[i][j]['jointFeatures'].append('Brand+Indication')
            if X_unlabeled[i][j - 1]['word.onThingList'] == True and X_unlabeled[i][j]['word.indicateList'] == True:
                X_unlabeled[i][j]['jointFeatures'].append('Thing+Indication')
            if X_unlabeled[i][j]['word.onThingList'] == True and X_unlabeled[i][j - 1]['word.indicateList'] == True:
                X_unlabeled[i][j]['jointFeatures'].append('Indication+Thing')
            if X_unlabeled[i][j - 1]['word.onThingList'] == True and X_unlabeled[i][j]['word.onCommonList'] == True:
                X_unlabeled[i][j]['jointFeatures'].append('Thing+Common')
            if X_unlabeled[i][j]['word.onThingList'] == True and X_unlabeled[i][j - 1]['word.onCommonList'] == True:
                X_unlabeled[i][j]['jointFeatures'].append('Common+Thing')
            if X_unlabeled[i][j - 1]['word.indicateList'] == True and X_unlabeled[i][j]['word.onCommonList'] == True:
                X_unlabeled[i][j]['jointFeatures'].append('Indication+Common')
            if X_unlabeled[i][j]['word.indicateList'] == True and X_unlabeled[i][j - 1]['word.onCommonList'] == True:
                X_unlabeled[i][j]['jointFeatures'].append('Common+Indication')    
        if j >= 0:
            if j < (len(X_unlabeled[i]) - 1):
                if (X_unlabeled[i][j + 1]['word.onList'] == True or X_unlabeled[i][j + 1]['word.onListAbb'] == True):
                    X_unlabeled[i][j]['+1:neigbourFeatures'] = 'Brand'
                elif X_unlabeled[i][j + 1]['word.onThingList'] == True:
                    X_unlabeled[i][j]['+1:neigbourFeatures'] = 'Thing'
                elif X_unlabeled[i][j + 1]['word.indicateList'] == True:
                    X_unlabeled[i][j]['+1:neigbourFeatures'] = 'Indicate'
                elif X_unlabeled[i][j + 1]['word.onCommonList'] == True:
                    X_unlabeled[i][j]['+1:neigbourFeatures'] = 'Common'
            if j < (len(X_unlabeled[i]) - 2):
                if (X_unlabeled[i][j + 2]['word.onList'] == True or X_unlabeled[i][j + 2]['word.onListAbb'] == True):
                    X_unlabeled[i][j]['+2:neigbourFeatures'] = 'Brand'
                elif X_unlabeled[i][j + 2]['word.onThingList'] == True:
                    X_unlabeled[i][j]['+2:neigbourFeatures'] = 'Thing'
                elif X_unlabeled[i][j + 2]['word.indicateList'] == True:
                    X_unlabeled[i][j]['+2:neigbourFeatures'] = 'Indicate'
                elif X_unlabeled[i][j + 2]['word.onCommonList'] == True:
                    X_unlabeled[i][j]['+2:neigbourFeatures'] = 'Common'
        if (j - 1) >= 0:
            if (X_unlabeled[i][j - 1]['word.onList'] == True or X_unlabeled[i][j - 1]['word.onListAbb'] == True):
                X_unlabeled[i][j]['-1:neigbourFeatures'] = 'Brand'
            elif X_unlabeled[i][j - 1]['word.onThingList'] == True:
                X_unlabeled[i][j]['-1:neigbourFeatures'] = 'Thing'
            elif X_unlabeled[i][j - 1]['word.indicateList'] == True:
                X_unlabeled[i][j]['-1:neigbourFeatures'] = 'Indicate'
            elif X_unlabeled[i][j - 1]['word.onCommonList'] == True:
                X_unlabeled[i][j]['-1:neigbourFeatures'] = 'Common'
                
        if (j - 2) >= 0:
            if (X_unlabeled[i][j - 2]['word.onList'] == True or X_unlabeled[i][j - 2]['word.onListAbb'] == True):
                X_unlabeled[i][j]['-2:neigbourFeatures'] = 'Brand'
            elif X_unlabeled[i][j - 2]['word.onThingList'] == True:
                X_unlabeled[i][j]['-2:neigbourFeatures'] = 'Thing'
            elif X_unlabeled[i][j - 2]['word.indicateList'] == True:
                X_unlabeled[i][j]['-2:neigbourFeatures'] = 'Indicate'
            elif X_unlabeled[i][j - 2]['word.onCommonList'] == True:
                X_unlabeled[i][j]['-2:neigbourFeatures'] = 'Common'
#json.dump(X_unlabeled, open("X_unlabeled.txt",'w'))

In [5]:
#save extracted feature and label to a file to avoid re-extracting the same thing
#json.dump(X_train, open("X_train.txt",'w'))
#json.dump(y_train, open("y_train.txt",'w'))
#json.dump(X_test, open("X_test.txt",'w'))
#json.dump(y_test, open("y_test.txt",'w'))
json.dump(X_unlabeled, open("X_unlabeled_new.txt",'w'))

In [None]:
#find optimum param for training
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
#print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [17]:
#create crf object and train it with data train
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,#0.23816962106578055,#0.1
    c2=0.1,#0.0104234047496269,#0.1
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [None]:
#per label evaluation
y_pred = crf.predict(X_test)
labels = list(crf.classes_)
labels.remove('O') # remove 'O' label from evaluation
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0])) # group B and I results
print(flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

In [56]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(1000))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
5.445782 O        BOS
5.128899 B-PRO|B-BRA BOS
4.200890 B-PRO    BOS
4.008197 B-BRA    BOS
3.689991 B-PRO|B-TYP BOS
3.592484 B-PRO    word.lower():udpp
3.527133 O        word.lower():nude
3.485149 B-PRO    word.lower():lipbalmnya
3.485149 B-PRO    -1:word.lower():thebathbox
3.398500 B-BRA    word.lower():nizoral
3.212500 O        word.lower():itu
3.206125 I-PRO|B-TYP -1:word.lower():loreal
3.136639 I-PRO|B-BRA -1:word.lower():nya
2.986771 B-PRO    word.onThingList
2.951717 O        word[-2:]:ah
2.853221 B-TYP    word.lower():regenerist
2.842527 B-BRA    word.lower():etude
2.791501 I-PRO|I-BRA -1:word.lower():la
2.790271 B-BRA    word.onListAbb
2.787301 I-PRO|B-BRA word.onListAbb
2.756773 O        word.lower():sample
2.747935 I-PRO    word.lower():nya
2.747802 B-BRA    word.lower():phytomer
2.743901 B-TYP    word.lower():anr
2.695201 B-BRA    word.lower():bodyshop
2.683184 O        word[-2:]:ga
2.676737 I-PRO|B-BRA word.onList
2.667544 O        word.lower():nya
2.660852 B-

1.218488 O        word.lower():so
1.216293 O        word[-3:]:I
1.216293 O        word[-2:]:I
1.215961 B-TYP    word[-2:]:ac
1.215730 I-PRO|B-BRA word.lower():gnc
1.215730 I-PRO|B-BRA word[-3:]:GNC
1.215730 I-PRO|B-BRA word[-2:]:NC
1.215205 I-PRO    -1:word.lower():bright
1.214922 I-PRO|B-BRA +1:word.lower():gw
1.214306 B-TYP    +1:word.lower():n
1.214023 I-PRO|I-TYP word[-2:]:nt
1.213996 I-PRO    -1:word.lower():ttdo
1.213198 I-PRO|B-TYP word[-3:]:sic
1.212986 B-TYP    +1:word.lower():adalah
1.212637 I-BRA    word.lower():mitchell
1.210829 I-PRO    -1:word.lower():mua
1.210475 O        word[-3:]:yan
1.210121 I-PRO    word.lower():perfeume
1.209402 B-PRO|B-BRA -1:word.lower():...
1.209245 I-PRO    word[-3:]:non

Top negative:
-1.764110 O        -1:word.lower():bwt
-1.822468 O        +1:word.lower():(
-1.825160 O        +1:word.lower():cuman
-1.855468 B-BRA    +1:neigbourFeatures:Thing
-1.914137 B-BRA    word.onThingList
-1.915545 O        word.isupper()
-1.923959 O        +1:word.lower

In [16]:
def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
B-PRO  -> I-PRO|B-BRA 4.209740
I-TYP  -> I-TYP   4.055190
I-PRO|B-TYP -> I-PRO|I-TYP 3.830596
I-PRO  -> I-PRO|B-BRA 3.791523
I-PRO|I-TYP -> I-PRO|I-TYP 3.747679
B-TYP  -> I-TYP   3.698041
B-PRO|B-TYP -> I-PRO|B-BRA 3.417400
B-PRO|B-TYP -> I-PRO|I-TYP 3.386265
I-PRO  -> I-PRO   3.076848
I-PRO|B-BRA -> I-PRO|I-BRA 2.888759
B-PRO|B-BRA -> I-PRO|B-TYP 2.807363
I-PRO|I-BRA -> I-PRO|I-BRA 2.775058
O      -> O       2.569619
I-PRO|I-TYP -> I-PRO|B-BRA 2.519317
B-PRO  -> I-PRO   1.979509
I-PRO|I-BRA -> I-PRO|B-TYP 1.932849
B-PRO|B-BRA -> I-PRO|I-BRA 1.802191
O      -> B-PRO|B-BRA 1.549179
O      -> B-TYP   1.340997
O      -> B-BRA   1.300090

Top unlikely transitions:
I-PRO  -> I-BRA   -2.481249
I-TYP  -> I-BRA   -2.610049
B-PRO  -> B-BRA   -2.643754
I-PRO|I-TYP -> I-BRA   -2.673295
I-BRA  -> I-TYP   -2.751847
I-BRA  -> I-PRO   -2.786995
O      -> I-TYP   -2.799622
I-PRO|I-BRA -> I-TYP   -2.954213
I-PRO|I-TYP -> I-TYP   -2.960141
B-PRO|B-BRA -> I-PRO|I-TYP -2.971458
I-B

In [None]:
'''
for i in range(len(y_test)):
    print(y_test[i])
    print(y_pred[i])
    print(sequence_accuracy_score(y_test[i], y_pred[i]))
'''
#the semi-supervised/automatic labelling
#labels all unlabeled data using confidence score
#first, label everything and then filter the data based on confidence score limit (limit changes when there's no data found)
#then, remove the filtered data from unlabeled data and then add it to train data and then re-train the model
#do it until all data is labeled
'''
for z in range(len(X_unlabeled)):
    a = crf.predict_marginals_single(X_unlabeled[z])
    b = crf.predict_single(X_unlabeled[z])
    total = 0
    size = len(a)
    for i in range(size):
        total += a[i][b[i]]
    confidence = total / size
    if confidence > 0.75 and confidence < 0.85:
        for i in range(len(b)):
            print(unlabeled[z][i]+' '+b[i]+' ; ', end='')
        print('\n',end='')
'''
X_train_sup = deepcopy(X_train)
y_train_sup = deepcopy(y_train)
X_unlabeled_sup = deepcopy(X_unlabeled)
unlabeled_sup = deepcopy(unlabeled)
upper = 1
lower = 0.99
w = 0
label_for_unlabeled = []
while(X_unlabeled_sup):#or w < 1 (default: X_unlabeled_sup)
    print(str(w) + ' - data left:' + str(len(X_unlabeled_sup)) + ' - lower:' + str(lower) + ' - upper:' + str(upper))
    found = False
    noNew = True
    num = []
    w += 1
    for z in range(len(X_unlabeled_sup)): #X_test/X_unlabeled
        a = crf.predict_marginals_single(X_unlabeled_sup[z])
        b = crf.predict_single(X_unlabeled_sup[z])
        #b = y_pred[z]
        total = 0
        size = len(a)
        #print(unlabeled_sup[z])
        for i in range(size):
            total += a[i][b[i]]
        confidence = total / size
        if confidence < upper and confidence > lower:
            found = True
            row = []
            row.append(unlabeled_sup[z])
            row.append(b)
            label_for_unlabeled.append(row)
            fullO = True
            for qq in b:
                if qq != 'O':
                    fullO = False
                    break
            if fullO == False:
                X_train_sup.append(X_unlabeled_sup[z])
                y_train_sup.append(b)
                noNew = False
            num.append(z)
    if found == True:
        num.sort(reverse=True)
        for i in num:
            X_unlabeled_sup.pop(i)
            unlabeled_sup.pop(i)
        if noNew == False:
            crf.fit(X_train_sup, y_train_sup)
    else:
        lower -= 0.01
        #upper += 0.005
y_pred = crf.predict(X_test)
labels = list(crf.classes_)
labels.remove('O') # remove 'O' label from evaluation
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0])) # group B and I results
print(flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))
#print(label_for_unlabeled)


In [18]:
X_train_sup = deepcopy(X_train)
y_train_sup = deepcopy(y_train)
X_unlabeled_sup = deepcopy(X_unlabeled)
unlabeled_sup = deepcopy(unlabeled)

In [19]:
def splitSequence(sequence):
    res = []
    row = []
    for i in range(len(sequence)):
        if i == 0:
            row.append(sequence[i])
        else:
            if sequence[i - 1] != sequence[i] - 1:
                res.append(row)
                row = []
                row.append(sequence[i])
            else:
                row.append(sequence[i])
    if row:
        res.append(row)
    return res

def extractSequenceLabel(sentence, labels, conf, limit):
    allLabel = []
    allSequence = []
    sequence = []
    label = []
    c = 0
    s = 0
    found = False
    for j in range(len(sentence)):
        if labels[j] == 'O':
            if label:
                #if (c/s) >= limit:
                if found == False:
                    allLabel.append(label)
                    allSequence.append(sequence)
                found = False
                c = 0
                s = 0
                label = []
                sequence = []
        elif labels[j][0] == 'B':
            if label:
                #if (c/s) >= limit:
                if found == False:
                    allLabel.append(label)
                    allSequence.append(sequence)
                found = False
                c = 0
                s = 0
                label = []
                sequence = []
            label.append(labels[j])
            sequence.append(sentence[j])
            if conf[j][labels[j]] < limit:
                found = True
            c += conf[j][labels[j]]
            s += 1
        elif labels[j][0] == 'I':
            label.append(labels[j])
            sequence.append(sentence[j])
            if conf[j][labels[j]] < limit:
                found = True
            c += conf[j][labels[j]]
            s += 1
    if label:
        #if (c/s) >= limit:
        if found == False:
            allLabel.append(label)
            allSequence.append(sequence)
        c = 0
        s = 0
    return [allLabel, allSequence]
'''
ii = 10332
a = crf.predict_marginals_single(X_unlabeled_sup[ii])
b = crf.predict_single(X_unlabeled_sup[ii])
pp = extractSequenceLabel(unlabeled_sup[ii],b,a,0.7)
print(unlabeled_sup[ii])
print(b)
print(pp)
'''

def replaceLabel(sequence, labels, b, conf):
    found = False
    for c in range(len(b[1])):
        N = [i for i in range(len(sequence)) if sequence[i] in b[1][c]]
        if N:
            Nn = splitSequence(N)
            for s in Nn:
                L = []
                for i in s:
                    L.append(b[1][c].index(sequence[i]))
                notProOnly = False
                proExist = False
                for i in L:
                    if 'PRO' in b[0][c][i]:
                        proExist = True
                        break
                for i in L:
                    if ('PRO' in b[0][c][i] and '|' in b[0][c][i]) or ('BRA' in b[0][c][i]) or ('TYP' in b[0][c][i]):
                        notProOnly = True
                        break
                if ((proExist and notProOnly) and (proExist and len(s) > 1)) or (not proExist and notProOnly):
                    low = False
                    tot = 0
                    clear = True
                    for d in range(len(s)):
                        if conf[s[d]] > 0.8:
                            clear = False
                            #break
                        tot += conf[s[d]]
                    avg = tot / len(s)
                    if avg <= 0.8:
                    #if clear:
                        low = True
                    if low:
                        for d in range(len(s)):
                            if len(sequence[s[d]]) > 0:
                                newLabel = b[0][c][L[d]]
                                if d < len(s):
                                    if (d == 0) or ((d - 1) > 0 and labels[s[d] - 1] == 'O') or newLabel[:5] == 'B-PRO':
                                        newLabel = newLabel[:0] + 'B' + newLabel[1:]
                                        if '|' in newLabel:
                                            if newLabel[6:7] == 'I':
                                                newLabel = newLabel[:6] + 'B' + newLabel[7:]
                                    if (d == 0) or ((d - 1) > 0 and (labels[s[d] - 1][0] == 'I' or labels[s[d] - 1][0] == 'B')) and newLabel[0] == 'B':
                                        if labels[s[d] - 1][2:5] == newLabel[2:5]:
                                            newLabel = newLabel[:0] + 'I' + newLabel[1:]
                                    if len(newLabel) > 5 and ((d - 1) > 0 and len(labels[s[d] - 1]) > 5):
                                        if (d == 0) or ((d - 1) > 0 and (labels[s[d] - 1][6] == 'I' or labels[s[d] - 1][6] == 'B')) and newLabel[6] == 'B':
                                            if labels[s[d] - 1][8:11] == newLabel[8:11]:
                                                newLabel = newLabel[:6] + 'I' + newLabel[7:]
                                    if (d < (len(s) - 1) and (labels[s[d] + 1][0] == 'B')) and newLabel[0] == 'B':
                                        if labels[s[d] + 1][2:5] == newLabel[2:5]:
                                            labels[s[d] + 1] = labels[s[d] + 1][:0] + 'I' + labels[s[d] + 1][1:]
                                    if len(newLabel) > 5 and (d < (len(s) - 1) and len(labels[s[d] + 1]) > 5):
                                        if (d < (len(s) - 1) and (labels[s[d] + 1][6] == 'B')) and newLabel[6] == 'B':
                                            if labels[s[d]+ 1][8:11] == newLabel[8:11]:
                                                labels[s[d] + 1] = labels[s[d] + 1][:6] + 'I' + labels[s[d] + 1][7:]
                                if 'PRO' in labels[s[d]] and 'PRO' not in newLabel:
                                    labels[s[d]] = labels[s[d]]
                                else:
                                    labels[s[d]] = newLabel
                                found = True
                    
            '''
            L = []
            for i in N:
                L.append(b[1][c].index(sequence[i]))
            for d in range(len(N)):
                newLabel = b[0][c][L[d]]
                if d < len(N) - 1:
                    if (d == 0) or ((N[d] + 1) != N[d + 1]) or ((N[d] - 1) > 0 and labels[N[d] - 1] == 'O') or newLabel[:5] == 'B-PRO':
                        newLabel = newLabel[:0] + 'B' + newLabel[1:]
                        if '|' in newLabel:
                            if newLabel[6:7] == 'I':
                                newLabel = newLabel[:6] + 'B' + newLabel[7:]
                labels[N[d]] = newLabel
            '''
        else:
            return False
    if not found:
        return False
    else:
        return labels

#ll = replaceLabel(unlabeled_sup[ii],b,pp)
#print(ll)
#print(replaceLabel(['a','b','c','d','e','f'],['O','O','O','O','O','O'],[[['I-BRA'],['I-TYP'],['B-PRO|B-BRA','I-PRO|B-TYP','I-PRO|I-TYP']],[['b'],['c'],['d','e','f']]],[0,0,0,0,0,0]))


In [15]:
def filterPro(labels):
    allLabel = []
    allLabelIndex = []
    label = []
    labelIndex = []
    for j in range(len(labels)):
        if labels[j] == 'O':
            if label:
                allLabel.append(label)
                allLabelIndex.append(labelIndex)
                label = []
                labelIndex = []
        elif labels[j][0] == 'B':
            if label:
                allLabel.append(label)
                allLabelIndex.append(labelIndex)
                label = []
                labelIndex = []
            label.append(labels[j])
            labelIndex.append(j)
        elif labels[j][0] == 'I':
            label.append(labels[j])
            labelIndex.append(j)
    if label:
        allLabel.append(label)
        allLabelIndex.append(labelIndex)
    for i in range(len(allLabel)):
        pizdec = True
        lb = ''
        for j in allLabel[i]:
            if '|' not in j:
                pizdec = False
                break
            else:
                if lb == '':
                    lb = j[8:11]
                else:
                    if j[8:11] != lb:
                        pizdec = False
                        break
        if pizdec:
            for j in range(len(allLabel[i])):
                labels[allLabelIndex[i][j]] = allLabel[i][j][6:11]
    return labels
#aa = ['O','B-PRO|B-BRA','I-PRO|I-BRA','O','B-BRA','O']
#print(aa)
#aa = filterPro(aa)
#print(aa)

In [15]:
def ruleReplace(feature, label):
    for i in range(len(feature)):
        if feature[i]['word.onList'] == True or feature[i]['word.onListAbb'] == True:
            if 'BRA' not in label[i]:
                if label[i] == 'O' or 'TYP' in label[i]:
                    if 'BRA' in label[i - 1]:
                        label[i] = 'I-BRA'
                    else:
                        label[i] = 'B-BRA'
                elif 'PRO' in label[i]:
                    if 'BRA' in label[i - 1]:
                        label[i] = label[i][:6] + 'I-BRA'
                    else:
                        label[i] = label[i][:6] + 'B-BRA'
    return label

In [134]:
#per beginning and inner sequence, don't really care what entity inside those, especially if nested
highLimit = 0.95
w = 0
label_for_unlabeled = []
while(X_unlabeled_sup):#or w < 1 (default: X_unlabeled_sup)
    print(str(w) + ' - data left:' + str(len(X_unlabeled_sup)))
    found = False
    noNew = True
    num = []
    labelAll = []
    labelConfAll = []
    w += 1
    highAll = []
    highIndex = []
    #Xall = []
    #newBias = 1.1**(-w)
    for z in range(len(X_unlabeled_sup)): #X_test/X_unlabeled/
        a = crf.predict_marginals_single(X_unlabeled_sup[z])
        b = crf.predict_single(X_unlabeled_sup[z])
        #Xall.append(X_unlabeled_sup[z])
        labelAll.append(b)
        conf = []
        for i in range(len(a)):
            conf.append(a[i][b[i]])
        labelConfAll.append(conf)
        high = extractSequenceLabel(unlabeled_sup[z],b,a,highLimit)
        if high[0]:
            #X_train_sup.append(X_unlabeled_sup[z])
            #y_train_sup.append(labelAll[z])
            highAll.append(high)
            highIndex.append(z)
            row = []
            row.append(unlabeled_sup[z])
            row.append(b)
            label_for_unlabeled.append(row)
            num.append(z)
    #print('before replace: '+str(len(num)))
    print('# of Hi Conf: ' + str(len(highAll)))
    if num and len(highAll) >= 50:
        added = False
        if highAll:
            for a in range(len(unlabeled_sup)):
                changed = False
                if a not in highIndex:
                    for qq in range(len(highAll)):
                        if a != highIndex[qq]:
                            labelNew = replaceLabel(unlabeled_sup[a],labelAll[a],highAll[qq],labelConfAll[a])
                            if labelNew:
                                #print(labelAll[a])
                                labelAll[a] = labelNew
                                #print(labelAll[a])
                                changed = True
                                break
                #labelReplace = ruleReplace(Xall[a],labelAll[a])
                #if labelReplace != labelAll[a]:
                #    labelAll[a] = labelReplace
                #    changed = True
                if changed:
                    labelAll[a] = filterPro(labelAll[a])
                    #for l in range(len(X_unlabeled_sup[a])):
                    #    if newBias <= 0:
                    #        X_unlabeled_sup[a][l]['bias'] = 0
                    #    else:
                    #        X_unlabeled_sup[a][l]['bias'] = newBias
                    X_train_sup.append(X_unlabeled_sup[a])
                    y_train_sup.append(labelAll[a])
                    if a not in num:
                        row = []
                        row.append(unlabeled_sup[a])
                        row.append(labelAll[a])
                        label_for_unlabeled.append(row)
                        num.append(a)
                    added = True
        #print('after replace: '+str(len(num)))
        if added:
            crf.fit(X_train_sup, y_train_sup)
        num.sort(reverse=True) #num
        for i in num: #num
            X_unlabeled_sup.pop(i)
            unlabeled_sup.pop(i)
        y_pred = crf.predict(X_test)
        evaluatione(y_pred)
    else:
        num = []
        for a in range(len(unlabeled_sup)):
            row = []
            row.append(unlabeled_sup[a])
            row.append(labelAll[a])
            label_for_unlabeled.append(row)
            num.append(a)
        num.sort(reverse=True)
        for i in num:
            X_unlabeled_sup.pop(i)
            unlabeled_sup.pop(i)
y_pred = crf.predict(X_test)
labels = list(crf.classes_)
labels.remove('O') # remove 'O' label from evaluation
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0])) # group B and I results
print(flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))
#print(label_for_unlabeled)

0 - data left:16853
# of Hi Conf: 1566
PRO
Precision : 0.7634854771784232
Recall : 0.6237288135593221
F1 : 0.6865671641791046
BRA
Precision : 0.9163090128755365
Recall : 0.8767967145790554
F1 : 0.8961175236096537
TYP
Precision : 0.4896551724137931
Recall : 0.35323383084577115
F1 : 0.41040462427745666
O
Precision : 0.9519382288055468
Recall : 0.9890307793058284
F1 : 0.9701300786895777
Overall Without O
Precision : 0.7832058949066059
Recall : 0.6937945066124109
F1 : 0.7339144220859674
Overall With O
Precision : 0.9285474680915912
Recall : 0.9481032294457763
F1 : 0.9373843460085244
1 - data left:12714
# of Hi Conf: 1140
PRO
Precision : 0.7666666666666667
Recall : 0.6237288135593221
F1 : 0.6878504672897197
BRA
Precision : 0.9102564102564102
Recall : 0.8747433264887063
F1 : 0.8921465968586387
TYP
Precision : 0.4866666666666667
Recall : 0.36318407960199006
F1 : 0.41595441595441596
O
Precision : 0.9535104364326376
Recall : 0.9872298624754421
F1 : 0.9700772200772202
Overall Without O
Precision

In [20]:
#per beginning and inner sequence, don't really care what entity inside those, especially if nested
highLimit = 0.99
w = 0
label_for_unlabeled = []
while(X_unlabeled_sup):#or w < 1 (default: X_unlabeled_sup)
    print(str(w) + ' - data left:' + str(len(X_unlabeled_sup)))
    found = False
    noNew = True
    num = []
    labelAll = []
    labelConfAll = []
    w += 1
    highAll = []
    highIndex = []
    #Xall = []
    #newBias = 1.1**(-w)
    for z in range(len(X_unlabeled_sup)): #X_test/X_unlabeled/
        a = crf.predict_marginals_single(X_unlabeled_sup[z])
        b = crf.predict_single(X_unlabeled_sup[z])
        #Xall.append(X_unlabeled_sup[z])
        labelAll.append(b)
        conf = []
        for i in range(len(a)):
            conf.append(a[i][b[i]])
        labelConfAll.append(conf)
        high = extractSequenceLabel(unlabeled_sup[z],b,a,highLimit)
        if high[0]:
            #X_train_sup.append(X_unlabeled_sup[z])
            #y_train_sup.append(labelAll[z])
            highAll.append(high)
            highIndex.append(z)
            row = []
            row.append(unlabeled_sup[z])
            row.append(b)
            label_for_unlabeled.append(row)
            num.append(z)
    #print('before replace: '+str(len(num)))
    print('# of Hi Conf: ' + str(len(highAll)))
    if num and len(highAll) >= 50:
        allSequence = []
        for i in highAll:
            allSequenceIndexRow = []
            for j in range(len(i[1])):
                allSequence.append(' '.join(i[1][j]).lower())
        counter = Counter(allSequence)
        newHighAll = []
        for i in highAll:
            newHighAllRow = []
            labelRow = []
            sequenceRow = []
            for j in range(len(i[1])):
                check = ' '.join(i[1][j]).lower()
                if counter[check] >= 2:
                    labelRow.append(i[0][j])
                    sequenceRow.append(i[1][j])
                else:
                    for k in i[1][j]:
                        if counter[k.lower()] >= 2:
                            labelRow.append(i[0][j])
                            sequenceRow.append(i[1][j])
            newHighAllRow.append(labelRow)
            newHighAllRow.append(sequenceRow)
            newHighAll.append(newHighAllRow)
        added = False
        if newHighAll:
            for a in range(len(unlabeled_sup)):
                changed = False
                if a not in highIndex:
                    for qq in range(len(newHighAll)):
                        if a != highIndex[qq]:
                            labelNew = replaceLabel(unlabeled_sup[a],labelAll[a],newHighAll[qq],labelConfAll[a])
                            if labelNew:
                                #print(labelAll[a])
                                labelAll[a] = labelNew
                                #print(labelAll[a])
                                changed = True
                                break
                #labelReplace = ruleReplace(Xall[a],labelAll[a])
                #if labelReplace != labelAll[a]:
                #    labelAll[a] = labelReplace
                #    changed = True
                if changed:
                    labelAll[a] = filterPro(labelAll[a])
                    #for l in range(len(X_unlabeled_sup[a])):
                    #    if newBias <= 0:
                    #        X_unlabeled_sup[a][l]['bias'] = 0
                    #    else:
                    #        X_unlabeled_sup[a][l]['bias'] = newBias
                    X_train_sup.append(X_unlabeled_sup[a])
                    y_train_sup.append(labelAll[a])
                    if a not in num:
                        row = []
                        row.append(unlabeled_sup[a])
                        row.append(labelAll[a])
                        label_for_unlabeled.append(row)
                        num.append(a)
                    added = True
        #print('after replace: '+str(len(num)))
        if added:
            crf.fit(X_train_sup, y_train_sup)
        else:
            num = []
            for a in range(len(unlabeled_sup)):
                row = []
                row.append(unlabeled_sup[a])
                row.append(labelAll[a])
                label_for_unlabeled.append(row)
                num.append(a)
        num.sort(reverse=True) #num
        for i in num: #num
            X_unlabeled_sup.pop(i)
            unlabeled_sup.pop(i)
        y_pred = crf.predict(X_test)
        evaluatione(y_pred)
    else:
        num = []
        for a in range(len(unlabeled_sup)):
            row = []
            row.append(unlabeled_sup[a])
            row.append(labelAll[a])
            label_for_unlabeled.append(row)
            num.append(a)
        num.sort(reverse=True)
        for i in num:
            X_unlabeled_sup.pop(i)
            unlabeled_sup.pop(i)
y_pred = crf.predict(X_test)
labels = list(crf.classes_)
labels.remove('O') # remove 'O' label from evaluation
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0])) # group B and I results
print(flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))
#print(label_for_unlabeled)

0 - data left:16853
# of Hi Conf: 308
PRO
Precision : 0.7735042735042735
Recall : 0.6135593220338983
F1 : 0.6843100189035918
BRA
Precision : 0.9173913043478261
Recall : 0.86652977412731
F1 : 0.8912354804646252
TYP
Precision : 0.5285714285714286
Recall : 0.3681592039800995
F1 : 0.4340175953079179
O
Precision : 0.9490004722178499
Recall : 0.9870661427635887
F1 : 0.9676590963807079
Overall Without O
Precision : 0.79470618824416
Recall : 0.688708036622584
F1 : 0.7356466645165042
Overall With O
Precision : 0.9276112067903873
Recall : 0.945705824284304
F1 : 0.9354960417307979
1 - data left:15711
# of Hi Conf: 462
PRO
Precision : 0.7854077253218884
Recall : 0.6203389830508474
F1 : 0.6931818181818182
BRA
Precision : 0.9112554112554112
Recall : 0.864476386036961
F1 : 0.8872497365648051
TYP
Precision : 0.5035460992907801
Recall : 0.35323383084577115
F1 : 0.415204678362573
O
Precision : 0.9494647355163728
Recall : 0.9873935821872953
F1 : 0.9680577849117173
Overall Without O
Precision : 0.79012149

In [86]:
allSequence = []
for i in highAll:
    allSequenceIndexRow = []
    for j in range(len(i[1])):
        allSequence.append(' '.join(i[1][j]).lower())
counter = Counter(allSequence)
newHighAll = []
for i in highAll:
    newHighAllRow = []
    labelRow = []
    sequenceRow = []
    for j in range(len(i[1])):
        check = ' '.join(i[1][j]).lower()
        if counter[check] >= 2:
            labelRow.append(i[0][j])
            sequenceRow.append(i[1][j])
        else:
            for k in i[1][j]:
                if counter[k.lower()] >= 2:
                    labelRow.append(i[0][j])
                    sequenceRow.append(i[1][j])
    newHighAllRow.append(labelRow)
    newHighAllRow.append(sequenceRow)
    newHighAll.append(newHighAllRow)
kj = 0
for i in range(len(highAll)):
    if highAll[i] != newHighAll[i]:
        #print(highAll[i])
        #print(newHighAll[i])
        kj += 1
print(kj)

142


In [None]:
#per label replace
upperHigh = 1
lowerHigh = 0.98
upperLow = 0.8
lowerLow = 0
w = 0
label_for_unlabeled = []
while(X_unlabeled_sup):#or w < 1 (default: X_unlabeled_sup)
    print(str(w) + ' - data left:' + str(len(X_unlabeled_sup)))
    found = False
    noNew = True
    num = []
    confidenceAll = []
    labelAll = []
    w += 1
    tokensHigh = []
    labelsHigh = []
    for z in range(len(X_unlabeled_sup)): #X_test/X_unlabeled
        a = crf.predict_marginals_single(X_unlabeled_sup[z])
        b = crf.predict_single(X_unlabeled_sup[z])
        labelAll.append(b)
        #b = y_pred[z]
        #total = 0
        size = len(a)
        cRow = []
        highFound = False
        for i in range(size):
            #total += a[i][b[i]]
            cRow.append(a[i][b[i]])
            if b[i] != 'O':
                if a[i][b[i]] >= lowerHigh and a[i][b[i]] <= upperHigh:
                    labelsHigh.append(b[i])
                    tokensHigh.append(unlabeled_sup[z][i])
                    highFound = True
        if highFound:
            row = []
            row.append(unlabeled_sup[z])
            row.append(b)
            label_for_unlabeled.append(row)
            num.append(z)
        #confidence = total / size
        confidenceAll.append(cRow)
    if num:
        added = False
        if tokensHigh:
            for a in range(len(unlabeled_sup)):
                lowConf = False
                for b in range(len(unlabeled_sup[a])):
                    if unlabeled_sup[a][b] in tokensHigh:
                        pos = tokensHigh.index(unlabeled_sup[a][b])
                        newLabel = labelsHigh[pos]
                        if (b == 0 and newLabel[0] == 'I') or (labelAll[a][b - 1][0] == 'O' and newLabel[0] == 'I'):
                            newLabel = newLabel[:0] + 'B' + newLabel[1:]
                        labelAll[a][b] = newLabel
                        if confidenceAll[a][b] >= lowerLow and confidenceAll[a][b] <= upperLow:
                            lowConf = True
                if lowConf:
                    X_train_sup.append(X_unlabeled_sup[a])
                    y_train_sup.append(labelAll[a])
                    row = []
                    row.append(unlabeled_sup[a])
                    row.append(labelAll[a])
                    label_for_unlabeled.append(row)
                    num.append(a)
                    added = True
        if added:
            crf.fit(X_train_sup, y_train_sup)
        num.sort(reverse=True)
        for i in num:
            X_unlabeled_sup.pop(i)
            unlabeled_sup.pop(i)
    else:
        for a in range(len(unlabeled_sup)):
            row = []
            row.append(unlabeled_sup[a])
            row.append(labelAll[a])
            label_for_unlabeled.append(row)
            num.append(a)
        num.sort(reverse=True)
        for i in num:
            X_unlabeled_sup.pop(i)
            unlabeled_sup.pop(i)
y_pred = crf.predict(X_test)
labels = list(crf.classes_)
labels.remove('O') # remove 'O' label from evaluation
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0])) # group B and I results
print(flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))
#print(label_for_unlabeled)

In [9]:
dataTest_labels = []
for sentence in dataTest:
    aa = [token[1] for token in sentence]
    dataTest_labels.append(aa)
    
dataTest_token = []
for sentence in dataTest:
    aa = [token[0] for token in sentence]
    dataTest_token.append(aa)

#print(dataTest_labels[0])
#print(dataTest_token[0])

In [10]:
#per entity evaluation
def evaluatione(y_pred):
    bra_data = []
    bra_guess = []
    typ_data = []
    typ_guess = []
    pro_data = []
    pro_guess = []
    o_data = []
    o_guess = []
    for i in range(len(dataTest)):
        bra_data_row = []
        bra_guess_row = []
        typ_data_row = []
        typ_guess_row = []
        pro_data_row = []
        pro_guess_row = []
        o_data_row = []
        o_guess_row = []
        sen_pro_data = ''
        sen_pro_guess = ''
        sen_typ_data = ''
        sen_typ_guess = ''
        sen_bra_data = ''
        sen_bra_guess = ''
        for j in range(len(dataTest[i])):        
            if dataTest_labels[i][j] == 'O':
                o_data_row.append(dataTest_token[i][j])
                if sen_pro_data:
                    pro_data_row.append(sen_pro_data)
                    sen_pro_data = ''
                elif sen_bra_data:
                    bra_data_row.append(sen_bra_data)
                    sen_bra_data = ''
                elif sen_typ_data:
                    typ_data_row.append(sen_typ_data)
                    sen_typ_data = ''
            else:
                if '|' in dataTest_labels[i][j]:
                    if sen_pro_data and dataTest_labels[i][j][0] == 'B':
                        pro_data_row.append(sen_pro_data)
                        sen_pro_data = ''
                        sen_pro_data += dataTest_token[i][j]
                    elif dataTest_labels[i][j][0] == 'B':
                        sen_pro_data += dataTest_token[i][j]
                    elif dataTest_labels[i][j][0] == 'I':
                        sen_pro_data += ' ' + dataTest_token[i][j]
                    if dataTest_labels[i][j][8:] == 'BRA':
                        if sen_bra_data and dataTest_labels[i][j][6:7] == 'B':
                            bra_data_row.append(sen_bra_data)
                            sen_bra_data = ''
                            sen_bra_data += dataTest_token[i][j]
                        elif dataTest_labels[i][j][6:7] == 'B':
                            sen_bra_data += dataTest_token[i][j]
                        elif dataTest_labels[i][j][6:7] == 'I':
                            sen_bra_data += ' ' + dataTest_token[i][j]
                    elif dataTest_labels[i][j][8:] == 'TYP':
                        if sen_typ_data and dataTest_labels[i][j][6:7] == 'B':
                            typ_data_row.append(sen_typ_data)
                            sen_typ_data = ''
                            sen_typ_data += dataTest_token[i][j]
                        elif dataTest_labels[i][j][6:7] == 'B':
                            sen_typ_data += dataTest_token[i][j]
                        elif dataTest_labels[i][j][6:7] == 'I':
                            sen_typ_data += ' ' + dataTest_token[i][j]
                elif dataTest_labels[i][j][2:5] == 'PRO':
                    if sen_pro_data and dataTest_labels[i][j][0] == 'B':
                        pro_data_row.append(sen_pro_data)
                        sen_pro_data = ''
                        sen_pro_data += dataTest_token[i][j]
                    elif dataTest_labels[i][j][0] == 'B':
                        sen_pro_data += dataTest_token[i][j]
                    elif dataTest_labels[i][j][0] == 'I':
                        sen_pro_data += ' ' + dataTest_token[i][j]
                elif dataTest_labels[i][j][2:5] == 'BRA':
                    if sen_bra_data and dataTest_labels[i][j][0] == 'B':
                        bra_data_row.append(sen_bra_data)
                        sen_bra_data = ''
                        sen_bra_data += dataTest_token[i][j]
                    elif dataTest_labels[i][j][0] == 'B':
                        sen_bra_data += dataTest_token[i][j]
                    elif dataTest_labels[i][j][0] == 'I':
                        sen_bra_data += ' ' + dataTest_token[i][j]
                elif dataTest_labels[i][j][2:5] == 'TYP':
                    if sen_typ_data and dataTest_labels[i][j][0] == 'B':
                        typ_data_row.append(sen_typ_data)
                        sen_typ_data = ''
                        sen_typ_data += dataTest_token[i][j]
                    elif dataTest_labels[i][j][0] == 'B':
                        sen_typ_data += dataTest_token[i][j]
                    elif dataTest_labels[i][j][0] == 'I':
                        sen_typ_data += ' ' + dataTest_token[i][j]
            if y_pred[i][j] == 'O':
                o_guess_row.append(dataTest_token[i][j])
                if sen_pro_guess:
                    pro_guess_row.append(sen_pro_guess)
                    sen_pro_guess = ''
                elif sen_bra_guess:
                    bra_guess_row.append(sen_bra_guess)
                    sen_bra_guess = ''
                elif sen_typ_guess:
                    typ_guess_row.append(sen_typ_guess)
                    sen_typ_guess = ''
            else:
                if '|' in y_pred[i][j]:
                    if sen_pro_guess and y_pred[i][j][0] == 'B':
                        pro_guess_row.append(sen_pro_guess)
                        sen_pro_guess = ''
                        sen_pro_guess += dataTest_token[i][j]
                    elif y_pred[i][j][0] == 'B':
                        sen_pro_guess += dataTest_token[i][j]
                    elif y_pred[i][j][0] == 'I':
                        sen_pro_guess += ' ' + dataTest_token[i][j]
                    if y_pred[i][j][8:] == 'BRA':
                        if sen_bra_guess and y_pred[i][j][6:7] == 'B':
                            bra_guess_row.append(sen_bra_guess)
                            sen_bra_guess = ''
                            sen_bra_guess += dataTest_token[i][j]
                        elif y_pred[i][j][6:7] == 'B':
                            sen_bra_guess += dataTest_token[i][j]
                        elif y_pred[i][j][6:7] == 'I':
                            sen_bra_guess += ' ' + dataTest_token[i][j]
                    elif y_pred[i][j][8:] == 'TYP':
                        if sen_typ_guess and y_pred[i][j][6:7] == 'B':
                            typ_guess_row.append(sen_typ_guess)
                            sen_typ_guess = ''
                            sen_typ_guess += dataTest_token[i][j]
                        elif y_pred[i][j][6:7] == 'B':
                            sen_typ_guess += dataTest_token[i][j]
                        elif y_pred[i][j][6:7] == 'I':
                            sen_typ_guess += ' ' + dataTest_token[i][j]
                elif y_pred[i][j][2:5] == 'PRO':
                    if sen_pro_guess and y_pred[i][j][0] == 'B':
                        pro_guess_row.append(sen_pro_guess)
                        sen_pro_guess = ''
                        sen_pro_guess += dataTest_token[i][j]
                    elif y_pred[i][j][0] == 'B':
                        sen_pro_guess += dataTest_token[i][j]
                    elif y_pred[i][j][0] == 'I':
                        sen_pro_guess += ' ' + dataTest_token[i][j]
                elif y_pred[i][j][2:5] == 'BRA':
                    if sen_bra_guess and y_pred[i][j][0] == 'B':
                        bra_guess_row.append(sen_bra_guess)
                        sen_bra_guess = ''
                        sen_bra_guess += dataTest_token[i][j]
                    elif y_pred[i][j][0] == 'B':
                        sen_bra_guess += dataTest_token[i][j]
                    elif y_pred[i][j][0] == 'I':
                        sen_bra_guess += ' ' + dataTest_token[i][j]
                elif y_pred[i][j][2:5] == 'TYP':
                    if sen_typ_guess and y_pred[i][j][0] == 'B':
                        typ_guess_row.append(sen_typ_guess)
                        sen_typ_guess = ''
                        sen_typ_guess += dataTest_token[i][j]
                    elif y_pred[i][j][0] == 'B':
                        sen_typ_guess += dataTest_token[i][j]
                    elif y_pred[i][j][0] == 'I':
                        sen_typ_guess += ' ' + dataTest_token[i][j]
        if sen_pro_data:
            pro_data_row.append(sen_pro_data)
        if sen_bra_data:
            bra_data_row.append(sen_bra_data)
        if sen_typ_data:
            typ_data_row.append(sen_typ_data)
        if sen_pro_guess:
            pro_guess_row.append(sen_pro_guess)
        if sen_bra_guess:
            bra_guess_row.append(sen_bra_guess)
        if sen_typ_guess:
            typ_guess_row.append(sen_typ_guess)
        o_data.append(o_data_row)
        o_guess.append(o_guess_row)
        pro_data.append(pro_data_row)
        pro_guess.append(pro_guess_row)
        bra_data.append(bra_data_row)
        bra_guess.append(bra_guess_row)
        typ_data.append(typ_data_row)
        typ_guess.append(typ_guess_row)
    '''
    for i in range(len(bra_data)):
        print('PRO')
        print(pro_data[i])
        print(pro_guess[i])
        print('BRA')
        print(bra_data[i])
        print(bra_guess[i])
        print('TYP')
        print(typ_data[i])
        print(typ_guess[i])
        print('---------')
    '''
    #none is the actual number from data test, r is number of guess right, g is number of guess
    bra = 0
    bra_r = 0
    bra_g = 0
    typ = 0
    typ_r = 0
    typ_g = 0
    pro = 0
    pro_r = 0
    pro_g = 0
    o = 0
    o_r = 0
    o_g = 0
    for i in range(len(pro_data)):
        for j in range(len(pro_data[i])):
            pro += 1
        for j in range(len(pro_guess[i])):
            pro_g += 1
            if pro_guess[i][j] in pro_data[i]:
                pro_r += 1
                pro_data[i].remove(pro_guess[i][j])
        for j in range(len(bra_data[i])):
            bra += 1
        for j in range(len(bra_guess[i])):
            bra_g += 1
            if bra_guess[i][j] in bra_data[i]:
                bra_r += 1
                bra_data[i].remove(bra_guess[i][j])
        for j in range(len(typ_data[i])):
            typ += 1
        for j in range(len(typ_guess[i])):
            typ_g += 1
            if typ_guess[i][j] in typ_data[i]:
                typ_r += 1
                typ_data[i].remove(typ_guess[i][j])
        for j in range(len(o_data[i])):
            o += 1
        for j in range(len(o_guess[i])):
            o_g += 1
            if o_guess[i][j] in o_data[i]:
                o_r += 1
                o_data[i].remove(o_guess[i][j])

    print('PRO')
    precision_pro = pro_r/pro_g
    recall_pro = pro_r/pro
    f1_pro = 2 * ((precision_pro * recall_pro)/(precision_pro + recall_pro))
    print('Precision : ' + str(precision_pro))
    print('Recall : ' + str(recall_pro))
    print('F1 : ' + str(f1_pro))
    print('BRA')
    precision_bra = bra_r/bra_g
    recall_bra = bra_r/bra
    f1_bra = 2 * ((precision_bra * recall_bra)/(precision_bra + recall_bra))
    print('Precision : ' + str(precision_bra))
    print('Recall : ' + str(recall_bra))
    print('F1 : ' + str(f1_bra))
    print('TYP')
    precision_typ = typ_r/typ_g
    recall_typ = typ_r/typ
    f1_typ = 2 * ((precision_typ * recall_typ)/(precision_typ + recall_typ))
    print('Precision : ' + str(precision_typ))
    print('Recall : ' + str(recall_typ))
    print('F1 : ' + str(f1_typ))
    print('O')
    precision_o = o_r/o_g
    recall_o = o_r/o
    f1_o = 2 * ((precision_o * recall_o)/(precision_o + recall_o))
    print('Precision : ' + str(precision_o))
    print('Recall : ' + str(recall_o))
    print('F1 : ' + str(f1_o))
    print('Overall Without O')
    total = pro + typ + bra
    precision = ((pro * precision_pro) + (bra * precision_bra) + (typ * precision_typ)) / total
    recall = ((pro * recall_pro) + (bra * recall_bra) + (typ * recall_typ)) / total
    f1 = ((pro * f1_pro) + (bra * f1_bra) + (typ * f1_typ)) / total
    print('Precision : ' + str(precision))
    print('Recall : ' + str(recall))
    print('F1 : ' + str(f1))
    print('Overall With O')
    total = pro + typ + bra + o
    precision = ((pro * precision_pro) + (bra * precision_bra) + (typ * precision_typ) + (o * precision_o)) / total
    recall = ((pro * recall_pro) + (bra * recall_bra) + (typ * recall_typ) + (o * recall_o)) / total
    f1 = ((pro * f1_pro) + (bra * f1_bra) + (typ * f1_typ) + (o * f1_o)) / total
    print('Precision : ' + str(precision))
    print('Recall : ' + str(recall))
    print('F1 : ' + str(f1))
#evaluatione(y_pred)

In [16]:
#write labeled data (previously unlabeled)
f = open('labeled_automatically (final) fixedztos hilo not avg min 2 0.99 50.tsv.tsv','w', encoding='utf-8') 
for i in range(len(label_for_unlabeled)):
    for j in range(len(label_for_unlabeled[i][0])):
        f.write(label_for_unlabeled[i][0][j])
        f.write('\t')
        f.write(label_for_unlabeled[i][1][j])
        f.write('\n')
        #print(label_for_unlabeled[i][j])
        #print(label_for_unlabeled[i + 1][j])
    f.write('\n')
f.write('\n')
f.close()

In [None]:
#per beginning and inner sequence, don't really care what entity inside those, especially if nested
highLimit = 0.99
w = 0
label_for_unlabeled = []
while(X_unlabeled_sup):#or w < 1 (default: X_unlabeled_sup)
    print(str(w) + ' - data left:' + str(len(X_unlabeled_sup)))
    found = False
    noNew = True
    num = []
    labelAll = []
    labelConfAll = []
    w += 1
    highAll = []
    highIndex = []
    #Xall = []
    #newBias = 1.1**(-w)
    for z in range(len(X_unlabeled_sup)): #X_test/X_unlabeled/
        a = crf.predict_marginals_single(X_unlabeled_sup[z])
        b = crf.predict_single(X_unlabeled_sup[z])
        #Xall.append(X_unlabeled_sup[z])
        labelAll.append(b)
        conf = []
        for i in range(len(a)):
            conf.append(a[i][b[i]])
        labelConfAll.append(conf)
        high = extractSequenceLabel(unlabeled_sup[z],b,a,highLimit)
        if high[0]:
            #X_train_sup.append(X_unlabeled_sup[z])
            #y_train_sup.append(labelAll[z])
            highAll.append(high)
            highIndex.append(z)
            row = []
            row.append(unlabeled_sup[z])
            row.append(b)
            label_for_unlabeled.append(row)
    #print('before replace: '+str(len(num)))
    num = []
    allSequence = []
    for i in highAll:
        allSequenceIndexRow = []
        for j in range(len(i[1])):
            allSequence.append(' '.join(i[1][j]).lower())
    counter = Counter(allSequence)
    newHighAll = []
    for i in highAll:
        newHighAllRow = []
        labelRow = []
        sequenceRow = []
        for j in range(len(i[1])):
            check = ' '.join(i[1][j]).lower()
            if counter[check] >= 2:
                labelRow.append(i[0][j])
                sequenceRow.append(i[1][j])
            else:
                for k in i[1][j]:
                    if counter[k.lower()] >= 2:
                        labelRow.append(i[0][j])
                        sequenceRow.append(i[1][j])
        newHighAllRow.append(labelRow)
        newHighAllRow.append(sequenceRow)
        newHighAll.append(newHighAllRow)
    added = False
    for i in range(len(newHighAll)):
        if newHighAll[i][0]:
            num.append(i)
            X_train_sup.append(X_unlabeled_sup[highIndex[i]])
            y_train_sup.append(labelAll[highIndex[i]])
            added = True
    print('# of hi conf that is not alone: ' + str(len(num)))
    if num and len(num) >= 50:
        if newHighAll:
            for a in range(len(unlabeled_sup)):
                changed = False
                if a not in highIndex:
                    for qq in range(len(newHighAll)):
                        if a != highIndex[qq]:
                            labelNew = replaceLabel(unlabeled_sup[a],labelAll[a],newHighAll[qq],labelConfAll[a])
                            if labelNew:
                                #print(labelAll[a])
                                labelAll[a] = labelNew
                                #print(labelAll[a])
                                changed = True
                                break
                #labelReplace = ruleReplace(Xall[a],labelAll[a])
                #if labelReplace != labelAll[a]:
                #    labelAll[a] = labelReplace
                #    changed = True
                if changed:
                    labelAll[a] = filterPro(labelAll[a])
                    #for l in range(len(X_unlabeled_sup[a])):
                    #    if newBias <= 0:
                    #        X_unlabeled_sup[a][l]['bias'] = 0
                    #    else:
                    #        X_unlabeled_sup[a][l]['bias'] = newBias
                    X_train_sup.append(X_unlabeled_sup[a])
                    y_train_sup.append(labelAll[a])
                    if a not in num:
                        row = []
                        row.append(unlabeled_sup[a])
                        row.append(labelAll[a])
                        label_for_unlabeled.append(row)
                        num.append(a)
                    added = True
        #print('after replace: '+str(len(num)))
        if added:
            crf.fit(X_train_sup, y_train_sup)
        else:
            num = []
            for a in range(len(unlabeled_sup)):
                row = []
                row.append(unlabeled_sup[a])
                row.append(labelAll[a])
                label_for_unlabeled.append(row)
                num.append(a)
        num.sort(reverse=True) #num
        for i in num: #num
            X_unlabeled_sup.pop(i)
            unlabeled_sup.pop(i)
        y_pred = crf.predict(X_test)
        evaluatione(y_pred)
    else:
        num = []
        for a in range(len(unlabeled_sup)):
            row = []
            row.append(unlabeled_sup[a])
            row.append(labelAll[a])
            label_for_unlabeled.append(row)
            num.append(a)
        num.sort(reverse=True)
        for i in num:
            X_unlabeled_sup.pop(i)
            unlabeled_sup.pop(i)
y_pred = crf.predict(X_test)
labels = list(crf.classes_)
labels.remove('O') # remove 'O' label from evaluation
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0])) # group B and I results
print(flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))
#print(label_for_unlabeled)

0 - data left:16853
# of hi conf that is not alone: 257
PRO
Precision : 0.7913043478260869
Recall : 0.6169491525423729
F1 : 0.6933333333333335
BRA
Precision : 0.9254385964912281
Recall : 0.86652977412731
F1 : 0.8950159066808059
TYP
Precision : 0.539568345323741
Recall : 0.373134328358209
F1 : 0.4411764705882353
O
Precision : 0.9490886235072281
Recall : 0.9888670595939751
F1 : 0.9685695958948043
Overall Without O
Precision : 0.8062834349033525
Recall : 0.6907426246185148
F1 : 0.7416913026196553
Overall With O
Precision : 0.9292920503302983
Recall : 0.9475391341136652
F1 : 0.9371182685376654
1 - data left:15779
# of hi conf that is not alone: 739
PRO
Precision : 0.7792207792207793
Recall : 0.6101694915254238
F1 : 0.6844106463878328
BRA
Precision : 0.9213973799126638
Recall : 0.86652977412731
F1 : 0.893121693121693
TYP
Precision : 0.5107913669064749
Recall : 0.35323383084577115
F1 : 0.4176470588235294
O
Precision : 0.9492138364779874
Recall : 0.9883759004584152
F1 : 0.9683991017003529
Ove

In [None]:
def extractEntity(sentence, labels, conf, limit):
    bra_data = []
    typ_data = []
    pro_data = []
    sen_pro_data = ''
    sen_typ_data = ''
    sen_bra_data = ''
    confB = 0
    confP = 0
    confT = 0
    sizeB = 0
    sizeP = 0
    sizeT = 0
    for j in range(len(sentence)):
        if labels[j] == 'O':
            if sen_pro_data:
                if (confP/sizeP) >= limit:
                    pro_data.append(sen_pro_data)
                confP = 0
                sizeP = 0
                sen_pro_data = ''
            if sen_bra_data:
                if (confB/sizeB) >= limit:
                    bra_data.append(sen_bra_data)
                confB = 0
                sizeB = 0
                sen_bra_data = ''
            if sen_typ_data:
                if (confT/sizeT) >= limit:
                    typ_data.append(sen_typ_data)
                confT = 0
                sizeT = 0
                sen_typ_data = ''
        else:
            if '|' in labels[j]:
                if sen_pro_data and labels[j][0] == 'B':
                    if (confP/sizeP) >= limit:
                        pro_data.append(sen_pro_data)
                    confP = 0
                    sizeP = 0
                    sen_pro_data = ''
                    sen_pro_data += sentence[j]
                    confP += conf[j][labels[j]]
                    sizeP += 1
                elif labels[j][0] == 'B':
                    sen_pro_data += sentence[j]
                    confP += conf[j][labels[j]]
                    sizeP += 1
                elif labels[j][0] == 'I':
                    sen_pro_data += ' ' + sentence[j]
                    confP += conf[j][labels[j]]
                    sizeP += 1
                if labels[j][8:] == 'BRA':
                    if sen_bra_data and labels[j][6:7] == 'B':
                        if (confB/sizeB) >= limit:
                            bra_data.append(sen_bra_data)
                        confB = 0
                        sizeB = 0
                        sen_bra_data = ''
                        sen_bra_data += sentence[j]
                        confB += conf[j][labels[j]]
                        sizeB += 1
                    elif labels[j][6:7] == 'B':
                        sen_bra_data += sentence[j]
                        confB += conf[j][labels[j]]
                        sizeB += 1
                    elif labels[j][6:7] == 'I':
                        sen_bra_data += ' ' + sentence[j]
                        confB += conf[j][labels[j]]
                        sizeB += 1
                elif labels[j][8:] == 'TYP':
                    if sen_typ_data and labels[j][6:7] == 'B':
                        if (confT/sizeT) >= limit:
                            typ_data.append(sen_typ_data)
                        confT = 0
                        sizeT = 0
                        sen_typ_data = ''
                        sen_typ_data += sentence[j]
                        confT += conf[j][labels[j]]
                        sizeT += 1
                    elif labels[j][6:7] == 'B':
                        sen_typ_data += sentence[j]
                        confT += conf[j][labels[j]]
                        sizeT += 1
                    elif labels[j][6:7] == 'I':
                        sen_typ_data += ' ' + sentence[j]
                        confT += conf[j][labels[j]]
                        sizeT += 1
            elif labels[j][2:5] == 'PRO':
                if sen_pro_data and labels[j][0] == 'B':
                    if (confP/sizeP) >= limit:
                        pro_data.append(sen_pro_data)
                    confP = 0
                    sizeP = 0
                    sen_pro_data = ''
                    sen_pro_data += sentence[j]
                    confP += conf[j][labels[j]]
                    sizeP += 1
                elif labels[j][0] == 'B':
                    sen_pro_data += sentence[j]
                    confP += conf[j][labels[j]]
                    sizeP += 1
                elif labels[j][0] == 'I':
                    sen_pro_data += ' ' + sentence[j]
                    confP += conf[j][labels[j]]
                    sizeP += 1
            elif labels[j][2:5] == 'BRA':
                if sen_bra_data and labels[j][0] == 'B':
                    if (confB/sizeB) >= limit:
                        bra_data.append(sen_bra_data)
                    confB = 0
                    sizeB = 0
                    sen_bra_data = ''
                    sen_bra_data += sentence[j]
                    confB += conf[j][labels[j]]
                    sizeB += 1
                elif labels[j][0] == 'B':
                    sen_bra_data += sentence[j]
                    confB += conf[j][labels[j]]
                    sizeB += 1
                elif labels[j][0] == 'I':
                    sen_bra_data += ' ' + sentence[j]
                    confB += conf[j][labels[j]]
                    sizeB += 1
            elif labels[j][2:5] == 'TYP':
                if sen_typ_data and labels[j][0] == 'B':
                    if (confT/sizeT) >= limit:
                        typ_data.append(sen_typ_data)
                    confT = 0
                    sizeT = 0
                    sen_typ_data = ''
                    sen_typ_data += sentence[j]
                    confT += conf[j][labels[j]]
                    sizeT += 1
                elif labels[j][0] == 'B':
                    sen_typ_data += sentence[j]
                    confT += conf[j][labels[j]]
                    sizeT += 1
                elif labels[j][0] == 'I':
                    sen_typ_data += ' ' + sentence[j]
                    confT += conf[j][labels[j]]
                    sizeT += 1
    if sen_pro_data:
        if (confP/sizeP) >= limit:
            pro_data.append(sen_pro_data)
    if sen_bra_data:
        if (confB/sizeB) >= limit:
            bra_data.append(sen_bra_data)
    if sen_typ_data:
        if (confT/sizeT) >= limit:
            typ_data.append(sen_typ_data)
    return [pro_data,bra_data,typ_data]

ii = 16407
a = crf.predict_marginals_single(X_unlabeled_sup[ii])
b = crf.predict_single(X_unlabeled_sup[ii])
pp = extractEntity(unlabeled_sup[ii],b,a,0.98)
print(unlabeled_sup[ii])
print(b)
print(pp)
'''
for z in range(len(X_unlabeled_sup)): #X_test/X_unlabeled
    a = crf.predict_marginals_single(X_unlabeled_sup[z])
    b = crf.predict_single(X_unlabeled_sup[z])
    labelAll.append(b)
    size = len(a)
    highFound = False
    for i in range(size):
        if b[i] != 'O':
            if a[i][b[i]] >= 0.5 and a[i][b[i]] <= 0.6:
                highFound = True
    if highFound:
        print(z)
'''