In [4]:
# !pip install sklearn_crfsuite

In [5]:
from itertools import chain
import pandas as pd
import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
# import nltk


## Read and preprocess the data
We strongly suggest that you can add pos tags use nltk.pos_tag() function. You can find more information about pos tags in the following link: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html




In [6]:
# Read the data
def read_data(filename):
    rows = []
    with open(f'./ner/GMB/{filename}') as f:
        for line in f.readlines():
            if len(line) < 2:
                continue
            rows.append(line.rstrip('\n').split())
    data = pd.DataFrame(rows, columns=['term', 'entitytags'])
    # add the pos tags to the dataframe
    # some lines of codes
    # print(data.head())
    tokens=[]
    for word in data['term']:
        tokens.append(word)
    tokens2=nltk.pos_tag(tokens)
    # print(data.tail())
    # print(tokens2)
    tokens3=[]
    for x in tokens2:
            tokens3.append(x[1])
    data['pos'] = tokens3
    # print(data.tail())
    # print(tokens2)
    return data


In [7]:
train = read_data('train')
test = read_data('test')
dev = read_data('dev')


In [8]:
# process to get the train, test, dev dataset for crf

def process_data(data):
    dataset = []
    sent = []
    for i, (term, entitytags,pos) in data.iterrows():
        if term == '.':
            sent.append((term, entitytags,pos))
            dataset.append(sent)
            sent = []
        else:
            sent.append((term, entitytags,pos))
    return dataset


In [9]:
train_sents = process_data(train)
test_sents = process_data(test)
dev_sents = process_data(dev)


## The following function will design the feature for crf model. 
You need to add additional features to this function. The potential features you can add are:
1. The characters of the word
2. The pos tag of the word
3. the word before and after the current word

There will also be other features that you can add to improve the performance of the model.


In [10]:
import numpy as np
import gensim
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_input_file = 'src/glove.6B/glove.6B.300d.txt'  # path to the GloVe model file
word2vec_output_file = 'src/glove.6B/glove.6B.300d.word2vec'  # path to the output word2vec model file

# glove2word2vec(glove_input_file, word2vec_output_file)
# Load vectors directly from the file
model1 = gensim.models.KeyedVectors.load_word2vec_format('src/glove.6B/glove.6B.300d.word2vec',binary=False) ### Loading pre-trainned word2vec model
### Embedding function 
def get_features(word):
    word=word.lower()
    try:
         vector=model1[word]
    except:
        # if the word is not in vocabulary,
        # returns zeros array
        vector=np.zeros(300,)

    return vector
def word2features(sent, i):
    word = sent[i][0]
    emd = get_features(word)
    """
    
    Here we have already provided you some of the features. Without any modification, you can run the code and get the baseline result.
    However, the performance is not good. We suggest you to add more features to improve the performance.
    For example, you can add the character level features, the word shape features, the word embedding features, etc.
    We also suggest you to add the features of the previous word and the next word.
    
    We strongly suggests you to add pos tags as features. You can use the pos tags provided by nltk.pos_tag() to get the pos tags of the words.
    
    """
    prev_word = sent[i-1][0]
    vowels = [word.count(x) for x in "aeiouAEIOU"]
    # next_word = sent[i+1][0]

    # Load pre-trained word embeddingg

    features = {
        'word.lower()': word.lower(),
        # add more features here
        'pos_tags': sent[i][2],
        'word_len': len(word),
        'capital_start': word[0].isupper(),
        'count_vowels': sum(vowels),
        'word.isdigit': word.isdigit(),
        'word.istitle': word.istitle(),
        # 'current_word_title':word[0].title(),
        'prev_word_len':len(prev_word),
        'prev_word.lower':prev_word.lower(),
        # 'word_emb': emb,
        # 'prev_word_emb': prev_embedding,
        'prev_word_tag':sent[i-1][2],
        'prev_word_capital':prev_word[0].isupper(),
        'prev_word.istitle':prev_word.istitle(),
        'prev_word.isdigit': prev_word.isdigit()
        # 'next_word.isdigit':next_word.isdigit(),
        # 'next_word.istitle':next_word.istitle()
    }
    for iv,value in enumerate(emd):
        features['word_emd{}'.format(iv)]=value
    
    if i < len(sent)-1:
        next_word = sent[i+1][0]
        features.update({
            'next_word.isdigit':next_word.isdigit(),
            'next_word.istitle':next_word.istitle()
        })

    return features
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]


def sent2labels(sent):
    return [label for token, label,pos in sent]


def sent2tokens(sent):
    return [token for token, label,pos in sent]


In [11]:
# sent2features(train_sents[0])[0]


In [12]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

X_dev = [sent2features(s) for s in dev_sents]
y_dev = [sent2labels(s) for s in dev_sents]


: 

: 

The following block of code will help you visualize the feature for a given sentence.

In [None]:
X_train[0]

[{'word.lower()': 'masked'},
 {'word.lower()': 'assailants'},
 {'word.lower()': 'with'},
 {'word.lower()': 'grenades'},
 {'word.lower()': 'and'},
 {'word.lower()': 'automatic'},
 {'word.lower()': 'weapons'},
 {'word.lower()': 'attacked'},
 {'word.lower()': 'a'},
 {'word.lower()': 'wedding'},
 {'word.lower()': 'party'},
 {'word.lower()': 'in'},
 {'word.lower()': 'southeastern'},
 {'word.lower()': 'turkey'},
 {'word.lower()': ','},
 {'word.lower()': 'killing'},
 {'word.lower()': '45'},
 {'word.lower()': 'people'},
 {'word.lower()': 'and'},
 {'word.lower()': 'wounding'},
 {'word.lower()': 'at'},
 {'word.lower()': 'least'},
 {'word.lower()': 'six'},
 {'word.lower()': 'others'},
 {'word.lower()': '.'}]

In the following block of code, we use try and except because the version of the library.

In [None]:

crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )
try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass

This block of code will help you visualize the learned features for crf model.

In [None]:
labels = list(crf.classes_)
labels.remove('O')
# labels


In [None]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)


0.7766174794313391

In [None]:
words = [sent2tokens(s) for s in test_sents]


In [None]:
labels = [sent2labels(s) for s in test_sents]


In [None]:
predictions = []
for (word, true_id, pred_id) in zip(words, labels, y_pred):
    for (w, t, p) in zip(word, true_id, pred_id):
        line = ' '.join([w, t, p])
        predictions.append(line)
    predictions.append('')
with open('crf_pred', 'w') as f:
    f.write('\n'.join(predictions))
          

In [None]:
import os
eval_script = '../released/src/conlleval'
predf = 'crf_pred'
scoref = 'crf_score'
os.system('%s < %s > %s' % (eval_script, predf, scoref))


0

In [None]:
eval_lines = [l.rstrip() for l in open(scoref, 'r', encoding='utf8')]

for i, line in enumerate(eval_lines):
    print(line)

processed 12005 tokens with 1163 phrases; found: 880 phrases; correct: 566.
accuracy:  89.81%; precision:  64.32%; recall:  48.67%; FB1:  55.41
          company: precision:  73.42%; recall:  53.95%; FB1:  62.20  158
         facility: precision:  67.69%; recall:  64.71%; FB1:  66.17  65
          geo-loc: precision:  74.43%; recall:  65.88%; FB1:  69.89  262
            movie: precision: 100.00%; recall:  14.29%; FB1:  25.00  2
      musicartist: precision:  54.17%; recall:  23.21%; FB1:  32.50  24
            other: precision:  52.05%; recall:  38.78%; FB1:  44.44  146
           person: precision:  55.88%; recall:  54.91%; FB1:  55.39  170
          product: precision:  50.00%; recall:  13.98%; FB1:  21.85  26
       sportsteam: precision:  48.00%; recall:  30.77%; FB1:  37.50  25
           tvshow: precision:   0.00%; recall:   0.00%; FB1:   0.00  2


## Let's check what classifier learned:

You will need to analyze how the transition the model is learned in the report.

In [None]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
B-movie -> I-movie 5.701027
B-musicartist -> I-musicartist 5.685554
B-facility -> I-facility 5.433560
B-tvshow -> I-tvshow 5.124462
I-tvshow -> I-tvshow 5.087995
B-sportsteam -> I-sportsteam 5.070978
I-musicartist -> I-musicartist 4.959611
I-movie -> I-movie 4.915102
B-person -> I-person 4.469746
I-product -> I-product 4.219256
B-company -> I-company 4.214045
B-other -> I-other 4.149609
I-facility -> I-facility 4.102881
B-product -> I-product 4.084997
I-company -> I-company 4.007464
I-other -> I-other 3.847204
B-geo-loc -> I-geo-loc 3.835153
I-sportsteam -> I-sportsteam 3.246626
O      -> O       2.816641
I-geo-loc -> I-geo-loc 2.522798

Top unlikely transitions:
I-person -> I-product -2.034085
B-geo-loc -> I-company -2.034881
I-other -> I-product -2.041728
B-person -> I-other -2.109488
I-geo-loc -> I-other -2.161472
B-geo-loc -> I-facility -2.285191
B-company -> I-other -2.335745
I-person -> I-other -2.406683
I-person -> B-person -2.424812
B-geo-loc -> I-other 

## Check the state features:

You will need to analyze how your features will help the model to learn the correct labels.

In [None]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))    

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
6.156966 B-company word.lower():#talktalk
5.709716 B-company word.lower():http://web.com
5.171185 B-geo-loc word.lower():#chicago
4.654741 B-geo-loc word.lower():#la
4.649690 O        word.lower():rt
4.505379 B-company word.lower():zendesk
4.458371 B-other  word.lower():#isis
4.285988 B-facility word.lower():#revelryroom
4.238361 B-tvshow word.lower():#bb11
4.163233 B-product word.lower():#theincredibletruestory
4.137834 B-sportsteam word.lower():#bufvsphi
4.119632 B-musicartist word.lower():#dreamlabrobot
4.119521 B-sportsteam prev_word.lower:vs
4.075986 B-geo-loc word.lower():china
4.058030 I-geo-loc word.lower():mary's.
4.039027 B-other  word.lower():#lds
4.015493 B-facility word.lower():#dcnavyyard
3.992438 B-movie  word.lower():#fridaythe13th
3.974290 B-company word.lower():#katv7
3.969215 B-company word.lower():linode
3.938312 B-other  word.lower():christmas
3.935788 B-other  word.lower():#daca
3.899440 B-facility prev_word.lower:at
3.874722 B-company word.lower():s