# Topic Modelling and NER 

## B) Name Entity Recognition

To run this jupyter notebook, the folder "gmb-2.2.0", which contains all the files needed, should exist in the same folder

In [1]:
import os
import collections
import string
import pandas as pd
from sklearn.naive_bayes import GaussianNB
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

To perform the extraction of the Named Entities from the dataset we make use of the following functions.

The function read_gmb() reads the files, splits the sentences by 2 newline characters, the words by 1 newline character and the annotation by tab character, gets rid of the sub-categories and then for the tuples of the form (word, tag, ner) it has created, calls the to_conll_iob() function. 

The to_conll_iob() function reforms the label into the IOB representation so as to be used in the training of the classifier.

In [3]:
def to_conll_iob(annotated_sentence):
    proper_iob_tokens = []
    for idx, annotated_token in enumerate(annotated_sentence):
        tag, word, ner = annotated_token
 
        if ner != 'O':
            if idx == 0:
                ner = "B-" + ner
            elif annotated_sentence[idx - 1][2] == ner:
                ner = "I-" + ner
            else:
                ner = "B-" + ner
        proper_iob_tokens.append((tag, word, ner))
    return proper_iob_tokens
 

def read_gmb(corpus_root):
    for root, dirs, files in os.walk(corpus_root):
        for filename in files:
            if filename.endswith(".tags"):
                with open(os.path.join(root, filename), 'rb') as file_handle:
                    file_content = file_handle.read().decode('utf-8').strip()
                    annotated_sentences = file_content.split('\n\n')
                    for annotated_sentence in annotated_sentences:
                        annotated_tokens = [seq for seq in annotated_sentence.split('\n') if seq]
 
                        standard_form_tokens = []
 
                        for idx, annotated_token in enumerate(annotated_tokens):
                            annotations = annotated_token.split('\t')
                            word, tag, ner = annotations[0], annotations[1], annotations[3]
 
                            if ner != 'O':
                                ner = ner.split('-')[0]
 
                            if tag in ('LQU', 'RQU'):   
                                tag = "``"
 
                            standard_form_tokens.append((word, tag, ner))
 
                        conll_tokens = to_conll_iob(standard_form_tokens)
                        yield [((w, t), iob) for w, t, iob in conll_tokens]

In [4]:
# Call the read_gmb() function
corpus_root = "gmb-2.2.0"
reader = read_gmb(corpus_root)

In [5]:
# Transform the output of the above function into a list of tuples to be easily manageable
data = list(reader)

In [6]:
print len(data)
print data[0]

62010
[((u'Thousands', u'NNS'), u'O'), ((u'of', u'IN'), u'O'), ((u'demonstrators', u'NNS'), u'O'), ((u'have', u'VBP'), u'O'), ((u'marched', u'VBN'), u'O'), ((u'through', u'IN'), u'O'), ((u'London', u'NNP'), u'B-geo'), ((u'to', u'TO'), u'O'), ((u'protest', u'VB'), u'O'), ((u'the', u'DT'), u'O'), ((u'war', u'NN'), u'O'), ((u'in', u'IN'), u'O'), ((u'Iraq', u'NNP'), u'B-geo'), ((u'and', u'CC'), u'O'), ((u'demand', u'VB'), u'O'), ((u'the', u'DT'), u'O'), ((u'withdrawal', u'NN'), u'O'), ((u'of', u'IN'), u'O'), ((u'British', u'JJ'), u'B-gpe'), ((u'troops', u'NNS'), u'O'), ((u'from', u'IN'), u'O'), ((u'that', u'DT'), u'O'), ((u'country', u'NN'), u'O'), ((u'.', u'.'), u'O')]


In [17]:
# Split the data from their lables and flatten the lists. The size of the dataset is 1.354.149 samples
datas = [item[k][0] for item in data for k in range(len(item))]
labels = [item[k][1] for item in data for k in range(len(item))]
print datas[0], "  ", len(datas)
print labels[0], "  ", len(labels)

(u'Thousands', u'NNS')    1354149
O    1354149


In [8]:
# Split the above samples into training and test set to evaluate the training of Naive Bayes classifier
training_samples = datas[:int(len(datas) * 0.8)]
training_labels = labels[:int(len(datas) * 0.8)]
test_samples = datas[int(len(datas) * 0.8):]
test_labels = labels[int(len(datas) * 0.8):]
 
print "#training samples = %s" % len(training_samples)    
print "#test samples = %s" % len(test_samples)              

#training samples = 1083319
#test samples = 270830


In [9]:
# Import the data into pandas dataframes to be able to transform them into an input that the classifier 'understands' 
df_training = pd.DataFrame(training_samples)
df_training_labels = pd.DataFrame(training_labels)
df_test = pd.DataFrame(test_samples)
df_test_labels = pd.DataFrame(test_labels)

In [18]:
# Using the LabelEncoder to trasform the labels into numeric values for the classifier
for column in df_training_labels.columns:
    if df_training_labels[column].dtype == type(object):
        le = LabelEncoder()
        df_training_labels[column] = le.fit_transform(df_training_labels[column])

for column in df_test_labels.columns:
    if df_test_labels[column].dtype == type(object):
        le = LabelEncoder()
        df_test_labels[column] = le.fit_transform(df_test_labels[column])

In [23]:
# Create the model, fit the training data and make predictions on the test samples
gnb = GaussianNB()
data_fit = gnb.fit(df_training, np.array(df_training_labels).ravel())
prediction = data_fit.predict(df_test)

In [24]:
# Evaluate the classifier by accuracy
accuracy_score(prediction, df_test_labels)

0.8270391020197172