# Assignment 2

* Implement a linear chain CRF with a library
* Use it on two data sets with the same set of features
* Implement the model and evaluate with F-Score. How are the feature weights which are learnt different between the models?

## File handling and data loading

Loads data from files, for both POS training&testing and NER training&testing files

In [53]:
"""
Notebook Imports
"""
import os
import pycrfsuite # pip install python-crfsuite
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [54]:
class DataLoader:
    """
    Loads input data files and cleans up data into following format:
    
    eg: (Data1, Type1)
    """
    #
    def __init__(self, POS_train_path, POS_test_path, NER_train_path, NER_test_path):
        """
        Loads data files into memory
        """
        __POS_train_path = POS_train_path
        __POS_test_path = POS_test_path
        __NER_train_path = NER_train_path
        __NER_test_path = NER_test_path
        #
        # Loads data from disk into memory
        __POS_train_data, __POS_test_data, __NER_train_data, __NER_test_data = self.__extract_data_from_files(POS_train_path=__POS_train_path,
                                                                                                              POS_test_path=__POS_test_path,
                                                                                                              NER_train_path=__NER_train_path,
                                                                                                              NER_test_path=__NER_test_path)
        #
        # Formats data currently loaded in memory into a formatted structure
        self.__POS_train_structured_data, self.__POS_test_structured_data, self.__NER_train_structured_data,self.__NER_test_structured_data=self.__format_data(POS_train_data=__POS_train_data,
                                                                                                                                                               POS_test_data=__POS_test_data,
                                                                                                                                                               NER_train_data=__NER_train_data,
                                                                                                                                                               NER_test_data=__NER_test_data)
    #
    def __extract_data_from_files(self, POS_train_path, POS_test_path, NER_train_path, NER_test_path):
        """
        Opens data files and returns data in memory
        """
        #
        BASE_DIR = os.path.join( os.path.dirname(os.getcwd()))
        #
        with open(BASE_DIR + POS_train_path) as f:
            POS_train_data = f.read()
        with open(BASE_DIR + POS_test_path) as f:
            POS_test_data = f.read()
        with open(BASE_DIR + NER_train_path) as f:
            NER_train_data = f.read()
        with open(BASE_DIR + NER_test_path) as f:
            NER_test_data = f.read()
        #
        return POS_train_data, POS_test_data, NER_train_data, NER_test_data
    #
    def __format_data(self, POS_train_data, POS_test_data, NER_train_data, NER_test_data):
        """
        Formats data into a python data structure (list of lists)
        eg: (Data1, Type1)
        """
        POS_train_structured_data, POS_test_structured_data, NER_train_structured_data, NER_test_structured_data = [],[],[],[]
        for line in POS_train_data.split("\n"):
            #print(line)
            if line is not None and line != "": 
                sub_list = line.split("\t")
                POS_train_structured_data.append(sub_list)
        for line in POS_test_data.split("\n"):
            if line is not None and line != "": 
                sub_list = line.split("\t")
                POS_test_structured_data.append(sub_list)
        for line in NER_train_data.split("\n"):
            if line is not None and line != "": 
                sub_list = line.split("\t|")
                NER_train_structured_data.append(sub_list)
        for line in NER_test_data.split("\n"):
            if line is not None and line != "": 
                sub_list = line.split("\t|")
                NER_test_structured_data.append(sub_list)
        #
        return POS_train_structured_data, POS_test_structured_data, NER_train_structured_data, NER_test_structured_data
    #
    def load_data(self):
        """
        Method wrapper which loads data into memory and returns all relevant training and testing files
        """
        return self.__POS_train_structured_data, self.__POS_test_structured_data, self.__NER_train_structured_data, self.__NER_test_structured_data
#
data_loader_obj = DataLoader(POS_train_path="\\data\\pos\\train.col",
                             POS_test_path="\\data\\pos\\test.col",
                             NER_train_path="\\data\\ner-pol\\train.iob",
                             NER_test_path="\\data\\ner-pol\\test.iob")
POS_train_data, POS_test_data, NER_train_data, NER_test_data = data_loader_obj.load_data()
print("Example: " + str(POS_train_data[0:5]))
print("Example: " + str(POS_test_data[0:5]))
print("Example: " + str(NER_train_data[0:5]))
print("Example: " + str(NER_test_data[0:5]))

Example: [['In', 'IN'], ['an', 'DT'], ['Oct.', 'NNP'], ['19', 'CD'], ['review', 'NN']]
Example: [['Measuring', 'NN'], ['cups', 'NNS'], ['may', 'MD'], ['soon', 'RB'], ['be', 'VB']]
Example: [['-DOCSTART-', 'O'], ['EU', 'B-ORG'], ['rejects', 'O'], ['German', 'B-MISC'], ['call', 'O']]
Example: [['-DOCSTART-', 'O'], ['SOCCER', 'O'], ['-', 'O'], ['JAPAN', 'B-LOC'], ['GET', 'O']]


## Feature Enhancer

Adds more features to the already loaded vectors, including the following features:

* The word itself, coverted to lower case
* Word Suffix (-2,-3)
* Boolean if string is uppercased
* Boolean if string is a title (eg: Title)
* Boolean if string is a digit
* The postag
* The string before it, converted to lowercase
* Boolean if the string before it is a title (eg: Title)
* Boolean if the string before it is uppercased
* Boolean if the string before it is a digit
* postag of string before it
* The string after it, converted to lowercase
* Boolean if the string after it is a title (eg: Title)
* Boolean if the string after it is uppercased
* Boolean if the string after it is a digit
* postag of string after it

In [55]:
def word2features(doc, i, model_type):
    """
    Accepts a word, and respective POS tag, and converts it into a vector of features
    
    Applies different features to the model, depending on whether we are tackling POS tagging, or NER tagging.
    """
    #print(doc[i])
    word = doc[i][0]
    postag = doc[i][1]
    #
    if model_type == 0: # POS Tagging Features
        #
        features = [
            'bias',
            'word.lower=' + word.lower(),
            'word[-3:]=' + word[-3:],
            'word[-2:]=' + word[-2:],
            'word.isupper=%s' % word.isupper(),
            'word.istitle=%s' % word.istitle(),
            'word.isdigit=%s' % word.isdigit(),
            'postag=' + postag
        ]
        #
        # Preword
        if i > 0:
            word1 = doc[i-1][0]
            postag1 = doc[i-1][1]
            features.extend([
                '-1:word.lower=' + word1.lower(),
                '-1:word.istitle=%s' % word1.istitle(),
                '-1:word.isupper=%s' % word1.isupper(),
                '-1:word.isdigit=%s' % word1.isdigit(),
                '-1:postag=' + postag1
            ])
        else:
            # Beginning of document
            features.append('BOS')
        #
        # Postword
        if i < len(doc)-1:
            word1 = doc[i+1][0]
            postag1 = doc[i+1][1]
            features.extend([
                '+1:word.lower=' + word1.lower(),
                '+1:word.istitle=%s' % word1.istitle(),
                '+1:word.isupper=%s' % word1.isupper(),
                '+1:word.isdigit=%s' % word1.isdigit(),
                '+1:postag=' + postag1
            ])
        else:
            # End of document
            features.append('EOS')
    else:
        #
        features = [
            'bias',
            'word.lower=' + word.lower(),
            'word[-3:]=' + word[-3:],
            'word[-2:]=' + word[-2:],
            'word.isupper=%s' % word.isupper(),
            'word.istitle=%s' % word.istitle(),
            'word.isdigit=%s' % word.isdigit(),
            'postag=' + postag
        ]
        #
        # Preword
        if i > 0:
            word1 = doc[i-1][0]
            postag1 = doc[i-1][1]
            features.extend([
                '-1:word.lower=' + word1.lower(),
                '-1:word.istitle=%s' % word1.istitle(),
                '-1:word.isupper=%s' % word1.isupper(),
                '-1:word.isdigit=%s' % word1.isdigit(),
                '-1:postag=' + postag1
            ])
        else:
            # Beginning of document
            features.append('BOS')
        #
        # Postword
        if i < len(doc)-1:
            word1 = doc[i+1][0]
            postag1 = doc[i+1][1]
            features.extend([
                '+1:word.lower=' + word1.lower(),
                '+1:word.istitle=%s' % word1.istitle(),
                '+1:word.isupper=%s' % word1.isupper(),
                '+1:word.isdigit=%s' % word1.isdigit(),
                '+1:postag=' + postag1
            ])
        else:
            # End of document
            features.append('EOS')
    #    
    return features
#
# A function for extracting features in documents
def extract_features(doc, model_type):
    return [word2features(doc, i, model_type) for i in range(len(doc))]
#
# A function for generating the list of labels for each document
def get_labels(doc):
    return [label for (token, label) in doc]
#
# A function for generating the list of features for each document
def get_features(doc):
    return [token for (token, label) in doc]
#
print(POS_train_data[:5])
print(NER_train_data[:5])
print("---------------------------------------------------------------------------------")
X_POS_train_data = extract_features(POS_train_data, 0)
y_POS_train_data = get_labels(POS_train_data)
X_NER_train_data = extract_features(NER_train_data, 1)
y_NER_train_data = get_labels(NER_train_data)
print("POS Train Data Snippet:")
print(X_POS_train_data[:5])
print(y_POS_train_data[:5])
print("NER Train Data Snippet:")
print(X_NER_train_data[:5])
print(y_NER_train_data[:5])

[['In', 'IN'], ['an', 'DT'], ['Oct.', 'NNP'], ['19', 'CD'], ['review', 'NN']]
[['-DOCSTART-', 'O'], ['EU', 'B-ORG'], ['rejects', 'O'], ['German', 'B-MISC'], ['call', 'O']]
---------------------------------------------------------------------------------
POS Train Data Snippet:
[['bias', 'word.lower=in', 'word[-3:]=In', 'word[-2:]=In', 'word.isupper=False', 'word.istitle=True', 'word.isdigit=False', 'postag=IN', 'BOS', '+1:word.lower=an', '+1:word.istitle=False', '+1:word.isupper=False', '+1:word.isdigit=False', '+1:postag=DT'], ['bias', 'word.lower=an', 'word[-3:]=an', 'word[-2:]=an', 'word.isupper=False', 'word.istitle=False', 'word.isdigit=False', 'postag=DT', '-1:word.lower=in', '-1:word.istitle=True', '-1:word.isupper=False', '-1:word.isdigit=False', '-1:postag=IN', '+1:word.lower=oct.', '+1:word.istitle=True', '+1:word.isupper=False', '+1:word.isdigit=False', '+1:postag=NNP'], ['bias', 'word.lower=oct.', 'word[-3:]=ct.', 'word[-2:]=t.', 'word.isupper=False', 'word.istitle=True', '

## Training Model - POS

In [56]:
file_path = "POS_crf.model"
file_exists = os.path.exists(file_path)
if(not file_exists):
    trainer_POS = pycrfsuite.Trainer(verbose=True)
    #
    # Submit training data to the trainer
    for xseq, yseq in zip(X_POS_train_data, y_POS_train_data):
        #print(xseq)
        #print(yseq)
        trainer_POS.append([xseq], [yseq])
    # print(len(X_POS_train_data))
    # print(len(y_POS_train_data))
    # trainer_POS.append(X_POS_train_data, y_POS_train_data)
    #
    # Set the parameters of the model
    trainer_POS.set_params({
        # coefficient for L1 penalty
        'c1': 0.1,

        # coefficient for L2 penalty
        'c2': 0.01,  

        # maximum number of iterations
        'max_iterations': 100,

        # whether to include transitions that
        # are possible, but not observed
        'feature.possible_transitions': True
    })
    #
    # Provide a file name as a parameter to the train function, such that
    # the model will be saved to the file when training is finished
    trainer_POS.train(file_path)
    print('POS Model Trained..')

## Training Model - NER

In [57]:
file_path = "NER_crf.model"
file_exists = os.path.exists(file_path)
if(not file_exists):
    trainer_NER = pycrfsuite.Trainer(verbose=True)
    #
    # Submit training data to the trainer
    for xseq, yseq in zip(X_NER_train_data, y_NER_train_data):
        #print(xseq)
        #print(yseq)
        trainer_NER.append([xseq], [yseq])
    # print(len(X_POS_train_data))
    # print(len(y_POS_train_data))
    # trainer_POS.append(X_POS_train_data, y_POS_train_data)
    #
    # Set the parameters of the model
    trainer_NER.set_params({
        # coefficient for L1 penalty
        'c1': 0.1,

        # coefficient for L2 penalty
        'c2': 0.01,  

        # maximum number of iterations
        'max_iterations': 100,

        # whether to include transitions that
        # are possible, but not observed
        'feature.possible_transitions': True
    })
    #
    # Provide a file name as a parameter to the train function, such that
    # the model will be saved to the file when training is finished
    trainer_NER.train(file_path)
    print('NER Model Trained..')

## Running models on Test Data

In [58]:
X_POS_test_data = get_features(POS_test_data)
y_POS_test_data = get_labels(POS_test_data)
X_NER_test_data = get_features(NER_test_data)
y_NER_test_data = get_labels(NER_test_data)
print("POS Train Data Snippet:")
print(X_POS_test_data[:5])
print(y_POS_test_data[:5])
print("NER Train Data Snippet:")
print(X_NER_test_data[:5])
print(y_NER_test_data[:5])
print("--------------------")
#
# Running test data on POS model   
tagger_POS = pycrfsuite.Tagger()
tagger_POS.open('POS_crf.model')
y_POS_pred = [tagger_POS.tag([xseq]) for xseq in X_POS_test_data]
#
# Running test data on NER model   
tagger_NER = pycrfsuite.Tagger()
tagger_NER.open('NER_crf.model')
y_NER_pred = [tagger_NER.tag([xseq]) for xseq in X_NER_test_data]
#
print("POS Pred Data Snippet:")
print(y_POS_pred[:5])
print("NER Pred Data Snippet:")
print(y_NER_pred[:5])

POS Train Data Snippet:
['Measuring', 'cups', 'may', 'soon', 'be']
['NN', 'NNS', 'MD', 'RB', 'VB']
NER Train Data Snippet:
['-DOCSTART-', 'SOCCER', '-', 'JAPAN', 'GET']
['O', 'O', 'O', 'B-LOC', 'O']
--------------------
POS Pred Data Snippet:
[['IN'], ['IN'], ['IN'], ['IN'], ['IN']]
NER Pred Data Snippet:
[['O'], ['O'], ['O'], ['O'], ['O']]


## Evaluation

In [59]:
average='weighted'
#
# POS Evaluation
accuracy = accuracy_score(y_POS_test_data, y_POS_pred) * 100
precision = precision_score(y_POS_test_data, y_POS_pred, average=average) * 100
recall = recall_score(y_POS_test_data, y_POS_pred, average=average) * 100
f1_s = f1_score(y_POS_test_data, y_POS_pred, average=average) * 100
print("POS Accuracy: " + str(accuracy))
print("POS Precision: " + str(precision))
print("POS Recall: " + str(recall))
print("POS F1 Score: " + str(f1_s) + "\n")
#
# NER Evaluation
accuracy = accuracy_score(y_NER_test_data, y_NER_pred) * 100
precision = precision_score(y_NER_test_data, y_NER_pred, average=average) * 100
recall = recall_score(y_NER_test_data, y_NER_pred, average=average) * 100
f1_s = f1_score(y_NER_test_data, y_NER_pred, average=average) * 100
print("NER Accuracy: " + str(accuracy))
print("NER Precision: " + str(precision))
print("NER Recall: " + str(recall))
print("NER F1 Score: " + str(f1_s))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


POS Accuracy: 10.7968534198
POS Precision: 1.16572043769
POS Recall: 10.7968534198
POS F1 Score: 2.10424827368

NER Accuracy: 82.6168945271
NER Precision: 68.255512613
NER Recall: 82.6168945271
NER F1 Score: 74.7526813329
