# Assignment 2

* Implement a linear chain CRF with a library
* Use it on two data sets with the same set of features
* Train the model and evaluate it with F-score. How do the learnt feature weights differ between the two models?

## File handling and data loading

Loads the training and testing files for both the POS and the NER data sets.

In [4]:
"""
Notebook Imports
"""
import os

In [5]:
class DataLoader:
    """
    Loads the POS and NER train/test files and parses each into a list of rows.

    Every non-empty line of a file becomes one row (a list of string fields).

    eg: [['Measuring', 'NN'], ['cups', 'NNS']]
    """
    # Field separators. str.split treats each as a literal string, so the NER
    # separator is the two-character sequence tab + pipe (not a regex).
    # NOTE(review): confirm "\t|" matches the layout of the .iob files.
    __POS_SEPARATOR = "\t"
    __NER_SEPARATOR = "\t|"
    #
    def __init__(self, POS_train_path, POS_test_path, NER_train_path, NER_test_path):
        """
        Reads the four data files and stores their parsed rows on the instance.

        Each path is appended to the parent of the current working directory,
        so it is expected to start with a path separator.
        """
        POS_train_raw, POS_test_raw, NER_train_raw, NER_test_raw = self.__extract_data_from_files(
            POS_train_path=POS_train_path,
            POS_test_path=POS_test_path,
            NER_train_path=NER_train_path,
            NER_test_path=NER_test_path,
        )
        #
        (
            self.__POS_train_structured_data,
            self.__POS_test_structured_data,
            self.__NER_train_structured_data,
            self.__NER_test_structured_data,
        ) = self.__format_data(
            POS_train_data=POS_train_raw,
            POS_test_data=POS_test_raw,
            NER_train_data=NER_train_raw,
            NER_test_data=NER_test_raw,
        )
    #
    @staticmethod
    def __read_text(path):
        """
        Returns the full text content of the file at path.
        """
        # NOTE(review): no explicit encoding is passed, so the platform default
        # is used - confirm the encoding of the data files.
        with open(path) as f:
            return f.read()
    #
    @staticmethod
    def __parse_rows(raw_text, separator):
        """
        Splits raw_text into lines, drops empty lines and splits each remaining
        line on separator. Returns a list of lists of strings.
        """
        return [line.split(separator) for line in raw_text.split("\n") if line]
    #
    def __extract_data_from_files(self, POS_train_path, POS_test_path, NER_train_path, NER_test_path):
        """
        Opens the four data files and returns their raw text as a tuple
        (POS train, POS test, NER train, NER test).
        """
        base_dir = os.path.dirname(os.getcwd())
        # Plain concatenation is deliberate: the paths start with a separator,
        # and os.path.join would discard (part of) base_dir for such paths.
        return tuple(
            self.__read_text(base_dir + path)
            for path in (POS_train_path, POS_test_path, NER_train_path, NER_test_path)
        )
    #
    def __format_data(self, POS_train_data, POS_test_data, NER_train_data, NER_test_data):
        """
        Formats raw file text into a python data structure (list of lists).

        Returns a tuple (POS train, POS test, NER train, NER test).
        eg: [['Measuring', 'NN'], ['cups', 'NNS']]
        """
        return (
            self.__parse_rows(POS_train_data, self.__POS_SEPARATOR),
            self.__parse_rows(POS_test_data, self.__POS_SEPARATOR),
            self.__parse_rows(NER_train_data, self.__NER_SEPARATOR),
            self.__parse_rows(NER_test_data, self.__NER_SEPARATOR),
        )
    #
    def load_data(self):
        """
        Returns the rows parsed at construction time as a tuple
        (POS train, POS test, NER train, NER test).
        """
        return (
            self.__POS_train_structured_data,
            self.__POS_test_structured_data,
            self.__NER_train_structured_data,
            self.__NER_test_structured_data,
        )
#
# Build the loader; DataLoader appends each path to the parent of the current
# working directory, which is why every path starts with a separator.
data_loader_obj = DataLoader(
    POS_train_path="\\data\\pos\\train.col",
    POS_test_path="\\data\\pos\\test.col",
    NER_train_path="\\data\\ner-pol\\train.iob",
    NER_test_path="\\data\\ner-pol\\test.iob",
)
(
    POS_train_data,
    POS_test_data,
    NER_train_data,
    NER_test_data,
) = data_loader_obj.load_data()
# Preview the first five parsed rows of the POS test set
print("Example: " + str(POS_test_data[:5]))

Example: [['Measuring', 'NN'], ['cups', 'NNS'], ['may', 'MD'], ['soon', 'RB'], ['be', 'VB']]


## Feature Enhancer

Adds more features to the already loaded vectors, including the following features:

* The word itself, converted to lower case
* Word suffixes (the last 3 and the last 2 characters)
* Boolean if string is uppercased
* Boolean if string is a title (eg: Title)
* Boolean if string is a digit
* The postag
* The string before it, converted to lowercase
* Boolean if the string before it is a title (eg: Title)
* Boolean if the string before it is uppercased
* Boolean if the string before it is a digit
* postag of string before it
* The string after it, converted to lowercase
* Boolean if the string after it is a title (eg: Title)
* Boolean if the string after it is uppercased
* Boolean if the string after it is a digit
* postag of string after it

In [6]:
def word2features(doc, i):
    """
    Builds the list of string features for the row at position i of doc.

    doc is a sequence of rows where row[0] is the word and row[1] is its tag.
    The result holds the features of the word itself, followed by those of the
    previous row (or 'BOS' when i is not greater than 0) and those of the next
    row (or 'EOS' when i is the last position).
    """
    token, tag = doc[i][0], doc[i][1]
    #
    # Features of the current word.
    # Note: the two suffix features carry no '=' between name and value.
    result = [
        'bias',
        'word.lower={}'.format(token.lower()),
        'word[-3:]{}'.format(token[-3:]),
        'word[-2:]{}'.format(token[-2:]),
        'word.isupper={}'.format(token.isupper()),
        'word.istitle={}'.format(token.istitle()),
        'word.isdigit={}'.format(token.isdigit()),
        'postag={}'.format(tag),
    ]
    #
    # Previous row, or the beginning-of-document marker
    if i <= 0:
        result += ['BOS']
    else:
        prev_token, prev_tag = doc[i - 1][0], doc[i - 1][1]
        result += [
            '-1:word.lower={}'.format(prev_token.lower()),
            '-1:word.istitle={}'.format(prev_token.istitle()),
            '-1:word.isupper={}'.format(prev_token.isupper()),
            '-1:word.isdigit={}'.format(prev_token.isdigit()),
            '-1:postag={}'.format(prev_tag),
        ]
    #
    # Next row, or the end-of-document marker
    if i >= len(doc) - 1:
        result += ['EOS']
    else:
        next_token, next_tag = doc[i + 1][0], doc[i + 1][1]
        result += [
            '+1:word.lower={}'.format(next_token.lower()),
            '+1:word.istitle={}'.format(next_token.istitle()),
            '+1:word.isupper={}'.format(next_token.isupper()),
            '+1:word.isdigit={}'.format(next_token.isdigit()),
            '+1:postag={}'.format(next_tag),
        ]
    #
    return result
#
# Function for extracting features & labels
def extract_features_labels(data):
    """
    Returns (X, y) for data: X holds the word2features list of every row and
    y holds the value in column 1 of every row, both in row order.
    """
    # NOTE(review): column 1 is also what word2features emits as 'postag=',
    # so the label itself is among the features - confirm this is intended.
    feature_rows = [word2features(data, position) for position in range(len(data))]
    labels = [row[1] for row in data]
    return feature_rows, labels
#
# Features and labels for the POS training rows, then a preview of the first five of each
(X_POS_train_data, y_POS_train_data) = extract_features_labels(POS_train_data)
print(X_POS_train_data[:5])
print(y_POS_train_data[:5])

[['bias', 'word.lower=in', 'word[-3:]In', 'word[-2:]In', 'word.isupper=False', 'word.istitle=True', 'word.isdigit=False', 'postag=IN', 'BOS', '+1:word.lower=an', '+1:word.istitle=False', '+1:word.isupper=False', '+1:word.isdigit=False', '+1:postag=DT'], ['bias', 'word.lower=an', 'word[-3:]an', 'word[-2:]an', 'word.isupper=False', 'word.istitle=False', 'word.isdigit=False', 'postag=DT', '-1:word.lower=in', '-1:word.istitle=True', '-1:word.isupper=False', '-1:word.isdigit=False', '-1:postag=IN', '+1:word.lower=oct.', '+1:word.istitle=True', '+1:word.isupper=False', '+1:word.isdigit=False', '+1:postag=NNP'], ['bias', 'word.lower=oct.', 'word[-3:]ct.', 'word[-2:]t.', 'word.isupper=False', 'word.istitle=True', 'word.isdigit=False', 'postag=NNP', '-1:word.lower=an', '-1:word.istitle=False', '-1:word.isupper=False', '-1:word.isdigit=False', '-1:postag=DT', '+1:word.lower=19', '+1:word.istitle=False', '+1:word.isupper=False', '+1:word.isdigit=True', '+1:postag=CD'], ['bias', 'word.lower=19', '