# Basic System

This notebook provides code for implementing a very simple machine learning system for named entity recognition.
It uses logistic regression and one feature (the token itself). 

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
import pandas as pd
import sys

In [2]:
def extract_features_and_labels(trainingfile: str):
    '''
    Read a whitespace-separated file with one token per line and collect,
    for every non-empty line, its features and its gold label.

    :param trainingfile: path to file with training data

    :returns: a list of feature dictionaries (the first column stored under the key 'token'),
              and a parallel list holding the last column of each of those lines
    '''
    feature_dicts, labels = [], []
    with open(trainingfile, 'r', encoding='utf8') as infile:
        for raw_line in infile:
            columns = raw_line.rstrip('\n').split()
            # empty and whitespace-only lines carry no token; skip them
            if not columns:
                continue
            feature_dicts.append({'token': columns[0]})
            # the gold label is taken from the last column
            labels.append(columns[-1])
    return feature_dicts, labels

In [3]:
def extract_features(inputfile: str):
    '''
    Read a whitespace-separated file with one token per line and build
    a feature dictionary for every non-empty line.

    :param inputfile: path to file with input data

    :returns: a list of feature dictionaries, each holding the first column of a line under the key 'token'
    '''
    with open(inputfile, 'r', encoding='utf8') as infile:
        split_lines = (raw_line.rstrip('\n').split() for raw_line in infile)
        # empty and whitespace-only lines split into an empty list and are left out
        return [{'token': columns[0]} for columns in split_lines if columns]

In [4]:
def create_classifier(train_features, train_targets):
    '''
    Train a logistic regression classifier on the provided training data.

    :param train_features: a list of training data features
    :param train_targets: a list of training data target labels

    :returns: the trained logistic regression model, and the fitted DictVectorizer that must be used
              to transform feature representations before they are passed to the model
    '''
    vec = DictVectorizer()
    #fit the vectorizer and create sparse vectors for the training features
    train_matrix = vec.fit_transform(train_features)

    model = LogisticRegression(solver='liblinear')
    model.fit(train_matrix, train_targets)

    return model, vec

In [5]:
def classify_data(model, vec, inputdata: str, outputfile: str):
    '''
    This function classifies data and saves the output

    The output file repeats every non-empty line of the input file with one extra
    tab-separated column appended. The first line of the input file is treated as a header
    whose last field is an integer column index; the new column gets that index plus one
    as its header. Empty lines of the input file are not copied to the output file.

    :param model: a logistic regression model
    :param vec: a DictVectorizer class which can be used to transform feature representations to vectors
    :param inputdata: path to file with input data
    :param outputfile: path to file in which model predictions are saved

    :raises ValueError: if the last field of the header line is not an integer
    :raises IndexError: if the header line is empty
    '''
    features = extract_features(inputdata)
    features = vec.transform(features)
    #predict labels of input features
    predictions = model.predict(features)
    #save predictions in output file; both files use the same encoding as extract_features
    #and are closed even when writing fails
    with open(inputdata, 'r', encoding='utf8') as infile, \
            open(outputfile, 'w', encoding='utf8') as outfile:
        header = next(infile, None)
        if header is None:
            #empty input file: nothing to write
            return
        #insert a header to the newly added column; parse the whole last field
        #so that multi-digit column indices are handled as well
        header = header.rstrip('\n')
        new_column = str(int(header.split()[-1]) + 1)
        outfile.write(f'{header}\t{new_column}\n')
        #extract_features builds a feature dict for every non-empty line, including the
        #header line, so predictions[0] belongs to the header; the first data line
        #therefore starts at index 1
        counter = 1
        for line in infile:
            line = line.rstrip('\n')
            #add the predictions as a new column
            if line.split():
                outfile.write(f'{line}\t{predictions[counter]}\n')
                counter += 1

In [6]:
def main(argv=None):
    '''
    Train a classifier on a training file, classify an input file and save the predictions.

    :param argv: list of arguments in commandline order: the program name, the path to the training file,
                 the path to the input file and the path to the output file; sys.argv is used when None
    '''
    #fall back on the commandline arguments when none are passed explicitly
    arguments = sys.argv if argv is None else argv

    train_path = arguments[1]
    input_path = arguments[2]
    output_path = arguments[3]

    #extract features and gold labels from the training data
    train_features, train_labels = extract_features_and_labels(train_path)
    #train the model together with its vectorizer
    classifier, vectorizer = create_classifier(train_features, train_labels)
    #classify the input data and save the predictions
    classify_data(classifier, vectorizer, input_path, output_path)

In [8]:
#train on the conll 2003 train data and classify the preprocessed dev data, which serves as test data here
#the first element stands in for the program name that sys.argv would normally provide
args = [
    'python',
    '../data/conll2003.train.conll',
    '../data/conll2003.dev-preprocessed.conll',
    '../data/logistic_basic_output.conll',
]
main(args)