In [1]:
import copy

In [2]:
trainfile = 'data/en_ewt-up-train.conllu'
devfile = 'data/en_ewt-up-dev.conllu'
testfile = 'data/en_ewt-up-test.conllu'

## Read input

In [3]:
def read_conll(conllfile):
    """
    This function read and process the conllu file into list of sentences lists.
    """
    with open(conllfile, 'r', encoding='utf8') as infile:
        fulllist, sentlist = [],[]
        for line in infile:
            line = line.strip()
            if (line != '\n') & (line.startswith("#") == False): # Not empty and not commented
                sentlist.append(line.split())
            if line.startswith("#") == True:
                sentlist = [i for i in sentlist if i] # Remove empty list
                fulllist.append(sentlist)
                sentlist = []
                continue
        res = [ele for ele in fulllist if ele != []] # remove empty list
    return res

In [6]:
trainlist = read_conll(trainfile)
devlist = read_conll(devfile)
testlist = read_conll(testfile)

## Preprocess
Extract features from dataset and duplicate sentences with multiple predicate

In [5]:
def preprocess_list(conlllist):
    """
    This function preprocess the lists into list of sentences list.
    Each sentence list is a list of token lists. Each token list have 13 columns.
    If a sentence have 0 predicates, the column (list item) 12 and 13 (list[11] and list[12]) are set as None.
    If the sentence have multiple predicates, it will be duplicated to align the column number.
    """
    sentlist = []
    for sentence in conlllist:
        sents = [ [] for _ in range(50) ] # Initialize a large empty list for multiple predicate sentence    
        
        for x in range(len(sentence)): # replace 'for components in sentence' that brings duplicate removal error
            components = []
            for y in range(len(sentence[x])):
                components.append(str(sentence[x][y]))

            # First 11 lines
            for i in range(0,10):
                try:
                    tokendict = {"ID":components[0], "form":components[1], "lemma":components[2], "upos":components[3], "xpos":components[4], "feats":components[5], "head":components[6], 
                             "deprel":components[7], "deps":components[8], "misc":components[9], "pred":components[10]}
                except IndexError: # Wrong sentence in the dataset that have no column 11
                    tokendict['pred'] = '_'

            # If sentence have no predicate: assign the values '_'
            if len(components) <= 11: 
                tokendict['V'], tokendict['ARG'] ,tokendict['dup'] = '_','_','_'
                sents[0].append(tokendict)

            # Sentence have one or more predicate
            if len(components) > 11: 
                dup = len(components)-11 # Times for dpulication
                for k in range(0, dup):
                    tokendictk = copy.deepcopy(tokendict)
                    tokendictk['dup'] = k
                    ARGV = components[k+11]
                    # Following conditons change 'pred' (and ARG, V also) entry for duplicated sentence
                    if ARGV == 'V':
                        tokendictk['V'],tokendictk['ARG'] = 'V','_'
                        try:
                            tokendictk['pred'] = sentence[int(tokendictk['ID'])-1][10]
                        except IndexError:
                            print(sentence)
                            continue
                    if (ARGV != 'V') & (ARGV != '_'):
                        tokendictk['ARG'],tokendictk['V'],tokendictk['pred'] = ARGV,'_','_'
                    if ARGV == '_':
                        tokendictk['V'],tokendictk['ARG'],tokendictk['pred'] = '_','_','_'
                    sents[k].append(tokendictk)


        res = [ele for ele in sents if ele != []] # remove empty list
        sentlist += res

    return sentlist

In [7]:
preprocessed_train = preprocess_list(trainlist)
preprocessed_dev = preprocess_list(devlist)
preprocessed_test = preprocess_list(testlist)

[['1', 'I', 'I', 'PRON', 'PRP', 'Case=Nom|Number=Sing|Person=1|PronType=Prs', '2', 'nsubj', '2:nsubj|9.1:nsubj|10:nsubj', '_', '_', 'ARG0', '_', '_'], ['2', 'wish', 'wish', 'VERB', 'VBP', 'Mood=Ind|Tense=Pres|VerbForm=Fin', '0', 'root', '0:root', '_', 'wish.01', 'V', '_', '_'], ['3', 'all', 'all', 'DET', 'DT', '_', '2', 'iobj', '2:iobj', '_', '_', 'ARG2', '_', '_'], ['4', 'happy', 'happy', 'ADJ', 'JJ', 'Degree=Pos', '5', 'amod', '5:amod', '_', '_', '_', 'ARGM-ADJ', '_'], ['5', 'holidays', 'holiday', 'NOUN', 'NNS', 'Number=Plur', '2', 'obj', '2:obj', 'SpaceAfter=No', 'holiday.01', 'ARG1', 'V', '_'], ['6', ',', ',', 'PUNCT', ',', '_', '10', 'punct', '9.1:punct|10:punct', '_', '_', '_', '_', '_'], ['7', 'and', 'and', 'CCONJ', 'CC', '_', '10', 'cc', '9.1:cc|10:cc', '_', '_', '_', '_', '_'], ['8', 'moreso', 'moreso', 'ADV', 'RB', '_', '10', 'orphan', '9.1:advmod', 'SpaceAfter=No', '_', '_', '_', '_'], ['9', ',', ',', 'PUNCT', ',', '_', '10', 'punct', '9.1:punct|10:punct', '_', '_', '_', '_'

In [18]:
preprocessed_test[4]

[{'ID': '1',
  'form': '(',
  'lemma': '(',
  'upos': 'PUNCT',
  'xpos': '-LRB-',
  'feats': '_',
  'head': '14',
  'deprel': 'punct',
  'deps': '14:punct',
  'misc': 'SpaceAfter=No',
  'pred': '_',
  'dup': 1,
  'V': '_',
  'ARG': '_'},
 {'ID': '2',
  'form': 'And',
  'lemma': 'and',
  'upos': 'CCONJ',
  'xpos': 'CC',
  'feats': '_',
  'head': '14',
  'deprel': 'cc',
  'deps': '14:cc',
  'misc': 'SpaceAfter=No',
  'pred': '_',
  'dup': 1,
  'V': '_',
  'ARG': '_'},
 {'ID': '3',
  'form': ',',
  'lemma': ',',
  'upos': 'PUNCT',
  'xpos': ',',
  'feats': '_',
  'head': '14',
  'deprel': 'punct',
  'deps': '14:punct',
  'misc': '_',
  'pred': '_',
  'dup': 1,
  'V': '_',
  'ARG': '_'},
 {'ID': '4',
  'form': 'by',
  'lemma': 'by',
  'upos': 'ADP',
  'xpos': 'IN',
  'feats': '_',
  'head': '6',
  'deprel': 'case',
  'deps': '6:case',
  'misc': '_',
  'pred': '_',
  'dup': 1,
  'V': '_',
  'ARG': '_'},
 {'ID': '5',
  'form': 'the',
  'lemma': 'the',
  'upos': 'DET',
  'xpos': 'DT',
  'feat

## TO BE IMPLEMENTED: GET OTHER FEATURE

In [8]:
def create_features(preplist):
    """
    This function creates extra features by dependency parsing, using a preprocessed list.
    """
    sent_with_feature = []
    
    for sentence in preplist:
        
        # Extract sentence text
        sentence_text = []
        for dict in sentence:
            sentence_text += dict['form']
            
        # IMPLEMENT HERE: Extract sentence features
        #feat1, feat2, ... = extract_parsing_features(sentence_text)

        # Add features back to dict
        newsent = []
        for dict in sentence:
            newdict = copy.deepcopy(dict) # Avoid changing the original file
            #newdict['feat1'], newdict['feat2'], ... = feat1, feat2
            newsent.append(newdict)

        sent_with_feature.append(newsent)

    return sent_with_feature

In [None]:
def extract_parsing_features():
    """ TO BE IMPLEMENTED   "

## Single classifier

### Extract training features and labels

In [9]:
def extract_feature_and_label(preplist):
    """
    This function extract features and label from extracted feature list of dicts.
    It will flattern list of sentences into list of tokens.
    """
    data = []
    targets = []
    flatlist = [x for xs in preplist for x in xs]
    for dict in flatlist:
        newdict = copy.deepcopy(dict)
        del newdict['ARG'] # Remove gold
        data.append(newdict)
        targets.append(dict['ARG'])

    return data, targets

In [10]:
training_features, gold_labels = extract_feature_and_label(preprocessed_train)

### Create single logreg

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

def create_log_classifier(train_features, train_targets, max_iter):
    logreg = LogisticRegression(max_iter=max_iter)
    vec = DictVectorizer()
    features_vectorized = vec.fit_transform(train_features)
    model = logreg.fit(features_vectorized, train_targets) 
    return model, vec

In [12]:
model_single, vec_single = create_log_classifier(training_features, gold_labels, 100)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Predict with single logreg

In [15]:
def classify_data(model, vec, features):  
    features = vec.transform(features)
    predictions = model.predict(features)
    return predictions

In [16]:
using_test_set, test_gold = extract_feature_and_label(preprocessed_test)
single_predictions = classify_data(model_single, vec_single, using_test_set)

### Write output

In [17]:
def write_output_file(predictions, training_features, gold_labels, outputfile):
    outfile = open(outputfile, 'w')
    # add headings
    outfile.write('word' + '\t' + 'gold' + '\t' + 'predict' + '\n')
    for i in range(len(predictions)):
        outfile.write(training_features[i]['form'] + '\t' + gold_labels[i] + '\t' + predictions[i] + '\n')
    outfile.close()

In [18]:
outputpath = 'output/singlelogreg.csv'
write_output_file(single_predictions, training_features, gold_labels, outputpath)

## Double classifier
First classify is_ARG, then classify ARG_type

In [19]:
def extract_is_ARG_feature_and_label(preplist):
    """
    This function extract features and label from preprocessed list
    """
    data = []
    targets = []
    flatlist = [x for xs in preplist for x in xs]
    for dict in flatlist:
        newdict = copy.deepcopy(dict)
        del newdict['ARG'] # Remove gold
        data.append(newdict)
        
        if dict['ARG'] != '_':
            targets.append(True)
        else:
            targets.append(False)

    return data, targets

In [20]:
training_features_step1, gold_labels_step1 = extract_is_ARG_feature_and_label(preprocessed_train)

In [22]:
model_double_1, vec_double_1 = create_log_classifier(training_features_step1, gold_labels_step1, 100)

using_test_set_1, test_gold_1 = extract_is_ARG_feature_and_label(preprocessed_test)
predictions_1 = classify_data(model_double_1, vec_double_1, using_test_set_1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [46]:
def extract_ARG_type_feature_and_label(preplist):
    """
    This function extract ARG_type feature from the training set.
    """

    data = []
    targets = []
    flatlist = [x for xs in preplist for x in xs]
    
    for dict in flatlist:
        newdict = copy.deepcopy(dict)
        del newdict['ARG'] # Remove gold
        if dict['ARG'] != '_':
            newdict['is_ARG'] = 'True'
        else:
            newdict['is_ARG'] = 'False'
        
        data.append(newdict)
        targets.append(dict['ARG'])

    return data, targets

In [47]:
training_features_step2, gold_labels_step2 = extract_ARG_type_feature_and_label(preprocessed_train)

In [48]:
def extract_ARG_type_feature_and_label_with_prediction(preplist, predictions_1):
    """
    This function add result from the first classifier to the feature list for the test sets.
    """

    data = []
    targets = []
    flatlist = [x for xs in preplist for x in xs]
    
    for dict, predictions in zip(flatlist, predictions_1):
        newdict = copy.deepcopy(dict)
        del newdict['ARG'] # Remove gold
        newdict['is_ARG'] = str(predictions)
        
        data.append(newdict)
        targets.append(dict['ARG'])

    return data, targets

In [49]:
model_double_2, vec_double_2 = create_log_classifier(training_features_step2, gold_labels_step2, 100)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [50]:
using_test_set_2, test_gold_2 = extract_ARG_type_feature_and_label_with_prediction(preprocessed_test, predictions_1)

predictions_2 = classify_data(model_double_2, vec_double_2, using_test_set_2)

In [51]:
outputpath = 'output/doublelogreg.csv'
write_output_file(predictions_2, training_features_step2, gold_labels_step2, outputpath)

## GPU implementation

In [None]:
import cudf
import numpy as np
from cuml import LogisticRegression