In [1]:
import copy

In [2]:
trainfile = 'data/en_ewt-up-train.conllu'
testfile = 'data/en_ewt-up-test.conllu'

In [3]:
def read_conll(conllfile):
    """
    This function read and process the conllu file into list of sentences lists.
    """
    with open(conllfile, 'r', encoding='utf8') as infile:
        fulllist, sentlist = [],[]
        for line in infile:
            line = line.strip()
            if (line != '\n') & (line.startswith("#") == False): # Not empty and not commented
                sentlist.append(line.split())
            if line.startswith("#") == True:
                sentlist = [i for i in sentlist if i] # Remove empty list
                fulllist.append(sentlist)
                sentlist = []
                continue
        res = [ele for ele in fulllist if ele != []] # remove empty list
    return res

In [4]:
trainlist = read_conll(trainfile)
testlist = read_conll(testfile)

In [5]:
def preprocess_list(conlllist):
    """
    This function preprocess the lists into list of sentences list.
    Each sentence list is a list of token lists. Each token list have 13 columns.
    If a sentence have 0 predicates, the column (list item) 12 and 13 (list[11] and list[12]) are set as None.
    If the sentence have multiple predicates, it will be duplicated to align the column number.
    """
    sentlist = []
    for sentence in conlllist:
        
        sents = [ [] for _ in range(50) ]
                
        
        for x in range(len(sentence)): # for components in sentence brings duplicate removal error
            components = []
            for y in range(len(sentence[x])):
                components.append(str(sentence[x][y]))

            # First 11 lines
            for i in range(0,10):
                try:
                    tokendict = {"ID":components[0], "form":components[1], "lemma":components[2], "upos":components[3], "xpos":components[4], "feats":components[5], "head":components[6], 
                             "deprel":components[7], "deps":components[8], "misc":components[9], "pred":components[10]}
                except IndexError:
                    tokendict['pred'] = '_'

            # If sentence have no predicate: assign the values '_'
            if len(components) <= 11: 
                tokendict['V'] = '_'
                tokendict['ARG'] = '_'
                tokendict['dup'] = '_'
                sents[0].append(tokendict)

            # Sentence have one or more predicate
            if len(components) > 11: 
                dup = len(components)-11     
                for k in range(0, dup):
                    tokendictk = copy.deepcopy(tokendict)
                    tokendictk['dup'] = k
                    ARGV = components[k+11]
                    if ARGV == 'V':
                        tokendictk['V'],tokendictk['ARG'] = 'V','_'
                        try:
                            tokendictk['pred'] = sentence[int(tokendictk['ID'])-1][10]
                        except IndexError:
                            print(sentence)
                            continue
                    if (ARGV != 'V') & (ARGV != '_'):
                        tokendictk['ARG'],tokendictk['V'],tokendictk['pred'] = ARGV,'_','_'
                    if ARGV == '_':
                        tokendictk['V'],tokendictk['ARG'],tokendictk['pred'] = '_','_','_'
                    sents[k].append(tokendictk)


        res = [ele for ele in sents if ele != []] # remove empty list
        sentlist += res

    return sentlist

In [6]:
preprocessed_train = preprocess_list(trainlist)
preprocessed_test = preprocess_list(testlist)

[['1', 'I', 'I', 'PRON', 'PRP', 'Case=Nom|Number=Sing|Person=1|PronType=Prs', '2', 'nsubj', '2:nsubj|9.1:nsubj|10:nsubj', '_', '_', 'ARG0', '_', '_'], ['2', 'wish', 'wish', 'VERB', 'VBP', 'Mood=Ind|Tense=Pres|VerbForm=Fin', '0', 'root', '0:root', '_', 'wish.01', 'V', '_', '_'], ['3', 'all', 'all', 'DET', 'DT', '_', '2', 'iobj', '2:iobj', '_', '_', 'ARG2', '_', '_'], ['4', 'happy', 'happy', 'ADJ', 'JJ', 'Degree=Pos', '5', 'amod', '5:amod', '_', '_', '_', 'ARGM-ADJ', '_'], ['5', 'holidays', 'holiday', 'NOUN', 'NNS', 'Number=Plur', '2', 'obj', '2:obj', 'SpaceAfter=No', 'holiday.01', 'ARG1', 'V', '_'], ['6', ',', ',', 'PUNCT', ',', '_', '10', 'punct', '9.1:punct|10:punct', '_', '_', '_', '_', '_'], ['7', 'and', 'and', 'CCONJ', 'CC', '_', '10', 'cc', '9.1:cc|10:cc', '_', '_', '_', '_', '_'], ['8', 'moreso', 'moreso', 'ADV', 'RB', '_', '10', 'orphan', '9.1:advmod', 'SpaceAfter=No', '_', '_', '_', '_'], ['9', ',', ',', 'PUNCT', ',', '_', '10', 'punct', '9.1:punct|10:punct', '_', '_', '_', '_'

In [7]:
preprocessed_test[0]

[{'ID': '1',
  'form': 'What',
  'lemma': 'what',
  'upos': 'PRON',
  'xpos': 'WP',
  'feats': 'PronType=Int',
  'head': '0',
  'deprel': 'root',
  'deps': '0:root',
  'misc': '_',
  'pred': '_',
  'dup': 0,
  'V': '_',
  'ARG': '_'},
 {'ID': '2',
  'form': 'if',
  'lemma': 'if',
  'upos': 'SCONJ',
  'xpos': 'IN',
  'feats': '_',
  'head': '4',
  'deprel': 'mark',
  'deps': '4:mark',
  'misc': '_',
  'pred': '_',
  'dup': 0,
  'V': '_',
  'ARG': '_'},
 {'ID': '3',
  'form': 'Google',
  'lemma': 'Google',
  'upos': 'PROPN',
  'xpos': 'NNP',
  'feats': 'Number=Sing',
  'head': '4',
  'deprel': 'nsubj',
  'deps': '4:nsubj',
  'misc': '_',
  'pred': '_',
  'dup': 0,
  'ARG': 'ARG1',
  'V': '_'},
 {'ID': '4',
  'form': 'Morphed',
  'lemma': 'morph',
  'upos': 'VERB',
  'xpos': 'VBD',
  'feats': 'Mood=Ind|Tense=Past|VerbForm=Fin',
  'head': '1',
  'deprel': 'advcl',
  'deps': '1:advcl:if',
  'misc': '_',
  'pred': 'morph.01',
  'dup': 0,
  'V': 'V',
  'ARG': '_'},
 {'ID': '5',
  'form': 'Int

In [8]:
def extract_feature_and_label(preplist):
    """
    This function extract features and label from preprocessed list
    """
    data = []
    targets = []
    flatlist = [x for xs in preplist for x in xs]
    for dict in flatlist:
        newdict = copy.deepcopy(dict)
        del newdict['ARG']
        data.append(newdict)
        targets.append(dict['ARG'])

    return data, targets

In [9]:
training_features, gold_labels = extract_feature_and_label(preprocessed_train)

In [10]:
training_features[1]

{'ID': '2',
 'form': '-',
 'lemma': '-',
 'upos': 'PUNCT',
 'xpos': 'HYPH',
 'feats': '_',
 'head': '1',
 'deprel': 'punct',
 'deps': '1:punct',
 'misc': 'SpaceAfter=No',
 'pred': '_',
 'dup': 0,
 'V': '_'}

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

def create_log_classifier(train_features, train_targets):
    logreg = LogisticRegression(max_iter=1000)
    vec = DictVectorizer()
    features_vectorized = vec.fit_transform(train_features)
    model = logreg.fit(features_vectorized, train_targets) 
    return model, vec

def classify_data(model, vec, inputdata):  
    features = extract_feature_and_label(inputdata)[0]
    features = vec.transform(features)
    predictions = model.predict(features)
    return predictions

def write_output_file(predictions, training_features, gold_labels, outputfile):
    outfile = open(outputfile, 'w')
    # add headings
    outfile.write('word' + '\t' + 'gold' + '\t' + 'predict' + '\n')
    for i in range(len(predictions)):
        outfile.write(training_features[i]['form'] + '\t' + gold_labels[i] + '\t' + predictions[i] + '\n')
    outfile.close()

def logreg(inputdict, outputfile, training_features=training_features, gold_labels=gold_labels):
    ml_model, vec = create_log_classifier(training_features, gold_labels)
    predictions = classify_data(ml_model, vec, inputdict)
    write_output_file(predictions, inputdict, outputfile)

logreg(preprocessed_test ,'output/logreg.csv')