# NLU: Mid-Term Assignment 2022
### Description
In this notebook, we ask you to complete four main tasks to show what you have learnt during the NLU labs. Therefore, to complete the assignment please refer to the concepts, libraries and other materials shown and used during the labs. The last task is not mandatory, it is a *BONUS* to get an extra mark for the laude. 

### Instructions
- **Dataset**: in this notebook, you are asked to work with the dataset *Conll 2003* provided by us in the *data* folder. Please, load the files from the *data* folder and **do not** change names or paths of the inner files. 
- **Output**: for each part of your task, print your results and leave it in the notebook. Please, **do not** send a jupyter notebook without the printed outputs.
- **Other**: follow carefully all the further instructions and suggestions given in the question descriptions.

### Deadline
The deadline is due in two weeks from the project presentation. Please, refer to *piazza* channel for the exact date.

## Setup

In [16]:
from nltk import FreqDist
from nltk.lm import Vocabulary
from nltk.corpus import ConllCorpusReader

CORPUS_ROOT = 'data'
CORPUS_FILEIDS = ['train.txt', 'test.txt', 'valid.txt']
CORPUS_COLUMNTYPES = ['words', 'ne', 'pos', 'chunk', 'tree']

corpus = ConllCorpusReader(CORPUS_ROOT, CORPUS_FILEIDS, CORPUS_COLUMNTYPES)
corpus_train = ConllCorpusReader(CORPUS_ROOT, CORPUS_FILEIDS[0], CORPUS_COLUMNTYPES)
corpus_test = ConllCorpusReader(CORPUS_ROOT, CORPUS_FILEIDS[1], CORPUS_COLUMNTYPES)
corpus_val = ConllCorpusReader(CORPUS_ROOT, CORPUS_FILEIDS[2], CORPUS_COLUMNTYPES)

# Removing DOCSTART and other empty lists
corpus_sents = [s for s in corpus.sents() if s != []]
corpus_train_sents = [s for s in corpus_train.sents() if s != []]
corpus_test_sents = [s for s in corpus_test.sents() if s != []]
corpus_val_sents = [s for s in corpus_val.sents() if s != []]

import spacy
from spacy.tokenizer import Tokenizer
nlp = spacy.load('en_core_web_sm')
nlp.tokenizer = Tokenizer(nlp.vocab)

from sklearn.metrics import classification_report

import os
import sys
sys.path.insert(0, os.path.abspath('./src/'))
from conll import evaluate

import pandas as pd

# Utilities
def nbest(d, n=1):
    """
    get n max values from a dict
    :param d: input dict (values are numbers, keys are stings)
    :param n: number of values to get (int)
    :return: dict of top n key-value pairs
    """
    return dict(sorted(d.items(), key=lambda item: item[1], reverse=True)[:n])

def get_flat_sents(corpus):
    sents = list()
    for sent in corpus:
        flat_sent = ""
        for w in sent:
            flat_sent += f"{w} "
        sents.append(flat_sent.strip())
    return sents

## Task 1: Analysis of the dataset

### Q 1.1
- Create the Vocabulary and Frequency Dictionary of the:
    1. Whole dataset
    2. Train set
    3. Test set
    
**Attention**: print the first 20 words of the Dictionaty of each set

In [18]:
def q11():
    # Create vocabulary
    vocab = set([w.lower() for w in corpus.words()])
    vocab_train = set([w.lower() for w in corpus_train.words()])
    vocab_test = set([w.lower() for w in corpus_test.words()])

    # Create frequency distribution
    fd = FreqDist([w.lower() for w in corpus.words()])
    fd_train = FreqDist([w.lower() for w in corpus_train.words()])
    fd_test = FreqDist([w.lower() for w in corpus_test.words()])

    # Print vocabulary length
    print("Length of whole dataset: %d" % len(vocab))
    print("Length of train set: %d" % len(vocab_train))
    print("Length of test set: %d" % len(vocab_test))

    # Print the first 20 words for each dict
    print("\nFirst 20 words of whole dataset:")
    print(nbest(fd, 20))
    print("\nFirst 20 words of train set:")
    print(nbest(fd_train, 20))
    print("\nFirst 20 words of test set:")
    print(nbest(fd_test, 20))

q11()

Length of whole dataset: 26869
Length of train set: 21009
Length of test set: 8548

First 20 words of whole dataset:
{'the': 12310, ',': 10876, '.': 10874, 'of': 5502, 'in': 5405, 'to': 5129, 'a': 4731, '(': 4226, ')': 4225, 'and': 4223, '"': 3239, 'on': 3115, 'said': 2694, "'s": 2339, 'for': 2109, '-': 1866, '1': 1845, 'at': 1679, 'was': 1593, '2': 1342}

First 20 words of train set:
{'the': 8390, '.': 7374, ',': 7290, 'of': 3815, 'in': 3621, 'to': 3424, 'a': 3199, 'and': 2872, '(': 2861, ')': 2861, '"': 2178, 'on': 2092, 'said': 1849, "'s": 1566, 'for': 1465, '1': 1421, '-': 1243, 'at': 1146, 'was': 1095, '2': 973}

First 20 words of test set:
{'the': 1765, ',': 1637, '.': 1626, 'to': 805, 'of': 789, 'in': 761, '(': 686, ')': 684, 'a': 658, 'and': 598, 'on': 467, '"': 421, 'said': 399, "'s": 347, '-': 287, 'for': 286, 'at': 251, 'was': 224, '4': 201, 'with': 185}


### Q 1.2
- Obtain the list of:
    1. Out-Of-Vocabulary (OOV) tokens
    2. Overlapping tokens between train and test sets  

In [127]:
def q12(cutoff=1):


    test_lower = [w.lower() for w in corpus_test.words()]
    val_lower = [w.lower() for w in corpus_val.words()]
    # Get vocabs
    vocab_train = Vocabulary([w.lower() for w in corpus_train.words()], unk_cutoff=cutoff)
    vocab_test = Vocabulary(test_lower, unk_cutoff=cutoff)
    vocab_valid = Vocabulary(val_lower, unk_cutoff=cutoff)
    vocab_tv = Vocabulary([*test_lower, *val_lower], unk_cutoff=cutoff)

    # Get list of tokens
    tokens_train = set(vocab_train.counts.keys())
    tokens_test = set(vocab_test.counts.keys())
    tokens_val = set(vocab_valid.counts.keys())
    tokens_tv = set(vocab_tv.counts.keys())

    # Get OOV 
    oov_test = tokens_test.difference(tokens_train)
    oov_valid = tokens_val.difference(tokens_train)
    oov_tv = tokens_val.difference(tokens_tv)
    print("[Q1.2.1]\n>\tOOV tokens:")
    print(">\t (test) Found {} OOV".format(len(oov_test)))
    print(">\t (valid) Found {} OOV".format(len(oov_valid)))
    print(">\t (test + valid) Found {} OOV".format(len(oov_tv)))

    print()

    # Get overlapping tokens w/ test set
    intersection_test = tokens_train.intersection(tokens_test)
    intersection_val = tokens_train.intersection(tokens_val)
    intersection_tv = tokens_train.intersection(tokens_tv)
    print("[Q1.2.1]\n>\tOverlapping tokens:")
    print(">\t (test) Found {} overlapping tokens".format(len(intersection_test)))
    print(">\t (valid) Found {} overlapping tokens".format(len(intersection_val)))
    print(">\t (test + val) Found {} overlapping tokens".format(len(intersection_tv)))
    

q12()

[Q1.2.1]
>	OOV tokens:
>	 (test) Found 3268 OOV
>	 (valid) Found 2856 OOV
>	 (test + valid) Found 0 OOV

[Q1.2.1]
>	Overlapping tokens:
>	 (test) Found 5280 overlapping tokens
>	 (valid) Found 6146 overlapping tokens
>	 (test + val) Found 8066 overlapping tokens


### Q 1.3
- Perform a complete data analysis of the whole dataset (train + test sets) to obtain:
    1. Average sentence length computed in number of tokens
    2. The 50 most-common tokens
    3. Number of sentences

In [18]:
def q13():

    # Get average sentence length
    print("[Q1.3.1]\n>\tAverage sentence length in tokens: {:.4f}\n".format(len(corpus.words())/len(corpus_sents)))

    # Get 50 most common tokens
    vocab = Vocabulary([w.lower() for w in corpus.words()])
    most_common_tokens = nbest(vocab.counts, 50)
    print("[Q1.3.2]\n>\t50 most common tokens:")
    # print(">\t", most_common_tokens)
    count = 1
    for key in most_common_tokens:
        print(">\t[{}] {}: {}".format(count, key, most_common_tokens[key]))
        count += 1

    # Get number of sentences
    print("\n[Q1.3.3]\n>\tNumber of sentences: %d" % len(corpus_sents))

q13()

[Q1.3.1]
>	Average sentence length in tokens: 14.5304

[Q1.3.2]
>	50 most common tokens:
>	[1] the: 12310
>	[2] ,: 10876
>	[3] .: 10874
>	[4] of: 5502
>	[5] in: 5405
>	[6] to: 5129
>	[7] a: 4731
>	[8] (: 4226
>	[9] ): 4225
>	[10] and: 4223
>	[11] ": 3239
>	[12] on: 3115
>	[13] said: 2694
>	[14] 's: 2339
>	[15] for: 2109
>	[16] -: 1866
>	[17] 1: 1845
>	[18] at: 1679
>	[19] was: 1593
>	[20] 2: 1342
>	[21] with: 1267
>	[22] 3: 1264
>	[23] 0: 1232
>	[24] that: 1212
>	[25] he: 1166
>	[26] from: 1146
>	[27] by: 1113
>	[28] it: 1082
>	[29] :: 1057
>	[30] is: 984
>	[31] 4: 973
>	[32] as: 920
>	[33] his: 867
>	[34] had: 841
>	[35] were: 804
>	[36] an: 796
>	[37] but: 786
>	[38] not: 786
>	[39] after: 780
>	[40] has: 768
>	[41] be: 754
>	[42] have: 738
>	[43] new: 656
>	[44] first: 645
>	[45] who: 643
>	[46] 5: 636
>	[47] will: 591
>	[48] 6: 584
>	[49] two: 579
>	[50] they: 567

[Q1.3.3]
>	Number of sentences: 20744


### Q 1.4
- Create the dictionary of Named Entities and their Frequencies for the:
    1. Whole dataset
    2. Train set
    3. Test set

In [13]:
def q14():
    WORD, _, NE = range(3)

    def merge_iob_tags(doc):
        idx = 0
        merged_ne = list()
        for idx in range(len(doc)):
            if doc[idx][NE].split('-')[0] == "B":
                temp = str(doc[idx][WORD])
                idx += 1
                while idx < len(doc) and doc[idx][NE].split('-')[0] == "I":
                    temp += " %s" % str(doc[idx][WORD])
                    idx += 1
                merged_ne.append(temp)
        return merged_ne

    # Whole dataset
    iob_all = [(w[WORD], _, w[NE]) for w in corpus.iob_words() if w[NE] != 'O']
    ne_all = merge_iob_tags(iob_all)
    fd_all = FreqDist(ne_all)
    print("[Q1.4.1]\n>\tFrequency dist of Named Entities for the whole dataset\n>\t", nbest(fd_all, 20))

    # Train set
    iob_train = [(w[WORD], _, w[NE]) for w in corpus_train.iob_words() if w[NE] != 'O']
    ne_train = merge_iob_tags(iob_train)
    fd_train = FreqDist(ne_train)
    print("[Q1.4.2]\n>\tFrequency dist of Named Entities for the training set\n>\t", nbest(fd_train, 20))

    # Test set
    iob_test = [(w[WORD], _, w[NE]) for w in corpus_test.iob_words() if w[NE] != 'O']
    ne_test = merge_iob_tags(iob_test)
    fd_test = FreqDist(ne_test)
    print("[Q1.4.3]\n>\tFrequency dist of Named Entities for the test set\n>\t", nbest(fd_test, 20))

q14()

[Q1.4.1]
>	Frequency dist of Named Entities for the whole dataset
>	 {'U.S.': 460, 'Germany': 237, 'Australia': 204, 'France': 199, 'England': 176, 'Russia': 167, 'Britain': 165, 'Italy': 160, 'China': 149, 'LONDON': 147, 'Spain': 145, 'NEW YORK': 143, 'Japan': 133, 'Russian': 120, 'German': 114, 'Reuters': 114, 'Israel': 108, 'Sweden': 108, 'Pakistan': 103, 'Iraq': 98}
[Q1.4.2]
>	Frequency dist of Named Entities for the training set
>	 {'U.S.': 303, 'Germany': 141, 'Britain': 133, 'Australia': 130, 'England': 123, 'France': 122, 'Spain': 110, 'Italy': 98, 'NEW YORK': 95, 'LONDON': 93, 'Russian': 92, 'China': 91, 'Russia': 88, 'Japan': 87, 'Pakistan': 85, 'Sweden': 81, 'German': 80, 'British': 73, 'Reuters': 73, 'Belgium': 71}
[Q1.4.3]
>	Frequency dist of Named Entities for the test set
>	 {'Germany': 49, 'U.S.': 45, 'Australia': 45, 'Japan': 41, 'Italy': 41, 'France': 40, 'World Cup': 34, 'Russia': 34, 'Indonesia': 33, 'China': 32, 'LONDON': 31, 'Austria': 29, 'Barcelona': 24, 'Canada

## Task 2: Working with Dependecy Tree
*Suggestions: use Spacy pipeline to retreive the Dependecy Tree*


### Q 2.1
- Given each sentence in the dataset, write the required functions to provide:
    1. Subject, obects (direct and indirect)
    2. Noun chunks
    3. The head noun in each noun chunk
    
**Attention**: *print only the results of these functions by using the sentence "I saw the man with a telescope"*

In [20]:
def q21(corpus):

    def get_subj_obj_dict(doc):
        deps_dict = dict()
        deps = ['nsubj', 'dobj', 'pobj']
        for dep in deps:
            deps_dict[dep] = list()
        for token in doc:
            if token.dep_ in deps:
                deps_dict[token.dep_].append(token.text)
        return deps_dict
                    
    def get_noun_chunks(doc):
        return doc.noun_chunks

    def get_head_of_chunk(doc):
        return [(c.root.text, c.text) for c in doc.noun_chunks]


    def q211(doc):
        print("[Q2.1.1]\n>\tProviding subjects and objects:")
        deps_dict = get_subj_obj_dict(doc)
        for key in deps_dict:
            print(">\t {}: {}".format(key, deps_dict[key]))
        print()


    def q212(doc):
        print("[Q2.1.2]\n>\tProviding noun chunks:")
        noun_chunks = get_noun_chunks(doc)
        for chunk in noun_chunks:
            print(">\t", chunk)
        print()

    def q213(doc):
        print("[Q2.1.3]\n>\tProviding head noun for each noun chunk:")
        print(">\t'CHUNK' -> HEAD\n>")
        heads = get_head_of_chunk(doc)
        for head, chunk in heads:
            print(">\t'{}' -> {}".format(chunk, head))
        print()


    sents = get_flat_sents(corpus)
    for sent in sents[:10]:
        doc = nlp(sent)
        print (sent, "\n")
        q211(doc)
        q212(doc)
        q213(doc)

    # doc = nlp(sents[14])
    # print (sents[14], "\n")
    # q211(doc)
    # q212(doc)
    # q213(doc)


q21(["I saw the man with a telescope".split(" ")])
# q21(corpus_sents)

I saw the man with a telescope 

[Q2.1.1]
>	Providing subjects and objects:
>	 nsubj: ['I']
>	 dobj: ['man']
>	 pobj: ['telescope']

[Q2.1.2]
>	Providing noun chunks:
>	 I
>	 the man
>	 a telescope

[Q2.1.3]
>	Providing head noun for each noun chunk:
>	'CHUNK' -> HEAD
>
>	'I' -> I
>	'the man' -> man
>	'a telescope' -> telescope



### Q 2.2
- Given a dependecy tree of a sentence and a segment of that sentence write the required functions that ouput the dependency subtree of that segment.

**Attention**: *print only the results of these functions by using the sentence "I saw the man with a telescope" (the segment could be any e.g. "saw the man", "a telescope", etc.)*

In [24]:
def q22(corpus):

    def get_root(doc):
        for token in doc:
            if token.dep_ == 'ROOT':
                return token.text 
            else:
                continue

    def get_subtree(chunk_text, doc):
        subtree = None
        chunk_doc = nlp(chunk_text)
        chunk_root = get_root(chunk_doc)
        for token in doc:
            if token.text == chunk_root:
                leftmost = list(token.subtree)[0].i
                rightmost = list(token.subtree)[-1].i
                subtree = doc[leftmost:rightmost+1]
        return subtree


    sents = get_flat_sents(corpus)
    # sents = [sents[14]]
    for sent in sents:
        print(sent)
        doc = nlp(sent)
        spacy.displacy.render(doc, style="dep")
        for chunk in doc.noun_chunks:
            # subtree = get_subtree(chunk.text, doc)
            # print("Chunk: {}\n>\tSubtree: {}".format(chunk, chunk))
            print("Chunk: {}".format(chunk))
            spacy.displacy.render(chunk, style="dep")
    
q22([
    "I saw the man with the telescope".split(" "),
    # "I saw the man with a telescope".split(" ")
])
# q22(corpus_sents)

I saw the man with the telescope


Chunk: I


Chunk: the man


Chunk: the telescope


### Q 2.3
- Given a token in a sentence, write the required functions that output the dependency path from the root of the dependency tree to that given token.

**Attention**: *print only the results of these functions by using the sentence "I saw the man with a telescope"*

In [36]:
def q23(corpus: list, cut=5):
        # print()
    def compute_dependency_path(token):
        path = list()
        path.append(token.text)
        while token.dep_ != 'ROOT':
            token = token.head
            path.append(token.text)
        path.reverse()
        return path

    if len(corpus) >= cut:
        corpus = corpus[:cut]
    sents = get_flat_sents(corpus)
    for sent in sents:
        print(sent)
        doc = nlp(sent)
        print("TOKEN ---> ['path', 'to', 'token']\n")
        for token in doc:
            print("{}\n\t---> {}".format(token.text, compute_dependency_path(token)))

    spacy.displacy.render(doc, style="dep")

q23(["I saw the man with a telescope".split(" ")], 10)
# q23(corpus_sents)

I saw the man with a telescope
TOKEN ---> ['path', 'to', 'token']

I
	---> ['saw', 'I']
saw
	---> ['saw']
the
	---> ['saw', 'man', 'the']
man
	---> ['saw', 'man']
with
	---> ['saw', 'with']
a
	---> ['saw', 'with', 'telescope', 'a']
telescope
	---> ['saw', 'with', 'telescope']


## Task 3: Named Entity Recognition
*Suggestion: use scikit-learn metric functions. See classification_report*

### Q 3.1
- Benchmark Spacy Named Entity Recognition model on the test set by:
    1. Providing the list of categories in the dataset (person, organization, etc.)
    2. Computing the overall accuracy on NER
    3. Computing the performance of the Named Entity Recognition model for each category:
        - Compute the perfomance at the token level (eg. B-Person, I-Person, B-Organization, I-Organization, O, etc.)
        - Compute the performance at the entity level (eg. Person, Organization, etc.)

In [39]:
# Getting processed dataset
sents_train = get_flat_sents(corpus_train.sents())
docs_train = [nlp(sent) for sent in sents_train]

In [54]:
sents_test = get_flat_sents(corpus_test.sents())
docs_test = [nlp(sent) for sent in sents_test]

In [24]:
sents_all = get_flat_sents(corpus.sents())
docs_all = [nlp(sent) for sent in sents_all]

#### Q 3.1.1

In [97]:
from itertools import chain

def get_categories(corpus: ConllCorpusReader):
    categories = set()
    for token in chain.from_iterable(corpus._grids()):
        ent_type = token[3]
        if ent_type != 'O':
            categories.add(ent_type.split('-')[1])
    return list(categories)

def get_categories_spacy(docs):
    categories = set()
    for doc in docs:
        # doc = nlp(sent)
        for token in doc:
            categories.add(token.ent_type_)
        if '' in categories:
            categories.discard('')
    return list(categories)

def q311(corpus: ConllCorpusReader, docs: spacy):
    print("[Q3.1.1]\n>\tProviding list of categories in the dataset:")
    print(">\t Original dataset:", end="\n>\t\t")
    categories = get_categories(corpus)
    for category in categories:
        print(" {}".format(category), end="")
    print("\n>\t After Spacy processing:", end="\n>\t\t")
    categories = get_categories_spacy(docs)
    for category in categories:
        print(" {}".format(category), end="")
    print()

In [98]:
q311(corpus_test, docs_test)

[Q3.1.1]
>	Providing list of categories in the dataset:
>	 Original dataset:
>		 PER MISC LOC ORG
>	 After Spacy processing:
>		 QUANTITY LOC DATE EVENT PERSON LAW WORK_OF_ART PRODUCT ORDINAL CARDINAL NORP ORG GPE FAC PERCENT MONEY LANGUAGE TIME


#### Q 3.1.2

In [130]:
from itertools import chain

def map_spacy_ents(old_spacy_ner):
    allowed = ["LOC", "ORG", "O"]
    mapping_dict = {
        "GPE": "LOC",
        "PERSON": "PER",
        "EVENT": "MISC",
        "NORP": "MISC",
    }
    spacy_ner = list()
    for sent in old_spacy_ner:
        sent_temp = list()
        for token in sent:
            is_otag = "-" not in token[1]
            ent_label = token[1].split("-")[1] if not is_otag else token[1]
            if ent_label not in allowed:
                if ent_label in mapping_dict.keys():
                    ent_label = mapping_dict[ent_label]
                else:
                    ent_label = "O"
                    is_otag = True
            if is_otag:
                sent_temp.append((token[0], ent_label))
            else:
                sent_temp.append((token[0], f"{token[1][:2]}{ent_label}"))
        spacy_ner.append(sent_temp)
    return spacy_ner

def convert_to_sk_cl_report(tuples_list):
    return [t[1] for t in tuples_list if t != []]

def q312(docs, corpus):
    WORD, _, NE = range(3)
    gt = list()
    for s in corpus.iob_sents():
        gt.append([(w[WORD], w[NE]) for w in s])
    spacy_ner = list()
    for doc in docs:
        current_sent = list()
        for token in doc:
            ent_type = token.ent_iob_
            if token.ent_type_ != '':
                ent_type += f"-{token.ent_type_}"
            current_sent.append((token.text, ent_type))
        spacy_ner.append(current_sent)


    print("[Q3.1.2]\n>\tProviding overall accuracy:", end="\n>\n")
    results_raw = classification_report(
        convert_to_sk_cl_report(chain.from_iterable(gt)), 
        convert_to_sk_cl_report(chain.from_iterable(spacy_ner)),
        zero_division=1, output_dict=True)
    print(">\t Raw results (no mapping): \n>\t\t%.4f" % results_raw["accuracy"])
    # results = evaluate(gt, spacy_ner)
    # pd_tbl = pd.DataFrame().from_dict(results, orient='index')
    # pd_tbl.round(decimals=3)
    # print(pd_tbl)

    results_mapping = classification_report(
        convert_to_sk_cl_report(chain.from_iterable(gt)), 
        convert_to_sk_cl_report(chain.from_iterable(map_spacy_ents(spacy_ner))),
        output_dict=True)
    print(">\t Mapping spacy entity labels to grount truth: \n>\t\t%.4f" % results_mapping["accuracy"])
    # # results = evaluate(gt, map_spacy_ents(spacy_ner))
    # # pd_tbl = pd.DataFrame().from_dict(results, orient='index')
    # # pd_tbl.round(decimals=3)
    # # print(pd_tbl)

In [131]:
q312(docs_test, corpus_test)

[Q3.1.2]
>	Providing overall accuracy:
>
>	 Raw results (no mapping): 
>		0.7114
>	 Mapping spacy entity labels to grount truth: 
>		0.9061


#### Q 3.1.3

In [35]:
def q313(docs):
    pass

In [36]:
q313(docs_train)

## Task 4: BONUS PART (extra mark for laude)

### Save old parser configuration

In [49]:
from nltk.parse.transitionparser import Configuration, TransitionParser
old_extract_features = Configuration.extract_features
old_train = TransitionParser.train

### Retrieve treebank dataset

In [50]:
import types
from nltk import download
download('dependency_treebank')
from nltk.corpus import dependency_treebank
from nltk.parse import DependencyEvaluator

# split the dataset into train and test
# first 100 as train dataset and last 10 as test dataset
train_dataset = dependency_treebank.parsed_sents()[:100]
test_dataset =  dependency_treebank.parsed_sents()[-10:]

[nltk_data] Downloading package dependency_treebank to
[nltk_data]     /home/pips/nltk_data...
[nltk_data]   Package dependency_treebank is already up-to-date!


### Compute performance of the original parser

In [53]:
Configuration.extract_features = old_extract_features
tp = TransitionParser('arc-standard')
tp.train(train_dataset, 'tp.model', verbose=False)

# parsing takes a list of dependency graphs and a model as arguments
parses = tp.parse(test_dataset, 'tp.model')

# evaluating the parser
de = DependencyEvaluator(parses, test_dataset)
las, uas = de.eval()

# no labels, thus identical
print("original labelled attachment score ",las)
print("original unlabelled attachment score",uas)

 Number of training examples : 100
 Number of valid (projective) examples : 100
original labelled attachment score  0.7791666666666667
original unlabelled attachment score 0.7791666666666667


### Q 4.1
- Modify NLTK Transition parser's Configuration calss to use better features.

In [5]:
from nltk.parse import DependencyGraph
def new_extract_features(self):
    """
    Extract the set of features for the current configuration. Implement standard features as describe in
    Table 3.2 (page 31) in Dependency Parsing book by Sandra Kubler, Ryan McDonal, Joakim Nivre.
    Please note that these features are very basic.
    :return: list(str)
    """
    result = []
    # print("n", end=" ")
    if len(self.stack) > 0:
        # Stack 0
        stack_idx0 = self.stack[len(self.stack) - 1]
        token = self._tokens[stack_idx0]
        # print (token)
        if self._check_informative(token["word"], True):
            result.append("STK_0_FORM_" + token["word"])
        if "lemma" in token and self._check_informative(token["lemma"]):
            result.append("STK_0_LEMMA_" + token["lemma"])
        if self._check_informative(token["tag"]):
            result.append("STK_0_POS_" + token["tag"])
        if self._check_informative(token["ctag"], True):
            result.append("STK_0_CTAG_" + token["ctag"])
        if self._check_informative(token["head"], True):
            result.append("STK_0_HEAD_" + str(token["head"]))
        if self._check_informative(token["rel"]):
            result.append("BUF_0_REL_" + token["rel"])
        if "feats" in token and self._check_informative(token["feats"]):
            feats = token["feats"].split("|")
            for feat in feats:
                result.append("STK_0_FEATS_" + feat)
        # Stack 1
        if len(self.stack) > 1:
            stack_idx1 = self.stack[len(self.stack) - 2]
            token = self._tokens[stack_idx1]
            if self._check_informative(token["tag"]):
                result.append("STK_1_POS_" + token["tag"])

        # Left most, right most dependency of stack[0]
        left_most = 1000000
        right_most = -1
        dep_left_most = ""
        dep_right_most = ""
        for (wi, r, wj) in self.arcs:
            if wi == stack_idx0:
                if (wj > wi) and (wj > right_most):
                    right_most = wj
                    dep_right_most = r
                if (wj < wi) and (wj < left_most):
                    left_most = wj
                    dep_left_most = r
        if self._check_informative(dep_left_most):
            result.append("STK_0_LDEP_" + dep_left_most)
        if self._check_informative(dep_right_most):
            result.append("STK_0_RDEP_" + dep_right_most)

    # Check Buffered 0
    if len(self.buffer) > 0:
        # Buffer 0
        buffer_idx0 = self.buffer[0]
        token = self._tokens[buffer_idx0]
        if self._check_informative(token["word"], True):
            result.append("BUF_0_FORM_" + token["word"])
        if "lemma" in token and self._check_informative(token["lemma"]):
            result.append("BUF_0_LEMMA_" + token["lemma"])
        if self._check_informative(token["tag"]):
            result.append("BUF_0_POS_" + token["tag"])
        if self._check_informative(token["ctag"]):
            result.append("BUF_0_CTAG_" + token["ctag"])
        if self._check_informative(token["head"]):
            result.append("BUF_0_HEAD_" + str(token["head"]))
        if self._check_informative(token["rel"]):
            result.append("BUF_0_REL_" + token["rel"])
        if "feats" in token and self._check_informative(token["feats"]):
            feats = token["feats"].split("|")
            for feat in feats:
                result.append("BUF_0_FEATS_" + feat)
        # Buffer 1
        if len(self.buffer) > 1:
            buffer_idx1 = self.buffer[1]
            token = self._tokens[buffer_idx1]
            ### DISCARDED
            # if self._check_informative(token["word"], True):
            #     result.append("BUF_1_FORM_" + token["word"])
            # if self._check_informative(token["head"]):
            #     result.append("BUF_1_HEAD_" + str(token["head"]))
            if self._check_informative(token["lemma"], True):
                result.append("BUF_1_LEMMA_" + token["lemma"])
            if self._check_informative(token["tag"]):
                result.append("BUF_1_POS_" + token["tag"])
        if len(self.buffer) > 2:
            buffer_idx2 = self.buffer[2]
            token = self._tokens[buffer_idx2]
            if self._check_informative(token["tag"]):
                result.append("BUF_2_POS_" + token["tag"])
        if len(self.buffer) > 3:
            buffer_idx3 = self.buffer[3]
            token = self._tokens[buffer_idx3]
            if self._check_informative(token["tag"]):
                result.append("BUF_3_POS_" + token["tag"])
        if len(self.buffer) > 4:
            buffer_idx4 = self.buffer[4]
            token = self._tokens[buffer_idx4]
            if self._check_informative(token["tag"]):
                result.append("BUF_4_POS_" + token["tag"])
                # Left most, right most dependency of stack[0]
        left_most = 1000000
        right_most = -1
        dep_left_most = ""
        dep_right_most = ""
        for (wi, r, wj) in self.arcs:
            if wi == buffer_idx0:
                if (wj > wi) and (wj > right_most):
                    right_most = wj
                    dep_right_most = r
                if (wj < wi) and (wj < left_most):
                    left_most = wj
                    dep_left_most = r
        if self._check_informative(dep_left_most):
            result.append("BUF_0_LDEP_" + dep_left_most)
        if self._check_informative(dep_right_most):
            result.append("BUF_0_RDEP_" + dep_right_most)

    return result

Configuration.extract_features = new_extract_features


gold_sent = DependencyGraph("""
Economic  JJ     2      ATT
news  NN     3       SBJ
has       VBD       0       ROOT
little      JJ      5       ATT
effect   NN     3       OBJ
on     IN      5       ATT
financial       JJ       8       ATT
markets    NNS      6       PC
.    .      3       PU
""")

# for s in gold_sent.triples():
#     print(s)

conf = Configuration(gold_sent)
# print(conf)
conf.extract_features()

['STK_0_POS_TOP',
 'STK_0_CTAG_TOP',
 'BUF_0_FORM_Economic',
 'BUF_0_LEMMA_Economic',
 'BUF_0_POS_JJ',
 'BUF_0_CTAG_JJ',
 'BUF_0_HEAD_2',
 'BUF_0_REL_ATT',
 'BUF_1_LEMMA_news',
 'BUF_1_POS_NN',
 'BUF_2_POS_VBD',
 'BUF_3_POS_JJ',
 'BUF_4_POS_NN']

### Q 4.2
- Evaluate the features comparing performance to the original.

In [9]:
Configuration.extract_features = new_extract_features
# using the TransitionParser 
tp_new_fe = TransitionParser('arc-standard')
# replacing the train function with the modified one
# tp_new_fe = types.MethodType(train,tp_new_fe)
tp_new_fe.train(train_dataset, 'tp_new_fe.model', verbose=False)
# print(tp)

# parsing takes a list of dependency graphs and a model as arguments
parses_new_fe = tp_new_fe.parse(test_dataset, 'tp_new_fe.model')
# print(len(parses_new_fe))
# print(parses_new_fe[0])

de_new_fe = DependencyEvaluator(parses_new_fe, test_dataset)
las_new_fe, uas_new_fe = de_new_fe.eval()

# no labels, thus identical
print('modified labelled attachment score',las_new_fe)
print('modified unlabelled attachment score',uas_new_fe)


 Number of training examples : 100
 Number of valid (projective) examples : 100
modified labelled attachment score 0.8
modified unlabelled attachment score 0.8


### Q 4.3
- Replace SVM classifier with an alternative of your choice.

In [65]:
import pickle
import tempfile
try:
    from sklearn.linear_model import SGDClassifier
    from sklearn.datasets import load_svmlight_file
except ImportError:
    pass

def sgd_train(self, depgraphs, modelfile, verbose=True):
    """
    :param depgraphs : list of DependencyGraph as the training data
    :type depgraphs : DependencyGraph
    :param modelfile : file name to save the trained model
    :type modelfile : str
    """

    try:
        input_file = tempfile.NamedTemporaryFile(
            prefix="transition_parse.train", dir=tempfile.gettempdir(), delete=False
        )

        if self._algorithm == self.ARC_STANDARD:
            self._create_training_examples_arc_std(depgraphs, input_file)
        else:
            self._create_training_examples_arc_eager(depgraphs, input_file)

        input_file.close()
        # Using the temporary file to train the libsvm classifier
        x_train, y_train = load_svmlight_file(input_file.name)

        model = SGDClassifier(
            loss="log",
            penalty="l2",
            shuffle=True,
            verbose=0,
            learning_rate="optimal"
        )
        model.fit(x_train, y_train)
        # Save the model to file name (as pickle)
        pickle.dump(model, open(modelfile, "wb"))
    finally:
        os.remove(input_file.name)

print("LINEAR CLASSIFIER\n")
tp_sgd_clf = TransitionParser('arc-standard')
tp_sgd_clf.train = types.MethodType(sgd_train,tp_sgd_clf)
tp_sgd_clf.train(train_dataset, 'tp_sgd_clf.model', verbose=False)
parses_sgd_clf = tp_sgd_clf.parse(test_dataset, 'tp_sgd_clf.model')
de_sgd_clf = DependencyEvaluator(parses_sgd_clf, test_dataset)
las_sgd_clf, uas_sgd_clf = de_sgd_clf.eval()
print('modified classifier labelled attachment score',las_sgd_clf)
print('modified classifier unlabelled attachment score',uas_sgd_clf)

LINEAR CLASSIFIER

 Number of training examples : 100
 Number of valid (projective) examples : 100
modified classifier labelled attachment score 0.6833333333333333
modified classifier unlabelled attachment score 0.6833333333333333


In [66]:
import pickle
import tempfile
try:
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.datasets import load_svmlight_file
except ImportError:
    pass

def rf_train(self, depgraphs, modelfile, verbose=True):
    """
    :param depgraphs : list of DependencyGraph as the training data
    :type depgraphs : DependencyGraph
    :param modelfile : file name to save the trained model
    :type modelfile : str
    """

    try:
        input_file = tempfile.NamedTemporaryFile(
            prefix="transition_parse.train", dir=tempfile.gettempdir(), delete=False
        )

        if self._algorithm == self.ARC_STANDARD:
            self._create_training_examples_arc_std(depgraphs, input_file)
        else:
            self._create_training_examples_arc_eager(depgraphs, input_file)

        input_file.close()
        # Using the temporary file to train the libsvm classifier
        x_train, y_train = load_svmlight_file(input_file.name)

        model = RandomForestClassifier(
            n_estimators=100,
            criterion="entropy",
            max_features="log2"
        )
        
        model.fit(x_train, y_train)
        # Save the model to file name (as pickle)
        pickle.dump(model, open(modelfile, "wb"))
    finally:
        os.remove(input_file.name)

print("RANDOM FOREST CLASSIFIER\n")
tp_rf_clf = TransitionParser('arc-standard')
tp_rf_clf.train = types.MethodType(rf_train,tp_rf_clf)
tp_rf_clf.train(train_dataset, 'tp_rf_clf.model', verbose=False)
parses_rf_clf = tp_rf_clf.parse(test_dataset, 'tp_rf_clf.model')
de_rf_clf = DependencyEvaluator(parses_rf_clf, test_dataset)
las_rf_clf, uas_rf_clf = de_rf_clf.eval()
print('modified classifier labelled attachment score',las_rf_clf)
print('modified classifier unlabelled attachment score',uas_rf_clf)

RANDOM FOREST CLASSIFIER

 Number of training examples : 100
 Number of valid (projective) examples : 100
modified classifier labelled attachment score 0.7583333333333333
modified classifier unlabelled attachment score 0.7583333333333333


In [67]:
import pickle
import tempfile
try:
    from sklearn.neural_network import MLPClassifier
    from sklearn.datasets import load_svmlight_file
except ImportError:
    pass

def mlp_train(self, depgraphs, modelfile, verbose=True):
    """
    :param depgraphs : list of DependencyGraph as the training data
    :type depgraphs : DependencyGraph
    :param modelfile : file name to save the trained model
    :type modelfile : str
    """

    try:
        input_file = tempfile.NamedTemporaryFile(
            prefix="transition_parse.train", dir=tempfile.gettempdir(), delete=False
        )

        if self._algorithm == self.ARC_STANDARD:
            self._create_training_examples_arc_std(depgraphs, input_file)
        else:
            self._create_training_examples_arc_eager(depgraphs, input_file)

        input_file.close()
        # Using the temporary file to train the libsvm classifier
        x_train, y_train = load_svmlight_file(input_file.name)
        
        model = MLPClassifier(
            activation="relu", 
            learning_rate="adaptive",
            solver='sgd', 
            nesterovs_momentum=True,
            alpha=1e-5,
            hidden_layer_sizes=(20,50,20), 
            random_state=1
            )


        model.fit(x_train, y_train)
        # Save the model to file name (as pickle)
        pickle.dump(model, open(modelfile, "wb"))
    finally:
        os.remove(input_file.name)

print("MLP CLASSIFIER\n")
tp_mlp_clf = TransitionParser('arc-standard')
tp_mlp_clf.train = types.MethodType(mlp_train,tp_mlp_clf)
tp_mlp_clf.train(train_dataset, 'tp_mlp_clf.model', verbose=False)
parses_mlp_clf = tp_mlp_clf.parse(test_dataset, 'tp_mlp_clf.model')
de_mlp_clf = DependencyEvaluator(parses_mlp_clf, test_dataset)
las_mlp_clf, uas_mlp_clf = de_mlp_clf.eval()
print('modified classifier labelled attachment score',las_mlp_clf)
print('modified classifier unlabelled attachment score',uas_mlp_clf)

MLP CLASSIFIER

 Number of training examples : 100
 Number of valid (projective) examples : 100




modified classifier labelled attachment score 0.7708333333333334
modified classifier unlabelled attachment score 0.7708333333333334
