# Assignment 2

### Glory Odeyemi

#### 6-Feb-2023

### Install libraries

You can skip this step if you already have these libraries installed.

In [1]:
!pip install pytrec-eval-terrier
!pip install nltk



### Import libraries

This step is required: the code in later cells depends on these libraries and will raise an error if they have not been imported.

In [2]:
import nltk
import itertools
from utils.top_k_success import top_k_tokens, success_at_k, average_k
from utils.n_gram_model import tokenize_corpus, train_model, save_model, load_model

### Download Brown corpus

We use the news genre of the Brown corpus to train our n-gram language models in this project.

In [3]:
# Fetch the NLTK data packages this notebook relies on. As the cell output
# shows, a package that is already up to date is not downloaded again.
nltk.download('brown')
# NOTE(review): 'punkt' is presumably required by the tokenizer used inside
# utils.n_gram_model.tokenize_corpus — confirm against that module.
nltk.download('punkt')

[nltk_data] Downloading package brown to /Users/new/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /Users/new/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
from nltk.corpus import brown
# Uncomment to inspect the categories offered by the corpus; only 'news' is used here.
# brown.categories()
# Flat sequence of the word tokens in the 'news' category.
brown_corpus_tokens = brown.words(categories='news')
# Preview the first ten tokens rather than dumping the whole corpus.
print(brown_corpus_tokens[:10])

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of']


In [5]:
# Size of the news-genre token sequence loaded above.
news_token_count = len(brown_corpus_tokens)
print("Total number of tokens in the brown corpus news genre = ", news_token_count)

Total number of tokens in the brown corpus news genre =  100554


In [6]:
# Sentence-level view of the same category: each sentence is a list of token
# strings (see the printed sample below).
brown_corpus_sents = brown.sents(categories='news')
# Preview only the first two sentences.
print(brown_corpus_sents[:2])

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.']]


In [7]:
# Number of sentences in the news-genre sentence list loaded above.
news_sentence_count = len(brown_corpus_sents)
print("Total number of sentences in the brown corpus news genre = ", news_sentence_count)

Total number of sentences in the brown corpus news genre =  4623


### Import Birkbeck corpus

The Birkbeck spelling error corpus was used for this project. You can find it [here](https://ota.bodleian.ox.ac.uk/repository/xmlui/handle/20.500.12024/0643).

Specifically, the [APPLING1DAT.643](https://github.com/gloryodeyemi) file from the Birkbeck spelling error corpus, compiled by Roger Mitton, was used.

In [8]:
# Read the corpus file line by line; every line becomes a list of its
# whitespace-separated fields (rows look like
# [misspelling, correct word, context words ..., with '*' in the context]).
with open('Data/APPLING1DAT.643', 'r') as corpus_file:
    birkbeck_data = [raw_line.split() for raw_line in corpus_file]
birkbeck_data[:10]

[['$Punjabi'],
 ['strang', 'strange', 'I', 'felt', 'very', '*'],
 ['brake', 'break', 'at', '*', 'time'],
 ['brack', 'break', 'when', 'the', '*', 'was', 'finished'],
 ['weanter', 'winter', 'in', 'the', '*', 'when', 'it', 'was', 'snowing'],
 ['gost', 'ghost', 'I', 'thought', 'it', 'was', 'a', '*'],
 ['expect', 'except', 'everything', '*', 'the', 'houses'],
 ['$Tamil'],
 ['steped', 'stepped', 'when', 'I', 'first', '*'],
 ['streagh', 'strange', 'and', 'saw', '*', 'colow', 'people']]

In [9]:
# Clean the corpus: drop every row that contains a '$'-prefixed token
# (marker rows such as ['$Punjabi']), keeping only the misspelling records.
#
# A new list is built instead of calling list.remove() inside a loop over the
# same list: removing while iterating shifts the remaining rows, so the row
# right after each removed one is skipped and consecutive marker rows survive.
# Filtering this way also makes the cell safe to re-run.
birkbeck_data = [
    row for row in birkbeck_data
    if not any(token.startswith('$') for token in row)
]

birkbeck_data[:10]

[['strang', 'strange', 'I', 'felt', 'very', '*'],
 ['brake', 'break', 'at', '*', 'time'],
 ['brack', 'break', 'when', 'the', '*', 'was', 'finished'],
 ['weanter', 'winter', 'in', 'the', '*', 'when', 'it', 'was', 'snowing'],
 ['gost', 'ghost', 'I', 'thought', 'it', 'was', 'a', '*'],
 ['expect', 'except', 'everything', '*', 'the', 'houses'],
 ['steped', 'stepped', 'when', 'I', 'first', '*'],
 ['streagh', 'strange', 'and', 'saw', '*', 'colow', 'people'],
 ['colow', 'coloured', 'and', 'saw', 'streagh', '*', 'people'],
 ['exclation', 'escalator', 'I', 'was', 'on', 'an', '*']]

In [10]:
# Number of misspelling records left after the '$' marker rows were removed.
# (Message corrected: the corpus name is "Birkbeck", not "Birbeck".)
print("Total number of errored words in Birkbeck corpus = ", len(birkbeck_data))

Total number of errored words in Birbeck corpus =  198


### Tokenizing the Brown corpus

The Brown corpus has to be tokenized before we can use it to train our language models.

In [11]:
# Tokenize the news sentences with the project helper. Judging by the printed
# sample, the result is a list of sentences with lower-cased tokens and
# possessives split off (e.g. "Atlanta's" -> 'atlanta', "'s").
tokenized_corpus = tokenize_corpus(brown_corpus_sents)
# Preview the first two tokenized sentences.
print(tokenized_corpus[:2])

[['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', 'atlanta', "'s", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', '``', 'that', 'any', 'irregularities', 'took', 'place', '.'], ['the', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'city', 'executive', 'committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'city', 'of', 'atlanta', '``', 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.']]


### Training the language model

We will train and save n-Gram language models using the tokenized brown corpus for n={1,2,3,5,10}

In [12]:
# n-gram orders to train; later cells iterate over the same list when evaluating.
n_list = [1, 2, 3, 5, 10]

# Train one model per order on the tokenized corpus and save it straight away,
# so the evaluation cells can fetch it again with load_model(n).
for ngram_order in n_list:
    save_model(ngram_order, train_model(ngram_order, tokenized_corpus))

### Getting the top-k list of tokens and success at k

* The top-k list of tokens contains the k most probable tokens retrieved by the language model.
* For every incorrect word in the birkbeck_data corpus, top-k tokens are returned, where k={1,5,10}.
* Success at k (s@k) measures whether the correct spelling of the word in the birkbeck_data corpus happens to be in the top-k (most probable) list of tokens that are retrieved by the language model.

**Sample test:** Two items from the birkbeck_data corpus are used as a test, and the resulting output is shown below.

In [13]:
# Two corpus rows used as a quick smoke test of the evaluation pipeline.
sample_test = birkbeck_data[50:52]

for n in n_list:
    model_loaded = load_model(n)
    # Start with an empty result list for every model. With a single list
    # shared across the loop, success_at_k for this model would also be fed
    # the results produced by every previously evaluated model.
    top_k_result = []
    print("--------------")
    print(f"{n}-gram model: \n--------------")
    # enumerate gives the 1-based row label directly; list.index() would scan
    # the list on every row and return the wrong label for duplicate rows.
    for row_number, data_row in enumerate(sample_test, start=1):
        res = top_k_tokens(data_row, model_loaded, tokenized_corpus)
        print(f"Top-k probability: {row_number}", res)
        print("")
        top_k_result.append(res)

    # Success at k for the current model only.
    success = success_at_k(top_k_result)
    print("Success at k: ", success)
    print("")

--------------
1-gram model: 
--------------
Top-k probability: 1 {'correct_word': 'hammer', 'incorrect_word': 'hamer', 'top_1': {'the': 0}, 'top_5': {'the': 0, 'fulton': 0, 'county': 0, 'grand': 0, 'jury': 0}, 'top_10': {'the': 0, 'fulton': 0, 'county': 0, 'grand': 0, 'jury': 0, 'said': 0, 'friday': 0, 'an': 0, 'investigation': 0, 'of': 0}}

Top-k probability: 2 {'correct_word': 'might', 'incorrect_word': 'mite', 'top_1': {'the': 0}, 'top_5': {'the': 0, 'fulton': 0, 'county': 0, 'grand': 0, 'jury': 0}, 'top_10': {'the': 0, 'fulton': 0, 'county': 0, 'grand': 0, 'jury': 0, 'said': 0, 'friday': 0, 'an': 0, 'investigation': 0, 'of': 0}}

Success at k:  {'hamer': {'success_at_1': 0, 'success_at_5': 0, 'success_at_10': 0}, 'mite': {'success_at_1': 0, 'success_at_5': 0, 'success_at_10': 0}}

--------------
2-gram model: 
--------------
Top-k probability: 1 {'correct_word': 'hammer', 'incorrect_word': 'hamer', 'top_1': {'the': 0}, 'top_5': {'the': 0, 'fulton': 0, 'county': 0, 'grand': 0, 'jur

### Evaluating all incorrect tokens in our Birkbeck corpus and getting the average success at k for n={1,2,3,5,10}

In [14]:
for n in n_list:
    model_loaded = load_model(n)
    # Reset the results for every model so that the average reported below
    # reflects this n-gram order only; a list shared across the loop would
    # mix in the results of every previously evaluated model.
    top_k_result = []
    print("--------------")
    print(f"{n}-gram model: \n--------------")
    # Collect the top-k candidates for every misspelling record in the corpus.
    for data_row in birkbeck_data:
        res = top_k_tokens(data_row, model_loaded, tokenized_corpus)
        top_k_result.append(res)

    # Per-word success at k, then the average over the evaluated words.
    success = success_at_k(top_k_result)
    avg = average_k(success)
    print("Average success at k: ", avg)

--------------
1-gram model: 
--------------
Average success at k:  {'success_at_1': 0.0, 'success_at_5': 0.0, 'success_at_10': 0.0}
--------------
2-gram model: 
--------------
Average success at k:  {'success_at_1': 0.0, 'success_at_5': 0.005235602094240838, 'success_at_10': 0.010471204188481676}
--------------
3-gram model: 
--------------
Average success at k:  {'success_at_1': 0.010471204188481676, 'success_at_5': 0.020942408376963352, 'success_at_10': 0.02617801047120419}
--------------
5-gram model: 
--------------
Average success at k:  {'success_at_1': 0.010471204188481676, 'success_at_5': 0.020942408376963352, 'success_at_10': 0.02617801047120419}
--------------
10-gram model: 
--------------
Average success at k:  {'success_at_1': 0.010471204188481676, 'success_at_5': 0.020942408376963352, 'success_at_10': 0.02617801047120419}
