#### **Student Name**: Dylan Govender
#### ***COMP316 - Natural Language Processing***
#### ***Assignment II***

***Installing important libraries and modules:***

In [1]:
!pip install nltk
!pip install dill

import nltk

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

Collecting dill
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dill
Successfully installed dill-0.3.7


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

***Getting n-gram size and user input:***

In [2]:
from nltk.util import ngrams
from nltk.util import pad_sequence

from nltk.tokenize import word_tokenize, sent_tokenize

from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten
from nltk.lm.preprocessing import padded_everygram_pipeline

#user input text
text = input("User input: ")

#user specify n-gram size
n = int(input("\nN = "))

#removing start tag and end tag because we can use padding and tokenizing
if (('<s>' in text.lower()) or ('</s>' in text.lower())):
    text = text.replace('<s>', '')
    text = text.replace('</s>', '')

#tokenizing the text
tokenized_text = [list(map(str, word_tokenize(sent))) for sent in sent_tokenize(text)]

print("\nUser's tokenized text: ")
print(tokenized_text[0])

#padding text by adding start tag and end tag
padded_text = list(pad_sequence(tokenized_text[0], pad_left=True, left_pad_symbol="<s>", pad_right=True, right_pad_symbol="</s>", n=n))

print("\nPadding text: ")
print(padded_text)

print("\nPadding text into sequence:")
print(list(ngrams(padded_text, n=n)))

print("\nFlattening text:")
flattened_text = list(flatten(pad_both_ends(sent, n=n) for sent in tokenized_text))
print(flattened_text)

print("\nN-Gram(s):")
n_grams = ngrams(flattened_text, n)
for n_gram in n_grams:
    print(n_gram)

#EXAMPLE:
#USER INPUT: Language users never choose words randomly, and language is essentially non-random.
#USER INPUT: <s> language is never random </s>

#Program can work with or without specifying start tag '<s>' or end tag '</s>'

User input: Language users never choose words randomly, and language is essentially non-random.

N = 3

User's tokenized text: 
['Language', 'users', 'never', 'choose', 'words', 'randomly', ',', 'and', 'language', 'is', 'essentially', 'non-random', '.']

Padding text: 
['<s>', '<s>', 'Language', 'users', 'never', 'choose', 'words', 'randomly', ',', 'and', 'language', 'is', 'essentially', 'non-random', '.', '</s>', '</s>']

Padding text into sequence:
[('<s>', '<s>', 'Language'), ('<s>', 'Language', 'users'), ('Language', 'users', 'never'), ('users', 'never', 'choose'), ('never', 'choose', 'words'), ('choose', 'words', 'randomly'), ('words', 'randomly', ','), ('randomly', ',', 'and'), (',', 'and', 'language'), ('and', 'language', 'is'), ('language', 'is', 'essentially'), ('is', 'essentially', 'non-random'), ('essentially', 'non-random', '.'), ('non-random', '.', '</s>'), ('.', '</s>', '</s>')]

Flattening text:
['<s>', '<s>', 'Language', 'users', 'never', 'choose', 'words', 'randomly', 

***Initialising a corpus:***

In [3]:
import os
import io
import requests

#Text version: https://kilgarriff.co.uk/Publications/2005-K-lineer.pdf
#Example from website: https://www.kaggle.com/code/alvations/n-gram-language-model-with-nltk

if os.path.isfile('language-never-random.txt'):
    with io.open('language-never-random.txt', encoding='utf8') as fin:
        corpus = fin.read();
else:
    url = "https://gist.githubusercontent.com/alvations/53b01e4076573fea47c6057120bb017a/raw/b01ff96a5f76848450e648f35da6497ca9454e4a/language-never-random.txt"
    corpus = requests.get(url).content.decode('utf8')
    with io.open('language-never-random.txt', 'w', encoding='utf8') as fout:
        fout.write(corpus)

print("Part of corpus:")
print(corpus[:979])

Part of corpus:
                       Language is never, ever, ever, random

                                                               ADAM KILGARRIFF




Abstract
Language users never choose words randomly, and language is essentially
non-random. Statistical hypothesis testing uses a null hypothesis, which
posits randomness. Hence, when we look at linguistic phenomena in cor-
pora, the null hypothesis will never be true. Moreover, where there is enough
data, we shall (almost) always be able to establish that it is not true. In
corpus studies, we frequently do have enough data, so the fact that a rela-
tion between two phenomena is demonstrably non-random, does not sup-
port the inference that it is not arbitrary. We present experimental evidence
of how arbitrary associations between word frequencies and corpora are
systematically non-random. We review literature in which hypothesis test-
ing has been used, and show how it has often led to unhelpful or mislead-
ing results.



***Processing the data from the corpus:***

In [4]:
from nltk.util import ngrams
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.lm.preprocessing import padded_everygram_pipeline

#tokenizing the corpus
tokenized_corpus = [list(map(str.lower, word_tokenize(sent))) for sent in sent_tokenize(corpus)]

#display tokenized corpus first sentence
print("\nTokenized corpus: ")
print(tokenized_corpus[0])

#pipeline the corpus
print("\nPipelining corpus:")
training, padding = padded_everygram_pipeline(n, tokenized_corpus)

#display pipeline(s)
count = 0;
for ngram in training:
    count = count + 1
    if (count <= 10):
        print(list(ngram))

#display number of pipeline sequence(s)
print("\n"  + str(count) + " total pipelined sequence(s).") #Too many to display - will just output some of the pipeline sequence(s)

print("\nCorpus vocabulary:")
print(list(padding))


Tokenized corpus: 
['language', 'is', 'never', ',', 'ever', ',', 'ever', ',', 'random', 'adam', 'kilgarriff', 'abstract', 'language', 'users', 'never', 'choose', 'words', 'randomly', ',', 'and', 'language', 'is', 'essentially', 'non-random', '.']

Pipelining corpus:
[('<s>',), ('<s>', '<s>'), ('<s>', '<s>', 'language'), ('<s>',), ('<s>', 'language'), ('<s>', 'language', 'is'), ('language',), ('language', 'is'), ('language', 'is', 'never'), ('is',), ('is', 'never'), ('is', 'never', ','), ('never',), ('never', ','), ('never', ',', 'ever'), (',',), (',', 'ever'), (',', 'ever', ','), ('ever',), ('ever', ','), ('ever', ',', 'ever'), (',',), (',', 'ever'), (',', 'ever', ','), ('ever',), ('ever', ','), ('ever', ',', 'random'), (',',), (',', 'random'), (',', 'random', 'adam'), ('random',), ('random', 'adam'), ('random', 'adam', 'kilgarriff'), ('adam',), ('adam', 'kilgarriff'), ('adam', 'kilgarriff', 'abstract'), ('kilgarriff',), ('kilgarriff', 'abstract'), ('kilgarriff', 'abstract', 'language

***Training the corpus:***

In [5]:
from nltk.lm import MLE

trained, padded = padded_everygram_pipeline(n, tokenized_corpus)

#using Maximum Likelihood Estimater(MLE) to train an n-gram model
#with a user-defined n-gram size
model = MLE(n)
model.fit(trained, padded)

#check if corpus is trained
print(model.counts)
print(model.vocab)
print(model.vocab.lookup(tokenized_corpus[0]))

#testing whether a particular word is in the corpus
#automatically replace words not in the vocabulary with '<UNK>'
print("\nTesting if a word is in the corpus: ")
word = 'lah'
sentence = 'language is never random ' + word + ' .'
print(model.vocab.lookup(sentence.split()))

<NgramCounter with 3 ngram orders and 19611 ngrams>
<Vocabulary with cutoff=1 unk_label='<UNK>' and 1391 items>
('language', 'is', 'never', ',', 'ever', ',', 'ever', ',', 'random', 'adam', 'kilgarriff', 'abstract', 'language', 'users', 'never', 'choose', 'words', 'randomly', ',', 'and', 'language', 'is', 'essentially', 'non-random', '.')

Testing if a word is in the corpus: 
('language', 'is', 'never', 'random', '<UNK>', '.')


***Using the trained model and computing a matrix of probabilities:***

In [9]:
import numpy as np
from tabulate import tabulate
from tkinter.constants import Y

n = n
n_grams = list(zip(ngrams(flattened_text, n)))

#calculate the probability
def probability(of, given):
    p = round(float(model.score(of, given.split())), 4)
    return p

#for convention to look neat
def display(string, from_conv, to_conv, x, y):
    space = from_conv
    if (n >= 3):
        space = to_conv
    if (x < y):
        return string + space
    return string

#for a matrix to get it's transpose - alternatively np.transpose(matrix)
def transpose(m):
    return [[m[j][i] for j in range(len(m))] for i in range(len(m[0]))]

stringy = []

#build the language model using a matrix
def table(change):
    matrix = []
    probabilities = []
    rows, columns = (len(n_grams) - 1, len(n_grams) - 1)

    for i in range(rows):
        given = ''
        column = []
        clone = []
        of = n_grams[i][0][change]

        for k in range(change):
            given = display(given + n_grams[i][0][k], '', ' ', k, change - 1)
        column.append(given)

        for j in range(columns):
            if (i < 1):
                column.append(n_grams[j + 1][0][change])
            else:
                if (j > 0):
                    given = ''
                    for y in range(change):
                        given = display(given + n_grams[j][0][y], '', ' ', y, change - 1)
                    stringy.append('P(\'' + of + '\'|\'' + given + '\') = ' + str(probability(of, given)))
                    column.append(str(probability(of, given)))
                    clone.append(str(probability(of, given)))
        stringy.append("")
        matrix.append(column)
        probabilities.append(clone)

    print("\nLanguage Model:")
    matrix[0][0] = ''
    matrix = np.array(matrix, dtype=object)
    probabilities = np.array(probabilities, dtype=object)
    probabilities = np.delete(probabilities, 0)
    probabilities = transpose(probabilities)

    for row in range(rows):
        for col in range(columns):
            if (row > 0 and col > 0):
                matrix[row][col] = probabilities[row-1][col-1]

    print(tabulate(matrix, headers='firstrow', tablefmt='fancy_grid'))

In [10]:
print("Probability calculation(s) and checking: ")
table(n - 1)

Probability calculation(s) and checking: 

Language Model:
╒════════════════════════╤═════════╤═════════╤══════════╤═════════╤════════════╤═════╤═══════╤════════════╤══════╤═══════════════╤══════════════╤═════╤════════╕
│                        │   users │   never │   choose │   words │   randomly │   , │   and │   language │   is │   essentially │   non-random │   . │   </s> │
╞════════════════════════╪═════════╪═════════╪══════════╪═════════╪════════════╪═════╪═══════╪════════════╪══════╪═══════════════╪══════════════╪═════╪════════╡
│ <s> Language           │       0 │  0      │        0 │       0 │          0 │   0 │     0 │       0    │    0 │        0      │       0      │   0 │      0 │
├────────────────────────┼─────────┼─────────┼──────────┼─────────┼────────────┼─────┼───────┼────────────┼──────┼───────────────┼──────────────┼─────┼────────┤
│ Language users         │       0 │  0      │        0 │       0 │          0 │   0 │     0 │       0    │    0 │        0      │      

In [15]:
for i in range(len(stringy)):
    print(stringy[i])


P('users'|'<s> Language') = 0.0
P('users'|'Language users') = 0.0
P('users'|'users never') = 0.0
P('users'|'never choose') = 0.0
P('users'|'choose words') = 0.0
P('users'|'words randomly') = 0.0
P('users'|'randomly ,') = 0.0
P('users'|', and') = 0.0
P('users'|'and language') = 0.0
P('users'|'language is') = 0.0
P('users'|'is essentially') = 0.0
P('users'|'essentially non-random') = 0.0
P('users'|'non-random .') = 0.0

P('never'|'<s> Language') = 0.0
P('never'|'Language users') = 0.0
P('never'|'users never') = 0.0
P('never'|'never choose') = 0.0
P('never'|'choose words') = 0.0
P('never'|'words randomly') = 0.0
P('never'|'randomly ,') = 0.0
P('never'|', and') = 0.0
P('never'|'and language') = 0.0
P('never'|'language is') = 0.6364
P('never'|'is essentially') = 0.0
P('never'|'essentially non-random') = 0.0
P('never'|'non-random .') = 0.0

P('choose'|'<s> Language') = 0.0
P('choose'|'Language users') = 0.0
P('choose'|'users never') = 1.0
P('choose'|'never choose') = 0.0
P('choose'|'choose 