## N-Grams Modeling

***

In [1]:
import random
import pandas as pd

import nltk

from nltk.tokenize import word_tokenize
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten
from nltk.lm import MLE


from nltk.lm.preprocessing import padded_everygram_pipeline


In [2]:
file_content = open("textfile.txt", encoding='utf-8').read()

In [3]:
file_content;

In [4]:
text = str(file_content) #Convert to string

In [5]:
tokens = nltk.word_tokenize(text)

In [6]:
tokens;

### Data Statistics

In [7]:
sents = nltk.sent_tokenize(text)
print("The number of sentences is", len(sents)) 
#prints the number of sentences

The number of sentences is 1


In [8]:
words = nltk.word_tokenize(text)
print("The number of tokens is", len(words)) 
#prints the number of tokens

The number of tokens is 209451


In [9]:
average_tokens = round(len(words)/len(sents))
print("The average number of tokens per sentence is",
average_tokens) 
#prints the average number of tokens per sentence

The average number of tokens per sentence is 209451


In [10]:
unique_tokens = set(words)
print("The number of unique tokens are", len(unique_tokens)) 
#prints the number of unique tokens

The number of unique tokens are 24975


### N-Grams = 1

In [11]:
list(ngrams(tokens,1))

[('given',),
 ('heart',),
 ('away',),
 ('sordid',),
 ('boon',),
 ('start',),
 ('side',),
 ('sugar',),
 ('sweet',),
 ('asian',),
 ('community',),
 ('need',),
 ('next',),
 ('thing',),
 ('know',),
 ('asian',),
 ('kid',),
 ('following',),
 ('around',),
 ('know',),
 ('shit',),
 ('boy',),
 ('wanted',),
 ('inside',),
 ('especially',),
 ('two',),
 ('particular',),
 ('thought',),
 ('wonderful',),
 ('sickened',),
 ('even',),
 ('wonderful',),
 ('reminds',),
 ('quote',),
 ('wwii',),
 ('era',),
 ('forget',),
 ('go',),
 ('say',),
 ('something',),
 ('like',),
 ('hero',),
 ('everyone',),
 ('around',),
 ('acting',),
 ('badly',),
 ('someone',),
 ('humanitarian',),
 ('award',),
 ('nicholas',),
 ('brain',),
 ('run',),
 ('excellent',),
 ('processing',),
 ('data',),
 ('concrete',),
 ('fact',),
 ('work',),
 ('best',),
 ('thing',),
 ('quantify',),
 ('run',),
 ('almost',),
 ('insurmountable',),
 ('challenge',),
 ('trying',),
 ('rationalize',),
 ('existence',),
 ('exceptional',),
 ('infant',),
 ('daughter',),
 

### N-Grams = 2

In [12]:
list(ngrams(tokens,2))

[('given', 'heart'),
 ('heart', 'away'),
 ('away', 'sordid'),
 ('sordid', 'boon'),
 ('boon', 'start'),
 ('start', 'side'),
 ('side', 'sugar'),
 ('sugar', 'sweet'),
 ('sweet', 'asian'),
 ('asian', 'community'),
 ('community', 'need'),
 ('need', 'next'),
 ('next', 'thing'),
 ('thing', 'know'),
 ('know', 'asian'),
 ('asian', 'kid'),
 ('kid', 'following'),
 ('following', 'around'),
 ('around', 'know'),
 ('know', 'shit'),
 ('shit', 'boy'),
 ('boy', 'wanted'),
 ('wanted', 'inside'),
 ('inside', 'especially'),
 ('especially', 'two'),
 ('two', 'particular'),
 ('particular', 'thought'),
 ('thought', 'wonderful'),
 ('wonderful', 'sickened'),
 ('sickened', 'even'),
 ('even', 'wonderful'),
 ('wonderful', 'reminds'),
 ('reminds', 'quote'),
 ('quote', 'wwii'),
 ('wwii', 'era'),
 ('era', 'forget'),
 ('forget', 'go'),
 ('go', 'say'),
 ('say', 'something'),
 ('something', 'like'),
 ('like', 'hero'),
 ('hero', 'everyone'),
 ('everyone', 'around'),
 ('around', 'acting'),
 ('acting', 'badly'),
 ('badly', 

### N-Grams = 3

In [13]:
list(ngrams(tokens,3))

[('given', 'heart', 'away'),
 ('heart', 'away', 'sordid'),
 ('away', 'sordid', 'boon'),
 ('sordid', 'boon', 'start'),
 ('boon', 'start', 'side'),
 ('start', 'side', 'sugar'),
 ('side', 'sugar', 'sweet'),
 ('sugar', 'sweet', 'asian'),
 ('sweet', 'asian', 'community'),
 ('asian', 'community', 'need'),
 ('community', 'need', 'next'),
 ('need', 'next', 'thing'),
 ('next', 'thing', 'know'),
 ('thing', 'know', 'asian'),
 ('know', 'asian', 'kid'),
 ('asian', 'kid', 'following'),
 ('kid', 'following', 'around'),
 ('following', 'around', 'know'),
 ('around', 'know', 'shit'),
 ('know', 'shit', 'boy'),
 ('shit', 'boy', 'wanted'),
 ('boy', 'wanted', 'inside'),
 ('wanted', 'inside', 'especially'),
 ('inside', 'especially', 'two'),
 ('especially', 'two', 'particular'),
 ('two', 'particular', 'thought'),
 ('particular', 'thought', 'wonderful'),
 ('thought', 'wonderful', 'sickened'),
 ('wonderful', 'sickened', 'even'),
 ('sickened', 'even', 'wonderful'),
 ('even', 'wonderful', 'reminds'),
 ('wonderful

In [14]:
test = tokens[:10000]

In [15]:
type(test)

list

### Model Training for 3 N-Grams

In [16]:
# Preprocess the tokenized text for 3-grams language modelling

n = 3

train_data, padded_sents = padded_everygram_pipeline(n, test)

In [17]:
model = MLE(order=3)

In [18]:
len(model.vocab)

0

In [19]:
model.fit(train_data, padded_sents)
print(model.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 34 items>


In [20]:
len(model.vocab)

34

In [21]:
model.vocab.lookup(test[0])

'<UNK>'

In [22]:
model.vocab.lookup('heart is never random lah .'.split())

('<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>')

In [23]:
print(model.counts)

<NgramCounter with 3 ngram orders and 264657 ngrams>


In [24]:
model.counts['heart']

0

In [25]:
model.score('away')

0.0

### Generation using N-gram Language Model

In [26]:
print(model.generate(20, random_seed=7))

['<s>', '<s>', 'p', 'a', 'r', 'd', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>']


In [27]:
# Function to extract words from model results

from nltk.tokenize.treebank import TreebankWordDetokenizer

detokenize = TreebankWordDetokenizer().detokenize

def generate_sent(model, num_words, random_seed=42):
    """
    :param model: An ngram language model from `nltk.lm.model`.
    :param num_words: Max no. of words to generate.
    :param random_seed: Seed value for random.
    """
    content = []
    for token in model.generate(num_words, random_seed=random_seed):
        if token == '<s>':
            continue
        if token == '</s>':
            break
        content.append(token)
    return detokenize(content)

### Generating 3 random results

In [28]:
generate_sent(model, 20, random_seed=7)

'p a r d'

In [29]:
generate_sent(model, 20, random_seed=23)

't u r a p h i l d'

In [30]:
generate_sent(model, 20, random_seed=45)

'b e a'

### Load the subset 10000 lines dataset

In [31]:
df = pd.read_csv("blogdata.csv")

In [32]:
df

Unnamed: 0,words
0,"We have given our hearts away, a sordid boon !”"
1,1. Start it on the side
2,"Sugar’s sweet, so is she,"
3,So because the Asian community was so by need ...
4,Nicholas’ brain runs on and is excellent at pr...
...,...
9995,Perfect timing for Friday Fragments! I have a ...
9996,Herbert paid his debt to society by twiddling ...
9997,20. A Warning From The Sun
9998,Sherry and John from Young House Love are team...


In [33]:
blog_corpus = list(df['words'].apply(word_tokenize))

In [34]:
blog_corpus;

In [35]:
n = 3
train_data, padded_sents = padded_everygram_pipeline(n, blog_corpus)

In [36]:
blog_model = MLE(n) # Lets train a 3-grams model, previously we set n=3
blog_model.fit(train_data, padded_sents)

### Generating 3 random results

In [37]:
generate_sent(blog_model, num_words=20, random_seed=42)

'money), Susheela Raman, Lila Downs, Nelson Mandela, Aziza Mustafa Zadeh, Margaret engages in an'

In [38]:
generate_sent(blog_model, num_words=10, random_seed=0)

'the reverse of the internet . Or is it generally'

In [39]:
generate_sent(blog_model, num_words=50, random_seed=10)

'incredible hostesses also asked us to new heights – and changing it to 215 countries around the corner . From the parking, the word “ home ” and filled with excitement!!!'

***

#### Coded and submitted by Dennis Lam 2021