In [98]:
# loading the required libraries
import pandas as pd
import numpy as np
import re
import pickle
import random
from tqdm import tqdm

pd.set_option('display.width', 500)

In [99]:
# mounting the drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [100]:
def print_df(df):
    print(df.head())
    print(df.shape)
    print(df.info())

In [101]:
# open text file and read in data
#with open("drive/My Drive/dialogs_dataset", "rb") as f:
#with open("drive/MyDrive/Colab Notebooks/Project - Next  Word Recommender System/dialogs_dataset", "rb") as f:
#    dialogs = pickle.load(f)

df_reuters = pd.read_csv("drive/MyDrive/Colab Notebooks/Project - Next  Word Recommender System/sample_reuters_dataset.csv")
print_df(df_reuters)

   sentence_number                                      sentence_text
0                0  ASIAN EXPORTERS FEAR DAMAGE FROM U . S .- JAPA...
1                1  They told Reuter correspondents in Asian capit...
2                2  But some exporters said that while the conflic...
3                3  The U . S . Has said it will impose 300 mln dl...
4                4  Unofficial Japanese estimates put the impact o...
(10000, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   sentence_number  10000 non-null  int64 
 1   sentence_text    10000 non-null  object
dtypes: int64(1), object(1)
memory usage: 156.4+ KB
None


In [102]:
# text cleaning
def clean_data(text):
    # remove everything except alphabets, ' and white spaces
    text = re.sub("[^a-zA-Z' ]", "", text)
    # convert text to lowercase
    text = text.lower()
    return text

df_reuters["clean_sentences"] = df_reuters['sentence_text'].apply(clean_data)
print_df(df_reuters)

   sentence_number                                      sentence_text                                    clean_sentences
0                0  ASIAN EXPORTERS FEAR DAMAGE FROM U . S .- JAPA...  asian exporters fear damage from u  s  japan r...
1                1  They told Reuter correspondents in Asian capit...  they told reuter correspondents in asian capit...
2                2  But some exporters said that while the conflic...  but some exporters said that while the conflic...
3                3  The U . S . Has said it will impose 300 mln dl...  the u  s  has said it will impose  mln dlrs of...
4                4  Unofficial Japanese estimates put the impact o...  unofficial japanese estimates put the impact o...
(10000, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   sentence_number  10000 non-null  int64 
 1   sentence_text   

In [103]:
# creating the vocabulary
# get list of all the words
all_words = " ".join(df_reuters['clean_sentences']).split()
print(type(all_words))

words_dict = {}

# add word-count pair to the dictionary
for word in all_words:
    # check if the word is already in dictionary
    if word in words_dict:
        # increment count of word by 1
        words_dict[word] = words_dict[word] + 1
    else:
        # add the word to dictionary with count 1
        words_dict[word] = 1

print(words_dict)

# prepare a dataframe
words_df = pd.DataFrame({'word':list(words_dict.keys()), 'count':list(words_dict.values())})

# sort words by their count in increasing order
words_df = words_df.sort_values(by = ['count'])

# reset dataframe index
words_df.reset_index(inplace = True, drop=True)

# words with least frequency
print(words_df.head())

# words with highest frequency
print(words_df.tail())

# vocabulary size
print("vocabulary size:", len(words_df))

<class 'list'>
          word  count
0        ulcer      1
1        gaons      1
2  securitiesd      1
3   unfiltered      1
4   preceeding      1
       word  count
12575  said   4649
12576    in   5070
12577    to   6337
12578    of   6671
12579   the  12496
vocabulary size: 12580


In [104]:
# creating an empty dataframe
dataset = pd.DataFrame()

# adding cleaned sentences in the dataframe
dataset['Sentences'] = df_reuters['clean_sentences']

# first 20 cleaned sentences
print(dataset.head(10))

# using .split() to get tokens from the sentence
dataset['Sentences'][0].split()

                                           Sentences
0  asian exporters fear damage from u  s  japan r...
1  they told reuter correspondents in asian capit...
2  but some exporters said that while the conflic...
3  the u  s  has said it will impose  mln dlrs of...
4  unofficial japanese estimates put the impact o...
5   we wouldn ' t be able to do business  said a ...
6   if the tariffs remain in place for any length...
7  in taiwan  businessmen and officials are also ...
8       we are aware of the seriousness of the u  s 
9  threat against japan because it serves as a wa...


['asian',
 'exporters',
 'fear',
 'damage',
 'from',
 'u',
 's',
 'japan',
 'rift',
 'mounting',
 'trade',
 'friction',
 'between',
 'the',
 'u',
 's',
 'and',
 'japan',
 'has',
 'raised',
 'fears',
 'among',
 'many',
 'of',
 'asia',
 "'",
 's',
 'exporting',
 'nations',
 'that',
 'the',
 'row',
 'could',
 'inflict',
 'far',
 'reaching',
 'economic',
 'damage',
 'businessmen',
 'and',
 'officials',
 'said']

In [105]:
# function to create unigrams
# taking a sentence as input
def create_unigram(sentence):
    # creating tokens from the sentence
    tokens = sentence.split()
    # empty list to store the unigrams
    unigram_list = []
    # number of unigrams is equal to the number of tokens in the sentence
    for i in range(len(tokens)):
        # appending each unigram in the list
        unigram_list.append(tokens[i:i+1])
    # returning the unigram list for a sentence
    return unigram_list

In [106]:
# function to create bigrams
def create_bigram(sentence):
    tokens = sentence.split()
    bigram_list = []
    # number of bigrams is one less than the number of tokens in the sentence
    for i in range(len(tokens)-1):
        bigram_list.append(tokens[i:i+2])
        #print(tokens[i:i+2])
    return bigram_list

In [107]:
# function to create trigrams
def create_trigram(sentence):
    tokens = sentence.split()
    trigram_list = []
    # number of trigrams is two less than the number of tokens in the sentence
    for i in range(len(tokens)-2):
        trigram_list.append(tokens[i:i+3])
    return trigram_list

In [108]:
# creating unigrams for all the sentences in the dataset
final_unigram = []
# for each sentence
for i in range(dataset.shape[0]):
    # using the defined unigram function to create unigrams
    final_unigram.append(create_unigram(dataset['Sentences'][i]))

# adding the unigram in a seperate column in the dataset
dataset['unigram'] = final_unigram

In [109]:
# creating bigrams for all the sentences in the dataset
final_bigram = []
for i in range(dataset.shape[0]):
    final_bigram.append(create_bigram(dataset['Sentences'][i]))

dataset['bigram'] = final_bigram

In [110]:
# creating trigrams for all the sentences in the dataset
final_trigram = []
for i in range(dataset.shape[0]):
    final_trigram.append(create_trigram(dataset['Sentences'][i]))

dataset['trigram'] = final_trigram

In [111]:
# first 20 rows of the dataset
dataset.head(20)

Unnamed: 0,Sentences,unigram,bigram,trigram
0,asian exporters fear damage from u s japan r...,"[[asian], [exporters], [fear], [damage], [from...","[[asian, exporters], [exporters, fear], [fear,...","[[asian, exporters, fear], [exporters, fear, d..."
1,they told reuter correspondents in asian capit...,"[[they], [told], [reuter], [correspondents], [...","[[they, told], [told, reuter], [reuter, corres...","[[they, told, reuter], [told, reuter, correspo..."
2,but some exporters said that while the conflic...,"[[but], [some], [exporters], [said], [that], [...","[[but, some], [some, exporters], [exporters, s...","[[but, some, exporters], [some, exporters, sai..."
3,the u s has said it will impose mln dlrs of...,"[[the], [u], [s], [has], [said], [it], [will],...","[[the, u], [u, s], [s, has], [has, said], [sai...","[[the, u, s], [u, s, has], [s, has, said], [ha..."
4,unofficial japanese estimates put the impact o...,"[[unofficial], [japanese], [estimates], [put],...","[[unofficial, japanese], [japanese, estimates]...","[[unofficial, japanese, estimates], [japanese,..."
5,we wouldn ' t be able to do business said a ...,"[[we], [wouldn], ['], [t], [be], [able], [to],...","[[we, wouldn], [wouldn, '], [', t], [t, be], [...","[[we, wouldn, '], [wouldn, ', t], [', t, be], ..."
6,if the tariffs remain in place for any length...,"[[if], [the], [tariffs], [remain], [in], [plac...","[[if, the], [the, tariffs], [tariffs, remain],...","[[if, the, tariffs], [the, tariffs, remain], [..."
7,in taiwan businessmen and officials are also ...,"[[in], [taiwan], [businessmen], [and], [offici...","[[in, taiwan], [taiwan, businessmen], [busines...","[[in, taiwan, businessmen], [taiwan, businessm..."
8,we are aware of the seriousness of the u s,"[[we], [are], [aware], [of], [the], [seriousne...","[[we, are], [are, aware], [aware, of], [of, th...","[[we, are, aware], [are, aware, of], [aware, o..."
9,threat against japan because it serves as a wa...,"[[threat], [against], [japan], [because], [it]...","[[threat, against], [against, japan], [japan, ...","[[threat, against, japan], [against, japan, be..."


In [112]:
# sample sentence
print(dataset['Sentences'][0])
print(dataset['Sentences'][1])
print(dataset['Sentences'][2])

# unigram of the sentence
print("unigram", dataset['unigram'][0])
print("unigram", dataset['unigram'][1])

# bigram of the sentence
print("bigram", dataset['bigram'][0])
print("bigram", dataset['bigram'][1])

# trigram of the sentence
print("trigram", dataset['trigram'][0])
print("trigram", dataset['trigram'][1])

asian exporters fear damage from u  s  japan rift mounting trade friction between the u  s  and japan has raised fears among many of asia ' s exporting nations that the row could inflict far  reaching economic damage  businessmen and officials said 
they told reuter correspondents in asian capitals a u  s  move against japan might boost protectionist sentiment in the u  s  and lead to curbs on american imports of their products 
but some exporters said that while the conflict would hurt them in the long  run  in the short  term tokyo ' s loss might be their gain 
unigram [['asian'], ['exporters'], ['fear'], ['damage'], ['from'], ['u'], ['s'], ['japan'], ['rift'], ['mounting'], ['trade'], ['friction'], ['between'], ['the'], ['u'], ['s'], ['and'], ['japan'], ['has'], ['raised'], ['fears'], ['among'], ['many'], ['of'], ['asia'], ["'"], ['s'], ['exporting'], ['nations'], ['that'], ['the'], ['row'], ['could'], ['inflict'], ['far'], ['reaching'], ['economic'], ['damage'], ['businessmen'], ['

In [113]:
# for defining the N-gram model
from collections import Counter, defaultdict

# Create a placeholder for model
model = defaultdict(lambda: defaultdict(lambda: 0))

# Count frequency of co-occurance
for i in range(dataset.shape[0]):
    # for each trigram pair
    for w1, w2, w3 in create_trigram(dataset['Sentences'][i]):
        # count the occurance of word 3, given word 1 and word 2
        model[(w1, w2)][w3] += 1


In [114]:
print(dict(model["they", "told"]))
print(dict(model["they", "are"]))

{'reuter': 1, 'reporters': 1, 'reuters': 1}
{'more': 1, 'prepared': 1, 'giving': 1, 'rumors': 1, 'to': 1, 'pleased': 1, 'a': 1, 'not': 4, 'saying': 1, 'trading': 1, 'free': 1, 'today': 1, 'beginning': 1, 'at': 1, 'willing': 2, 'happy': 1, 'required': 1, 'polysaturated': 1, 'defeathered': 1, 'fired': 2, 'imported': 1, 'unprofitable': 1, 'being': 1, 'less': 1}


In [115]:
# creating the unigram list
unigram_dict = {}
for i in tqdm(range(dataset.shape[0])):
    # add word-count pair to the dictionary
    for word in dataset['unigram'][i]:
        # check if the word is already in dictionary
        if word[0] in unigram_dict:
            # increment count of word by 1
            unigram_dict[word[0]] = unigram_dict[word[0]] + 1
        else:
            # add the word to dictionary with count 1
            unigram_dict[word[0]] = 1

unigram_dict

100%|██████████| 10000/10000 [00:00<00:00, 46623.06it/s]


{'asian': 13,
 'exporters': 52,
 'fear': 8,
 'damage': 29,
 'from': 1369,
 'u': 1117,
 's': 2864,
 'japan': 441,
 'rift': 1,
 'mounting': 5,
 'trade': 549,
 'friction': 8,
 'between': 191,
 'the': 12496,
 'and': 4599,
 'has': 974,
 'raised': 70,
 'fears': 13,
 'among': 44,
 'many': 54,
 'of': 6671,
 'asia': 14,
 "'": 2094,
 'exporting': 12,
 'nations': 71,
 'that': 1376,
 'row': 3,
 'could': 291,
 'inflict': 1,
 'far': 55,
 'reaching': 7,
 'economic': 244,
 'businessmen': 15,
 'officials': 190,
 'said': 4649,
 'they': 518,
 'told': 237,
 'reuter': 27,
 'correspondents': 3,
 'in': 5070,
 'capitals': 3,
 'a': 4412,
 'move': 101,
 'against': 270,
 'might': 59,
 'boost': 45,
 'protectionist': 22,
 'sentiment': 10,
 'lead': 96,
 'to': 6337,
 'curbs': 12,
 'on': 1643,
 'american': 126,
 'imports': 242,
 'their': 230,
 'products': 200,
 'but': 650,
 'some': 278,
 'while': 164,
 'conflict': 3,
 'would': 926,
 'hurt': 11,
 'them': 58,
 'long': 119,
 'run': 21,
 'short': 87,
 'term': 120,
 'toky

In [116]:
# find the overall frequency of words in the corpus
counts = Counter(unigram_dict)
counts

Counter({'asian': 13,
         'exporters': 52,
         'fear': 8,
         'damage': 29,
         'from': 1369,
         'u': 1117,
         's': 2864,
         'japan': 441,
         'rift': 1,
         'mounting': 5,
         'trade': 549,
         'friction': 8,
         'between': 191,
         'the': 12496,
         'and': 4599,
         'has': 974,
         'raised': 70,
         'fears': 13,
         'among': 44,
         'many': 54,
         'of': 6671,
         'asia': 14,
         "'": 2094,
         'exporting': 12,
         'nations': 71,
         'that': 1376,
         'row': 3,
         'could': 291,
         'inflict': 1,
         'far': 55,
         'reaching': 7,
         'economic': 244,
         'businessmen': 15,
         'officials': 190,
         'said': 4649,
         'they': 518,
         'told': 237,
         'reuter': 27,
         'correspondents': 3,
         'in': 5070,
         'capitals': 3,
         'a': 4412,
         'move': 101,
         'against': 2

In [117]:
# vocabulary size
total_count = len(unigram_dict)
total_count

12580

In [118]:
# relative frequencies of each word
for word in counts:
    counts[word] /= float(total_count)

counts

Counter({'asian': 0.0010333863275039745,
         'exporters': 0.004133545310015898,
         'fear': 0.0006359300476947536,
         'damage': 0.002305246422893482,
         'from': 0.10882352941176471,
         'u': 0.08879173290937997,
         's': 0.22766295707472178,
         'japan': 0.035055643879173294,
         'rift': 7.94912559618442e-05,
         'mounting': 0.000397456279809221,
         'trade': 0.04364069952305247,
         'friction': 0.0006359300476947536,
         'between': 0.015182829888712241,
         'the': 0.993322734499205,
         'and': 0.36558028616852145,
         'has': 0.07742448330683625,
         'raised': 0.005564387917329093,
         'fears': 0.0010333863275039745,
         'among': 0.0034976152623211448,
         'many': 0.004292527821939587,
         'of': 0.5302861685214626,
         'asia': 0.0011128775834658188,
         "'": 0.16645468998410176,
         'exporting': 0.0009538950715421304,
         'nations': 0.005643879173290938,
         't

In [119]:
# Let's transform the counts to probabilities
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count

In [120]:
# predict the next word
dict(model["they", "told"])

{'reuter': 0.3333333333333333,
 'reporters': 0.3333333333333333,
 'reuters': 0.3333333333333333}

In [121]:
# another example
dict(model["they", "are"])

{'more': 0.034482758620689655,
 'prepared': 0.034482758620689655,
 'giving': 0.034482758620689655,
 'rumors': 0.034482758620689655,
 'to': 0.034482758620689655,
 'pleased': 0.034482758620689655,
 'a': 0.034482758620689655,
 'not': 0.13793103448275862,
 'saying': 0.034482758620689655,
 'trading': 0.034482758620689655,
 'free': 0.034482758620689655,
 'today': 0.034482758620689655,
 'beginning': 0.034482758620689655,
 'at': 0.034482758620689655,
 'willing': 0.06896551724137931,
 'happy': 0.034482758620689655,
 'required': 0.034482758620689655,
 'polysaturated': 0.034482758620689655,
 'defeathered': 0.034482758620689655,
 'fired': 0.06896551724137931,
 'imported': 0.034482758620689655,
 'unprofitable': 0.034482758620689655,
 'being': 0.034482758620689655,
 'less': 0.034482758620689655}