<a href="https://colab.research.google.com/github/fat-91/test/blob/main/Next_word_Recommender_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Steps to build the next word recommender system

1. Loading and exploring the dataset
2. Creating N-grams of the sentences
3. Building the N-gram Language Model
4. Predicting the next word using N-gram Language Model

## 1. Loading and exploring the dataset

In [None]:
# loading the required libraries
import pandas as pd
import numpy as np
import re
import pickle
import random
from tqdm import tqdm

In [None]:
# mounting the drive
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# read the Reuters sentence CSV into a DataFrame
import pandas as pd
df = pd.read_csv('sample_reuters_dataset.csv')  # Replace 'sample_reuters_dataset.csv' with your actual file name
print(df.head())  # This will display the first 5 rows


   sentence_number                                      sentence_text
0                0  ASIAN EXPORTERS FEAR DAMAGE FROM U . S .- JAPA...
1                1  They told Reuter correspondents in Asian capit...
2                2  But some exporters said that while the conflic...
3                3  The U . S . Has said it will impose 300 mln dl...
4                4  Unofficial Japanese estimates put the impact o...


In [None]:
sentences = df['sentence_text'].tolist()  # Extract the 'sentence_text' column and convert it into a list


In [None]:
random.sample(sentences, 10)

['However , the intervention failed to boost the U . S . Currency significantly from the 142 . 20 yen level , they added .',
 'MAGMA RAISES COPPER PRICE 0 . 25 CT TO 65 . 75 CTS Magma Copper Co , a subsidiary of Newmont Mining Corp , said it is raising its copper cathode price by 0 . 25 cent to 65 . 75 cents a lb , effective immediately .',
 'And resorting to higher interest rates could lead to recession , he said .',
 'It is expected to be considered by the full Senate by the end of the summer .',
 'J . Terence Murray , chairman and president of Fleet Financial , said , " Fleet \' s mortgage banking activities in particular continued to produce signficant income increases ( in the first quarter )."',
 "The February increase was less than Janauary ' s 0 . 4 pct increase but slightly above the average for the later months of 1986 .",
 'The Japanese shareholders , the Overseas Economic Cooperation Fund and 12 companies , are to invest another 24 billion yen raising capitalisation to 147 

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# If you haven't already downloaded these, run once:
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
# NOTE: the stopword set is built but NOT applied below -- stopword filtering
# is currently disabled, so every token is kept. To enable it, add
# `if word not in stop_words` to the comprehension that builds cleaned_words.
stop_words = set(stopwords.words('english'))

sentences_clean = []

for raw_sentence in sentences:
    # pandas reads empty CSV cells as NaN (a float). Treat any non-string
    # entry as an empty sentence so re.sub() does not raise and
    # sentences_clean stays index-aligned with sentences.
    text = raw_sentence if isinstance(raw_sentence, str) else ''

    # Remove everything except letters, apostrophes and whitespace.
    # \s (rather than a literal space) keeps tabs/newlines as separators, so
    # words on either side of them are not fused into one token.
    text = re.sub(r"[^a-zA-Z'\s]", '', text)

    # Lowercase, collapse every run of whitespace to one space, trim the ends
    text = re.sub(r'\s+', ' ', text.lower()).strip()

    # Tokenize the sentence into words
    words = text.split()

    # Lemmatize every token (no stopword filtering, see note above)
    cleaned_words = [lemmatizer.lemmatize(word) for word in words]

    # Join words back into a cleaned sentence and collect it
    sentences_clean.append(' '.join(cleaned_words))

# Check results
print(sentences_clean[:5])  # print first 5 cleaned sentences for example

["asian exporter fear damage from u s japan rift mounting trade friction between the u s and japan ha raised fear among many of asia ' s exporting nation that the row could inflict far reaching economic damage businessmen and official said", 'they told reuter correspondent in asian capital a u s move against japan might boost protectionist sentiment in the u s and lead to curb on american import of their product', "but some exporter said that while the conflict would hurt them in the long run in the short term tokyo ' s loss might be their gain", "the u s ha said it will impose mln dlrs of tariff on import of japanese electronics good on april in retaliation for japan ' s alleged failure to stick to a pact not to sell semiconductor on world market at below cost", 'unofficial japanese estimate put the impact of the tariff at billion dlrs and spokesman for major electronics firm said they would virtually halt export of product hit by the new tax']


In [None]:
random.sample(sentences_clean, 10)


['the company said shareholder at the annual meeting approved an increase in authorized common share to mln from mln and a name change to alfa corp',
 "uae trade current account surplus narrow the united arab emirate uae recorded a trade surplus of billion dirham narrowing from billion in the central bank ' s latest bulletin show",
 'while the automobile industry remained unchanged from previously boosted level consumer good production grew and wa expected to accelerate except in the area of domestic appliance',
 'the sixth ita will expire at the end of june unless a two third majority of member vote for an extension',
 'group raise computer memory lt cmin stake a shareholder group led by far hill n j investor natalie koether said it raised it stake in computer memory inc to share or pct of the total outstanding from share or pct',
 'jun jun',
 'the smelter also produced cadmium bismuth and indium',
 'retailer are not planning for much inventory are being kept lean markdowns are lower 

In [None]:
from sklearn.model_selection import train_test_split

# Split into train and test (80/20); random_state is fixed so the same
# partition is produced on every run
train_sentences, test_sentences = train_test_split(sentences_clean, test_size=0.2, random_state=42)

# Check sizes of the two partitions
print(f"Training sentences: {len(train_sentences)}")
print(f"Testing sentences: {len(test_sentences)}")


Training sentences: 8000
Testing sentences: 2000


In [None]:
# creating the vocabulary from the training sentences only
# flatten all sentences into one list of tokens
all_words = " ".join(train_sentences).split()

words_dict = {}

# tally how often each token occurs
for token in all_words:
    # .get() supplies 0 the first time a token is seen
    words_dict[token] = words_dict.get(token, 0) + 1

In [None]:
# word dictionary
words_dict

{'tranche': 4,
 'increased': 59,
 'tonne': 503,
 'south': 100,
 'korea': 33,
 'added': 188,
 'possible': 60,
 'destination': 14,
 'showa': 5,
 'denko': 3,
 'export': 343,
 'aluminium': 19,
 'casting': 2,
 'equipment': 34,
 'lt': 1200,
 'ltd': 212,
 'said': 3701,
 'exporting': 9,
 'billet': 1,
 'technology': 49,
 'country': 194,
 'recently': 35,
 'begun': 6,
 'smelting': 5,
 'note': 232,
 'company': 840,
 "'": 1683,
 'full': 69,
 'name': 47,
 'gebrueder': 2,
 'sulzer': 2,
 'ag': 12,
 'sulz': 1,
 'z': 9,
 'romero': 8,
 'philippine': 26,
 'end': 238,
 'five': 208,
 'year': 1113,
 'coconut': 17,
 'production': 229,
 'cycle': 3,
 'showed': 43,
 'tended': 2,
 'fall': 106,
 'two': 368,
 'successive': 3,
 'good': 110,
 'harvest': 19,
 'lme': 18,
 'warehouse': 9,
 'stock': 455,
 'near': 38,
 'low': 69,
 'fallen': 11,
 'steadily': 4,
 'record': 152,
 'high': 123,
 'reached': 46,
 'february': 150,
 'many': 43,
 'major': 164,
 'nation': 81,
 'yesterday': 133,
 'intervened': 8,
 'heavily': 9,
 'aid

In [None]:
# tabulate the vocabulary: one row per word, sorted by count in increasing
# order, with the index renumbered from 0 after sorting
words_df = (
    pd.DataFrame({'word': list(words_dict.keys()), 'count': list(words_dict.values())})
    .sort_values(by=['count'])
    .reset_index(drop=True)
)

In [None]:
# words with least frequency
words_df.head()

Unnamed: 0,word,count
0,kuppenheimer,1
1,euromark,1
2,scarcely,1
3,unaltered,1
4,elia,1


In [None]:
# words with highest frequency
words_df.tail()

Unnamed: 0,word,count
10315,',1683
10316,dlrs,1704
10317,v,1849
10318,mln,2521
10319,said,3701


In [None]:
# vocabulary size
len(words_df)

10320

## 2. Creating N-grams of the sentences

In [None]:
# dataframe with a single column holding the cleaned training sentences
dataset = pd.DataFrame({'Sentences': train_sentences})

# first 20 cleaned sentences
dataset.head(20)

Unnamed: 0,Sentences
0,tranche increased tonne tonne south korea adde...
1,showa denko export aluminium casting equipment...
2,note company ' full name gebrueder sulzer ag l...
3,romero said philippine end five year coconut p...
4,lme warehouse stock near two year low tonne fa...
5,many major nation yesterday intervened heavily...
6,transaction structured merger liberty duly for...
7,said ec failed follow consultation procedure l...
8,dollar fell low yen despite central bank inter...
9,george wimpey say benefit restructuring seen g...


In [None]:
# using .split() to get tokens from the sentence
dataset['Sentences'][0].split()

['tranche',
 'increased',
 'tonne',
 'tonne',
 'south',
 'korea',
 'added',
 'possible',
 'destination']

In [None]:
# function to create unigrams
# taking a sentence as input
def create_unigram(sentence):
    """Return the unigrams of ``sentence`` as a list of one-token lists.

    The sentence is tokenised with ``str.split()``, so any run of whitespace
    separates tokens. A sentence with no tokens yields an empty list.
    """
    # one single-element list per token, in sentence order
    return [[token] for token in sentence.split()]

In [None]:
# function to create bigrams
def create_bigram(sentence):
    """Return the bigrams of ``sentence`` as a list of two-token lists.

    Tokens come from ``str.split()``. A sentence with n tokens yields n - 1
    bigrams; fewer than two tokens yields an empty list.
    """
    words = sentence.split()
    # pair every token with its successor
    return [[left, right] for left, right in zip(words, words[1:])]

In [None]:
# function to create trigrams
def create_trigram(sentence):
    """Return the trigrams of ``sentence`` as a list of three-token lists.

    Tokens come from ``str.split()``. A sentence with n tokens yields n - 2
    trigrams; fewer than three tokens yields an empty list.
    """
    words = sentence.split()
    # slide a window of width three over the tokens
    return [list(window) for window in zip(words, words[1:], words[2:])]

In [None]:
# unigrams of every sentence in the dataset, in row order
final_unigram = [create_unigram(sentence) for sentence in dataset['Sentences']]

# adding the unigrams as a separate column in the dataset
dataset['unigram'] = final_unigram

In [None]:
# bigrams of every sentence in the dataset, in row order
final_bigram = [create_bigram(sentence) for sentence in dataset['Sentences']]

# adding the bigrams as a separate column in the dataset
dataset['bigram'] = final_bigram

In [None]:
# trigrams of every sentence in the dataset, in row order
final_trigram = [create_trigram(sentence) for sentence in dataset['Sentences']]

# adding the trigrams as a separate column in the dataset
dataset['trigram'] = final_trigram

In [None]:
# first 20 rows of the dataset
dataset.head(20)

Unnamed: 0,Sentences,unigram,bigram
0,tranche increased tonne tonne south korea adde...,"[[tranche], [increased], [tonne], [tonne], [so...","[[tranche, increased], [increased, tonne], [to..."
1,showa denko export aluminium casting equipment...,"[[showa], [denko], [export], [aluminium], [cas...","[[showa, denko], [denko, export], [export, alu..."
2,note company ' full name gebrueder sulzer ag l...,"[[note], [company], ['], [full], [name], [gebr...","[[note, company], [company, '], [', full], [fu..."
3,romero said philippine end five year coconut p...,"[[romero], [said], [philippine], [end], [five]...","[[romero, said], [said, philippine], [philippi..."
4,lme warehouse stock near two year low tonne fa...,"[[lme], [warehouse], [stock], [near], [two], [...","[[lme, warehouse], [warehouse, stock], [stock,..."
5,many major nation yesterday intervened heavily...,"[[many], [major], [nation], [yesterday], [inte...","[[many, major], [major, nation], [nation, yest..."
6,transaction structured merger liberty duly for...,"[[transaction], [structured], [merger], [liber...","[[transaction, structured], [structured, merge..."
7,said ec failed follow consultation procedure l...,"[[said], [ec], [failed], [follow], [consultati...","[[said, ec], [ec, failed], [failed, follow], [..."
8,dollar fell low yen despite central bank inter...,"[[dollar], [fell], [low], [yen], [despite], [c...","[[dollar, fell], [fell, low], [low, yen], [yen..."
9,george wimpey say benefit restructuring seen g...,"[[george], [wimpey], [say], [benefit], [restru...","[[george, wimpey], [wimpey, say], [say, benefi..."


In [None]:
# sample sentence
dataset['Sentences'][0]

'tranche increased tonne tonne south korea added possible destination'

In [None]:
# unigram of the sentence
dataset['unigram'][0]

[['tranche'],
 ['increased'],
 ['tonne'],
 ['tonne'],
 ['south'],
 ['korea'],
 ['added'],
 ['possible'],
 ['destination']]

In [None]:
# bigram of the sentence
dataset['bigram'][0]

[['tranche', 'increased'],
 ['increased', 'tonne'],
 ['tonne', 'tonne'],
 ['tonne', 'south'],
 ['south', 'korea'],
 ['korea', 'added'],
 ['added', 'possible'],
 ['possible', 'destination']]

In [None]:
# trigram of the sentence
dataset['trigram'][0]

[['but', 'now', 'both'],
 ['now', 'both', 'tranches'],
 ['both', 'tranches', 'have'],
 ['tranches', 'have', 'been'],
 ['have', 'been', 'increased'],
 ['been', 'increased', 'by'],
 ['increased', 'by', 'tonnes'],
 ['by', 'tonnes', 'to'],
 ['tonnes', 'to', 'tonnes'],
 ['to', 'tonnes', 'with'],
 ['tonnes', 'with', 'south'],
 ['with', 'south', 'korea'],
 ['south', 'korea', 'added'],
 ['korea', 'added', 'as'],
 ['added', 'as', 'a'],
 ['as', 'a', 'possible'],
 ['a', 'possible', 'destination']]

## 3. Building the N-gram Language Model

In [None]:
# for defining the N-gram model
from collections import Counter, defaultdict

# Nested mapping: (w1, w2) -> {w3: count}; unseen keys start at 0
model = defaultdict(lambda: defaultdict(lambda: 0))

# Count how often each word follows each two-word context
for sentence in dataset['Sentences']:
    for first, second, third in create_trigram(sentence):
        # one more observation of `third` after the context (first, second)
        model[first, second][third] += 1


In [None]:
# defined model
model

defaultdict(<function __main__.<lambda>()>,
            {('tranche',
              'increased'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'tonne': 1}),
             ('increased',
              'tonne'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'tonne': 1,
                          'department': 1}),
             ('tonne',
              'tonne'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'south': 1,
                          'year': 1,
                          'previous': 1,
                          'tin': 1,
                          'higher': 1}),
             ('tonne',
              'south'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'korea': 1,
                          'african': 2}),
             ('south',
              'korea'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>, {'added': 1,
                          'reagan': 1,
                          'reliant': 1,
             

## 4. Predicting the next word using N-gram Language Model

In [None]:
# predict the next word
dict(model["by", "an"])

{}

In [None]:
# another example
dict(model["oil", "supply"])

{'pct': 1,
 'line': 1,
 'group': 1,
 'bi': 1,
 'demand': 1,
 'jeapordy': 1,
 'total': 1,
 'key': 1}

In [None]:
# another example
dict(model["increasing", "the"])

{}

In [None]:
# another example
dict(model["a", "step"])

{}

### Probabilistic Output

In [None]:
# word -> count over the stored unigram column
unigram_dict = {}
for row in tqdm(range(dataset.shape[0])):
    # every unigram is a one-element list; its only item is the word
    for unigram in dataset['unigram'][row]:
        token = unigram[0]
        # .get() supplies 0 the first time a word is seen
        unigram_dict[token] = unigram_dict.get(token, 0) + 1

100%|██████████| 8000/8000 [00:00<00:00, 46338.86it/s]


In [None]:
# unigram list
unigram_dict

{'but': 524,
 'now': 140,
 'both': 70,
 'tranches': 2,
 'have': 443,
 'been': 306,
 'increased': 59,
 'by': 1110,
 'tonnes': 443,
 'to': 5099,
 'with': 913,
 'south': 100,
 'korea': 33,
 'added': 188,
 'as': 707,
 'a': 3561,
 'possible': 60,
 'destination': 6,
 'showa': 5,
 'denko': 3,
 'exports': 222,
 'aluminium': 19,
 'casting': 2,
 'equipment': 34,
 'lt': 1200,
 'ltd': 212,
 'said': 3701,
 'it': 1577,
 'is': 1132,
 'exporting': 9,
 'billet': 1,
 'and': 3634,
 'technology': 33,
 'countries': 114,
 'that': 1103,
 'recently': 35,
 'begun': 6,
 'smelting': 5,
 'note': 209,
 'company': 681,
 "'": 1683,
 's': 2261,
 'full': 69,
 'name': 44,
 'gebrueder': 2,
 'sulzer': 2,
 'ag': 12,
 'sulz': 1,
 'z': 9,
 'romero': 8,
 'the': 10053,
 'philippines': 17,
 'was': 919,
 'at': 1063,
 'end': 224,
 'of': 5360,
 'five': 208,
 'year': 990,
 'coconut': 15,
 'production': 227,
 'cycle': 2,
 'which': 521,
 'showed': 43,
 'tended': 2,
 'fall': 88,
 'after': 299,
 'two': 368,
 'successive': 3,
 'years':

In [None]:
# find the overall frequency of words in the corpus
counts = Counter(unigram_dict)
counts

Counter({'tranche': 4,
         'increased': 59,
         'tonne': 503,
         'south': 100,
         'korea': 33,
         'added': 188,
         'possible': 60,
         'destination': 14,
         'showa': 5,
         'denko': 3,
         'export': 343,
         'aluminium': 19,
         'casting': 2,
         'equipment': 34,
         'lt': 1200,
         'ltd': 212,
         'said': 3701,
         'exporting': 9,
         'billet': 1,
         'technology': 49,
         'country': 194,
         'recently': 35,
         'begun': 6,
         'smelting': 5,
         'note': 232,
         'company': 840,
         "'": 1683,
         'full': 69,
         'name': 47,
         'gebrueder': 2,
         'sulzer': 2,
         'ag': 12,
         'sulz': 1,
         'z': 9,
         'romero': 8,
         'philippine': 26,
         'end': 238,
         'five': 208,
         'year': 1113,
         'coconut': 17,
         'production': 229,
         'cycle': 3,
         'showed': 43,
         

In [None]:
# vocabulary size
total_count = len(unigram_dict)
total_count

10320

In [None]:
# relative frequency of each word = its count / total number of tokens.
# (Dividing by the vocabulary size, len(unigram_dict), does not give a
# frequency: the resulting values do not sum to 1.)
total_tokens = float(sum(unigram_dict.values()))

# Rebuilt from unigram_dict instead of dividing `counts` in place, so
# re-running this cell yields the same values rather than shrinking them again.
counts = Counter({word: n / total_tokens for word, n in unigram_dict.items()})

counts

Counter({'but': 0.04544666088464874,
         'now': 0.012142237640936688,
         'both': 0.006071118820468344,
         'tranches': 0.00017346053772766696,
         'have': 0.03842150910667823,
         'been': 0.026539462272333045,
         'increased': 0.0051170858629661755,
         'by': 0.09627059843885516,
         'tonnes': 0.03842150910667823,
         'to': 0.4422376409366869,
         'with': 0.07918473547267997,
         'south': 0.008673026886383347,
         'korea': 0.002862098872506505,
         'added': 0.016305290546400692,
         'as': 0.06131830008673027,
         'a': 0.308846487424111,
         'possible': 0.005203816131830009,
         'destination': 0.0005203816131830009,
         'showa': 0.0004336513443191674,
         'denko': 0.00026019080659150045,
         'exports': 0.01925411968777103,
         'aluminium': 0.001647875108412836,
         'casting': 0.00017346053772766696,
         'equipment': 0.002948829141370338,
         'lt': 0.10407632263660017,

In [None]:
# Normalise each context's follower counts so they become probabilities
for followers in model.values():
    # total number of observations for this two-word context
    total_count = float(sum(followers.values()))
    for w3 in followers:
        followers[w3] /= total_count


In [None]:
# predict the next word
dict(model["oil", "supply"])

{'pct': 0.125,
 'line': 0.125,
 'group': 0.125,
 'bi': 0.125,
 'demand': 0.125,
 'jeapordy': 0.125,
 'total': 0.125,
 'key': 0.125}

In [None]:
# another example
dict(model["increase", "the"])

{}

In [None]:
# another example
dict(model["a", "step"])

{}

In [None]:
# another example
dict(model["last", "year"])

{'five': 0.00641025641025641,
 'fend': 0.00641025641025641,
 "'": 0.16025641025641027,
 'bag': 0.00641025641025641,
 'index': 0.00641025641025641,
 'billion': 0.038461538461538464,
 'west': 0.00641025641025641,
 'offer': 0.00641025641025641,
 'processing': 0.02564102564102564,
 'necessarily': 0.00641025641025641,
 'temporary': 0.00641025641025641,
 'tokyo': 0.019230769230769232,
 'imbalance': 0.00641025641025641,
 'domestic': 0.01282051282051282,
 'rise': 0.00641025641025641,
 'cpc': 0.00641025641025641,
 'first': 0.00641025641025641,
 'national': 0.00641025641025641,
 'predicted': 0.00641025641025641,
 'expects': 0.01282051282051282,
 'merchant': 0.00641025641025641,
 'higher': 0.00641025641025641,
 'earned': 0.01282051282051282,
 'record': 0.00641025641025641,
 'said': 0.038461538461538464,
 'april': 0.00641025641025641,
 'ended': 0.00641025641025641,
 'allegis': 0.00641025641025641,
 'compared': 0.00641025641025641,
 'paid': 0.01282051282051282,
 'bank': 0.01282051282051282,
 'diffi

In [None]:
# calculate the perplexity of the model using the training dataset
import math

def calculate_perplexity(dataset, trigram_model):
    """Perplexity of ``trigram_model`` over the trigrams found in ``dataset``.

    Parameters
    ----------
    dataset : DataFrame with a 'Sentences' column, or an iterable of str
        When the object has a ``columns`` attribute the sentences are read
        from ``dataset['Sentences']``; otherwise ``dataset`` itself is
        iterated as the sentences.
    trigram_model : mapping
        ``trigram_model[(w1, w2)][w3]`` is read as the probability of ``w3``
        following the context ``(w1, w2)``.

    Returns
    -------
    float
        ``2 ** H`` where ``H`` is the mean negative log2-probability of the
        scored trigrams, or ``float('inf')`` when no trigram could be scored.

    Notes
    -----
    Trigrams whose context is missing (or empty) in the model, or whose
    probability is 0, are skipped rather than penalised. The result therefore
    only describes trigrams the model already knows and is optimistic on
    text that contains unseen trigrams.
    """
    # Accept both the training DataFrame and a plain list of sentences.
    sentences = dataset['Sentences'] if hasattr(dataset, 'columns') else dataset

    log_prob_sum = 0
    total_words = 0

    for sentence in sentences:
        for w1, w2, w3 in create_trigram(sentence):
            # .get() instead of [] so that a defaultdict model is not grown
            # with an empty entry for every context it has never seen.
            followers = trigram_model.get((w1, w2))
            if not followers:
                continue
            prob = followers.get(w3, 0)
            if prob > 0:
                log_prob_sum += math.log2(prob)
                total_words += 1

    # No scorable trigram: perplexity is undefined, report infinity
    if total_words == 0:
        return float('inf')

    # perplexity = 2 ** cross-entropy
    cross_entropy = -log_prob_sum / total_words
    return math.pow(2, cross_entropy)


perplexity = calculate_perplexity(dataset, model)
print(f"Perplexity: {perplexity}")

Perplexity: 3.198650811720578


In [None]:
##calculate the perplexity of the model on the testing dataset
import math

def calculate_perplexity(dataset, trigram_model):
    """Perplexity of ``trigram_model`` over the trigrams found in ``dataset``.

    Parameters
    ----------
    dataset : iterable of str, or DataFrame with a 'Sentences' column
        When the object has a ``columns`` attribute the sentences are read
        from ``dataset['Sentences']``; otherwise ``dataset`` itself is
        iterated as the sentences. Supporting both keeps earlier calls that
        pass the training DataFrame working after this definition has
        replaced the previous one of the same name.
    trigram_model : mapping
        ``trigram_model[(w1, w2)][w3]`` is read as the probability of ``w3``
        following the context ``(w1, w2)``.

    Returns
    -------
    float
        ``2 ** H`` where ``H`` is the mean negative log2-probability of the
        scored trigrams, or ``float('inf')`` when no trigram could be scored.

    Notes
    -----
    Trigrams whose context is missing (or empty) in the model, or whose
    probability is 0, are skipped rather than penalised. On held-out text
    the result therefore covers only the trigrams already seen in training
    and understates the true perplexity.
    """
    # Accept both a plain list of sentences and the training DataFrame.
    sentences = dataset['Sentences'] if hasattr(dataset, 'columns') else dataset

    log_prob_sum = 0
    total_words = 0

    for sentence in sentences:
        for w1, w2, w3 in create_trigram(sentence):
            # .get() instead of [] so that a defaultdict model is not grown
            # with an empty entry for every unseen context in the test data.
            followers = trigram_model.get((w1, w2))
            if not followers:
                continue
            prob = followers.get(w3, 0)
            if prob > 0:
                log_prob_sum += math.log2(prob)
                total_words += 1

    # No scorable trigram: perplexity is undefined, report infinity
    if total_words == 0:
        return float('inf')

    # perplexity = 2 ** cross-entropy
    cross_entropy = -log_prob_sum / total_words
    return math.pow(2, cross_entropy)


perplexity = calculate_perplexity(test_sentences, model)
print(f"Perplexity: {perplexity}")

Perplexity: 4.3811406319724435
