Importing libraries

In [1]:
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from typing import Iterator
import nltk
import string
from nltk.corpus import stopwords
nltk.download('punkt')
from collections import defaultdict
from collections import Counter
from nltk.tokenize import word_tokenize
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Part 1: Unsmoothed Unigrams and Bigrams

In [84]:
def preprocess_text(sentence):
    remove_punctuation = sentence.translate(str.maketrans('', '', string.punctuation))
    word_tokens = word_tokenize(remove_punctuation)
    clean = []
    stop_words = set(stopwords.words('english'))
    for word in word_tokens:
        clean.append(word)
    return ' '.join(w for w in clean)

def n_grams(s,n,i=0):
    while(len(s[i:i+n])==n):
        yield s[i:i+n]
        i+=1

def unigram_probabilities(unigram, word_frequency_dict, total_vocab):
    unigram_prob={}
    for sentence in unigram:
        for word in sentence:
            unigram_prob[word[0]]=word_frequency_dict[word[0]]/total_vocab
    return unigram_prob


def bigram_probabilites(bigram, word_frequency_dict, bigram_dict):
    bigram_prob={}
    for sentence in bigram:
        for pair in sentence:
          count_pair=bigram_dict[(pair[0],pair[1])]
          if(pair[0] in word_frequency_dict):
              bigram_prob[(pair[0],pair[1])]=count_pair/word_frequency_dict[pair[0]]
          else:
              bigram_prob[(pair[0],pair[1])] = 0
    return bigram_prob

with open('train.txt', 'r', encoding='utf-8') as file:
    corpus = file.read().lower()
sentences = corpus.split("\n")
bigrams = []
unigrams=[]
for sentence in sentences:
    preprocessed_sentence = preprocess_text(sentence)
    bigrams.append(list(n_grams(preprocessed_sentence.split(), 2)))
    unigrams.append(list(n_grams(preprocessed_sentence.split(), 1)))

word_frequency_dict = Counter(corpus.split())
total_vocab=0
unique_unigrams = set({})
for sentence in unigrams:
    for word in sentence:
        unique_unigrams.add(word[0])
for word in unique_unigrams:
    total_vocab += word_frequency_dict[word]
bigram_dict={}
for sentence in bigrams:
    for pair in sentence:
        if((pair[0],pair[1]) in bigram_dict):
            bigram_dict[(pair[0],pair[1])]+=1
        else:
            bigram_dict[(pair[0],pair[1])]=1

unigram_prob=unigram_probabilities(unigrams,word_frequency_dict, total_vocab)
bigram_prob=bigram_probabilites(bigrams,word_frequency_dict, bigram_dict)

Part 2: Handling unknown words

In [81]:
sorted_word_freq={k: v for k, v in sorted(word_frequency_dict.items(), key=lambda item: item[1], reverse = True)}
count=0
unknown_words_train=[]
new_sorted_dict={}
for key,value in sorted_word_freq.items():
    if key in unique_unigrams:
        if value<2:
            count+=1
            unknown_words_train.append(key)
        else:
            new_sorted_dict[key]=value

new_sorted_dict['<UNK>']=count

#calculate probabilities after handling unknown words
def new_unigram_probabilities(unigram, word_frequency_dict, total_vocab, new_sorted_dict):
    unigram_prob={}
    for sentence in unigram:
        for word in sentence:
            if word[0] not in unknown_words_train:
                if(word[0] in word_frequency_dict):
                    unigram_prob[word[0]] = word_frequency_dict[word[0]]/total_vocab
                else:
                    unigram_prob[word[0]] = 0
            else:
                unigram_prob[word[0]]=new_sorted_dict['<UNK>']/total_vocab
    return unigram_prob

new_unigram_prob= new_unigram_probabilities(unigrams,new_sorted_dict,total_vocab,new_sorted_dict)

#smoothing after handling unknown words and re-calulating probabilities
def new_smooth_unigram_probabilities(unigram, word_frequency_dict, total_vocab, new_sorted_dict,smoothing_factor):
    unigram_prob={}
    for sentence in unigram:
        for word in sentence:
            if word[0] not in unknown_words_train:
                if(word[0] in word_frequency_dict):
                    unigram_prob[word[0]]=(word_frequency_dict[word[0]]+smoothing_factor)/(total_vocab*(smoothing_factor+1))
                else:
                    unigram_prob[word[0]]=(smoothing_factor)/(total_vocab*(smoothing_factor+1))
            else:
                unigram_prob[word[0]]=(new_sorted_dict['<UNK>']+smoothing_factor)/(total_vocab*(smoothing_factor+1))
    return unigram_prob

smooth_unigram_prob= new_smooth_unigram_probabilities(unigrams,new_sorted_dict,total_vocab,new_sorted_dict,5) # hyper parameter is set for unigrams


Replacing the words with frequency 1 in the corpus with ```'<UNK>'``` and creating a new file 'new_train.txt'

In [47]:
#create new text file with unknown words so that bigram calculations become easier
preprocessed_updated_content = []
with open('train.txt', 'r', encoding='utf-8') as file:
    new_file=file.read()
    sentences = new_file.split("\n")
    bigrams = []
    for sentence in sentences:
        preprocessed_sentence = preprocess_text(sentence)
        bigrams.append(list(n_grams(preprocessed_sentence.split(), 2)))
        for word in unknown_words_train:
            preprocessed_sentence.replace(word,'<UNK>')
        preprocessed_updated_content.append(preprocessed_sentence)

with open('new_train.txt', 'w', encoding='utf-8') as file:
    for new_sentence in preprocessed_updated_content:
        file.write(new_sentence)
        file.write("\n")

Calculating bigram probabilites after unknown handling

In [49]:
with open('new_train.txt','r',encoding='utf-8') as file:
    new_corpus=file.read()
#creating bigrams for each review
train_sentences=new_corpus.split('\n')

new_bigrams= n_grams(new_corpus.split(),2)
new_bigrams = [(pair[0],pair[1]) for pair in new_bigrams]
new_bigram_dict={}
for pair in new_bigrams:
    if((pair[0],pair[1]) in new_bigram_dict):
        new_bigram_dict[(pair[0],pair[1])]+=1
    else:
        new_bigram_dict[(pair[0],pair[1])]=1

new_bigram_prob={}
for pair in new_bigrams:
    count_pair=new_bigram_dict[pair]
    if (pair[0] in new_sorted_dict):
        new_bigram_prob[pair]=(count_pair)/(new_sorted_dict[pair[0]])
    else:
        new_bigram_prob[pair]=(count_pair)/(new_sorted_dict['<UNK>'])


Smoothed bigram probabilites

In [79]:
smooth_bigram_prob={}
factor = 5 #hyper-parameter
for pair in new_bigrams:
    count_pair=new_bigram_dict[pair]
    if (pair[0] in new_sorted_dict):
        smooth_bigram_prob[pair]=(count_pair+factor)/(new_sorted_dict[pair[0]]+factor*total_vocab)
    else:
        smooth_bigram_prob[pair]=(count_pair+factor)/(new_sorted_dict['<UNK>']+factor*total_vocab)

In [55]:
import math
total_vocab_val=0
with open('val.txt', 'r', encoding='utf-8') as file:
    valtxt = str(file.read().lower())
valtxt = preprocess_text(valtxt)
word_frequency_dict_val = Counter(valtxt.split())
total_vocab_val=sum(word_frequency_dict_val.values())
unique_vocab=len(word_frequency_dict_val)

Unigram Perplexity

In [82]:
total_log_unigram = 0
sentence_unigram = n_grams(valtxt.split(), 1)
log_sum_unigram = 0
for word in sentence_unigram:
    if word[0] in smooth_unigram_prob:
        if unigram_prob[word[0]]!=0:
            log_sum_unigram+=math.log(smooth_unigram_prob[word[0]])
        else:
            log_sum_unigram+=math.log(count/total_vocab)
    else:
        log_sum_unigram+=math.log(count/total_vocab)
total_log_unigram += log_sum_unigram

total_log_unigram *= -1/total_vocab_val
total_perplexity_unigram = math.exp(total_log_unigram)
print("Perplexity:", total_perplexity_unigram)

Perplexity: 1624.0819671836339


Bigram Perplexity

In [80]:
total_perplexity = 0
total_log = 0
sentence_bigram = n_grams(valtxt.split(), 2)
log_sum = 0
for pairs in sentence_bigram:
  if tuple(pairs) in smooth_bigram_prob:
      if bigram_prob[tuple(pairs)]!=0:
          log_sum+=math.log(smooth_bigram_prob[tuple(pairs)])
      else:
          log_sum+=math.log(count/total_vocab)
  else:
      log_sum+=math.log(count/total_vocab)
total_log += log_sum

total_log *= -1/total_vocab_val
total_perplexity = math.exp(total_log)
print("Perplexity:", total_perplexity)

Perplexity: 1418.5965695615337


In [None]:
1