In [1]:
import pandas as pd
import numpy as np
import math
import random
import nltk
nltk.data.path.append('.')

In [2]:
with open("en_US.twitter.txt", "r",encoding="utf-8") as f:
    data1= f.read()

In [3]:
with open("textdata.txt", "r") as f1:
    data2 = f1.read()

In [4]:
def split_to_sentences(data):
    """
    Split data by linebreak "\n"
    Args:
        data: str
    Returns:
        A list of sentences
    """
    sentences = data.split("\n")
    sentences = [s.strip() for s in sentences]
    sentences = [s for s in sentences if len(s) > 0]
    return sentences   

In [5]:
def split_to_sentences1(data):
    """
    Split data by "." "\n"
    Args:
        data: str
    Returns:
        A list of sentences
    """
    sentences = data.split(".")
    sentences=[s+"." for s in sentences]
    sentences = [s.strip() for s in sentences]
    sentences = [s for s in sentences if len(s) > 0]
    return sentences    

In [6]:
def tokenize_sentences(sentences):
    """
    Tokenize sentences into tokens (words)
    
    Args:
        sentences: List of strings
    
    Returns:
        List of lists of tokens
    """
    tokenized_sentences = []
    for sentence in sentences:
        sentence = sentence.lower()
        tokenized = nltk.word_tokenize(sentence)
        tokenized_sentences.append(tokenized)
    return tokenized_sentences

In [7]:
def get_tokenized_data(data,param=0):
    """
    Make a list of tokenized sentences
    
    Args:
        data: String
        param: 0 for first datasets 1 for second
    
    Returns:
        List of lists of tokens
    """
    if param==0:
        sentences = split_to_sentences(data)
    if param==1:
        sentences = split_to_sentences1(data)
    tokenized_sentences = tokenize_sentences(sentences)
    return tokenized_sentences

In [10]:
#data=data1+data2
tokenized_data1 = get_tokenized_data(data1,0)
tokenized_data2 = get_tokenized_data(data2,1)
tokenized_data=tokenized_data1+tokenized_data2
random.seed(101)
random.shuffle(tokenized_data)

train_size = int(len(tokenized_data) * 0.8)
train_data = tokenized_data[0:train_size]
test_data = tokenized_data[train_size:]

In [11]:
print("---------------DATASET SUMMARY------------------------------------")
print("{} data are split into {} train and {} test set".format(
    len(tokenized_data), len(train_data), len(test_data)))
print("First training sample:")
print(train_data[0])
      
print("First test sample")
print(test_data[0])

---------------DATASET SUMMARY------------------------------------
465470 data are split into 372376 train and 93094 test set
First training sample:
['so', 'thats', 'when', 'i', 'started', 'reading', 'everything', 'i', 'could', 'about', 'psychoactive', 'drugs', 'the', 'history', 'the', 'science', 'the', 'politics', 'all', 'of', 'it', 'and', 'the', 'more', 'one', 'read', 'the', 'more', 'it', 'hit', 'you', 'how', 'a', 'thoughtful', 'enlightened', 'intelligent', 'approach', 'took', 'you', 'over', 'here', 'whereas', 'the', 'politics', 'and', 'laws', 'of', 'my', 'country', 'were', 'taking', 'you', 'over', 'here', '.']
First test sample
['and', 'this', 'building', 'was', 'made', 'platinum', 'leed', '.']


In [12]:
def count_words(tokenized_sentences):
    """
    Count the number of word appearence in the tokenized sentences
    
    Args:
        tokenized_sentences: List of lists of strings
    
    Returns:
        dict that maps word (str) to the frequency (int)
    """
        
    word_counts = {}
    for sentence in tokenized_sentences:
        for token in sentence: 
            if token not in word_counts.keys(): 
                word_counts[token] = 1
            else:
                word_counts[token] += 1
    return word_counts

In [13]:
def get_words_with_nplus_frequency(tokenized_sentences, count_threshold):
    """
    Find the words that appear N times or more
    
    Args:
        tokenized_sentences: List of lists of sentences
        count_threshold: minimum number of occurrences for a word to be in the closed vocabulary.
    
    Returns:
        List of words that appear N times or more
    """
    closed_vocab = []
    word_counts = count_words(tokenized_sentences)
    for word, cnt in word_counts.items():
        if cnt>=count_threshold:
            closed_vocab.append(word)
    return closed_vocab

In [14]:
def replace_oov_words_by_unk(tokenized_sentences, vocabulary, unknown_token="<unk>"):
    """
    Replace words not in the given vocabulary with '<unk>' token.
    
    Args:
        tokenized_sentences: List of lists of strings
        vocabulary: List of strings that we will use
        unknown_token: A string representing unknown (out-of-vocabulary) words
    
    Returns:
        List of lists of strings, with words not in the vocabulary replaced
    """
    vocabulary = set(vocabulary)
    replaced_tokenized_sentences = []
    for sentence in tokenized_sentences:
        replaced_sentence = []
        for token in sentence: 
            if token in vocabulary: 
                replaced_sentence.append(token)
            else:
                replaced_sentence.append(unknown_token)
        replaced_tokenized_sentences.append(replaced_sentence)   
    return replaced_tokenized_sentences

In [15]:
def preprocess_data(train_data, test_data, count_threshold):
    """
    Preprocess data, i.e.,
        - Find tokens that appear at least N times in the training data.
        - Replace tokens that appear less than N times by "<unk>" both for training and test data.        
    Args:
        train_data, test_data: List of lists of strings.
        count_threshold: Words whose count is less than this are 
                      treated as unknown.
    
    Returns:
        Tuple of
        - training data with low frequent words replaced by "<unk>"
        - test data with low frequent words replaced by "<unk>"
        - vocabulary of words that appear n times or more in the training data
    """
    vocabulary = get_words_with_nplus_frequency(train_data, count_threshold)
    train_data_replaced = replace_oov_words_by_unk(train_data, vocabulary)
    test_data_replaced = replace_oov_words_by_unk(test_data, vocabulary)
    return train_data_replaced, test_data_replaced, vocabulary

In [16]:
min_freq = 2
train_data_processed, test_data_processed, vocabulary = preprocess_data(train_data,test_data,min_freq)

In [17]:
def count_n_grams(data, n, start_token='<s>', end_token = '<e>'):
    """
    Count all n-grams in the data
    
    Args:
        data: List of lists of words
        n: number of words in a sequence
    
    Returns:
        A dictionary that maps a tuple of n-words to its frequency
    """
    n_grams = {}
    for sentence in data: 
        sentence = [start_token]*n+sentence+[end_token]
        sentence = tuple(sentence)
        for i in range(len(sentence)-n+1): 
            n_gram = sentence[i:i+n]
            if n_gram in n_grams:
                n_grams[n_gram] += 1
            else:
                n_grams[n_gram] = 1
    return n_grams

In [18]:
def estimate_probability(word, previous_n_gram, 
                         n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    """
    Estimate the probabilities of a next word using the n-gram counts with k-smoothing
    
    Args:
        word: next word
        previous_n_gram: A sequence of words of length n
        n_gram_counts: Dictionary of counts of n-grams
        n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
        vocabulary_size: number of words in the vocabulary
        k: positive constant, smoothing parameter
    
    Returns:
        A probability
    """
    previous_n_gram = tuple(previous_n_gram)
    previous_n_gram_count = n_gram_counts.get(previous_n_gram,0)
    denominator = previous_n_gram_count+k*abs(vocabulary_size)
    n_plus1_gram = previous_n_gram+(word,)
    n_plus1_gram_count = n_plus1_gram_counts.get(n_plus1_gram,0)
    numerator = n_plus1_gram_count+k
    probability = numerator/denominator
    return probability

In [19]:
def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0):
    """
    Estimate the probabilities of next words using the n-gram counts with k-smoothing
    
    Args:
        previous_n_gram: A sequence of words of length n
        n_gram_counts: Dictionary of counts of n-grams
        n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
        vocabulary: List of words
        k: positive constant, smoothing parameter
    
    Returns:
        A dictionary mapping from next words to the probability.
    """
    previous_n_gram = tuple(previous_n_gram)
    vocabulary = vocabulary + ["<e>", "<unk>"]
    vocabulary_size = len(vocabulary)
    
    probabilities = {}
    for word in vocabulary:
        probability = estimate_probability(word, previous_n_gram, 
                                           n_gram_counts, n_plus1_gram_counts, 
                                           vocabulary_size, k=k)
        probabilities[word] = probability

    return probabilities

In [None]:
def make_count_matrix(n_plus1_gram_counts, vocabulary):
    vocabulary = vocabulary + ["<e>", "<unk>"]
    n_grams = []
    for n_plus1_gram in n_plus1_gram_counts.keys():
        n_gram = n_plus1_gram[0:-1]
        n_grams.append(n_gram)
    n_grams = list(set(n_grams))
    row_index = {n_gram:i for i, n_gram in enumerate(n_grams)}
    col_index = {word:j for j, word in enumerate(vocabulary)}
    nrow = len(n_grams)
    ncol = len(vocabulary)
    count_matrix = np.zeros((nrow, ncol))
    for n_plus1_gram, count in n_plus1_gram_counts.items():
        n_gram = n_plus1_gram[0:-1]
        word = n_plus1_gram[-1]
        if word not in vocabulary:
            continue
        i = row_index[n_gram]
        j = col_index[word]
        count_matrix[i, j] = count
    count_matrix = pd.DataFrame(count_matrix, index=n_grams, columns=vocabulary)
    return count_matrix

In [20]:
def make_probability_matrix(n_plus1_gram_counts, vocabulary, k):
    count_matrix = make_count_matrix(n_plus1_gram_counts, unique_words)
    count_matrix += k
    prob_matrix = count_matrix.div(count_matrix.sum(axis=1), axis=0)
    return prob_matrix

In [21]:
def calculate_perplexity(sentence, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    """
    Calculate perplexity for a list of sentences
    
    Args:
        sentence: List of strings
        n_gram_counts: Dictionary of counts of (n+1)-grams
        n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
        vocabulary_size: number of unique words in the vocabulary
        k: Positive smoothing constant
    
    Returns:
        Perplexity score
    """
    n = len(list(n_gram_counts.keys())[0]) 
    sentence = ["<s>"] * n + sentence + ["<e>"]
    sentence = tuple(sentence)
    N = len(sentence)
    product_pi = 1.0
    for t in range(n, N): 
        n_gram =sentence[t-n:t]
        word = sentence[t]
        probability = estimate_probability(word,n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1)
        product_pi *= 1 / probability
    perplexity = product_pi**(1/float(N))
    return perplexity

In [22]:
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0, start_with=None):
    """
    Get suggestion for the next word
    
    Args:
        previous_tokens: The sentence you input where each token is a word. Must have length > n 
        n_gram_counts: Dictionary of counts of n-grams
        n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
        vocabulary: List of words
        k: positive constant, smoothing parameter
        start_with: If not None, specifies the first few letters of the next word
        
    Returns:
        A tuple of 
          - string of the most likely next word
          - corresponding probability
    """
    n = len(list(n_gram_counts.keys())[0]) 
    previous_n_gram = previous_tokens[-n:]
    probabilities = estimate_probabilities(previous_n_gram,
                                           n_gram_counts, n_plus1_gram_counts,
                                           vocabulary, k=k)
    suggestion = None
    max_prob = 0
    for word, prob in probabilities.items(): 
        if start_with: 
            if start_with not in word: 
                continue
        if prob>max_prob: 
            suggestion = word
            max_prob = prob
    return suggestion, max_prob

In [23]:
def get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with=None):
    model_counts = len(n_gram_counts_list)
    suggestions = []
    for i in range(model_counts-1):
        n_gram_counts = n_gram_counts_list[i]
        n_plus1_gram_counts = n_gram_counts_list[i+1]
        
        suggestion = suggest_a_word(previous_tokens, n_gram_counts,
                                    n_plus1_gram_counts, vocabulary,
                                    k=k, start_with=start_with)
        suggestions.append(suggestion)
    return suggestions

In [24]:
n_gram_counts_list = []
for n in range(1,10 ):
    print("Computing n-gram counts with n =", n, "...")
    n_model_counts = count_n_grams(train_data_processed, n)
    n_gram_counts_list.append(n_model_counts)

Computing n-gram counts with n = 1 ...
Computing n-gram counts with n = 2 ...
Computing n-gram counts with n = 3 ...
Computing n-gram counts with n = 4 ...
Computing n-gram counts with n = 5 ...
Computing n-gram counts with n = 6 ...
Computing n-gram counts with n = 7 ...
Computing n-gram counts with n = 8 ...
Computing n-gram counts with n = 9 ...


In [29]:
previous_tokens = ["How","are","you","doing"]
tmp_suggest4 = get_suggestions(previous_tokens, n_gram_counts_list[1:5], vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
print(tmp_suggest4[0][0])

The previous words are ['How', 'are', 'you', 'doing'], the suggestions are:
?


In [31]:
print(tmp_suggest4)

[('?', 0.0007957190316099385), ('?', 0.0006368539415288474), ('so', 1.9933819718534465e-05)]


In [65]:

a = ["ashik","muna","humayara","tasneem"]

      
    # Its important to use binary mode
f112 = open('test', 'ab')
      
    # source, destination
pickle.dump(a,f112)                     
#dbfile.close()

In [66]:
f113 = open('test', 'rb')     
b = pickle.load(f113)
print(b)
#dbfile.close()

['ashik', 'muna', 'humayara', 'tasneem']


In [None]:
vocabfile=open('vocabulary','ab')
pickle.dump(vocabulary,vocabfile)
nglistfile=open('nglistcount','ab')
pickle.dump(n_gram_counts_list,nglistfile)