In [76]:
import numpy as np
import pandas as pd
import nltk
import re
from collections import defaultdict

In [91]:
n = 3
min_freq = 2
nltk.download('punkt')
text_path = '/Users/djemec/data/wiki/clemson_football.txt'
text_file = open(text_path, 'r')
text = text_file.read()
text_file.close()

[nltk_data] Downloading package punkt to /Users/djemec/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [113]:
t_proc = text.replace( '\n','').lower()
t_proc = re.sub(r"[^a-zA-Z0-9.?! ]+", "", t_proc)
t_proc = [i.strip() for i in t_proc.split(".") if i != '']
t_proc

['the clemson tigers are the american football team at clemson university',
 'the tigers compete in the ncaa division i football bowl subdivision fbs of the national collegiate athletic association ncaa and the atlantic division of the atlantic coast conference acc',
 'in recent years the tigers have been ranked among the most elite college football programs in the united states',
 'formed in 1896 the program has over 750 wins and three consensus national championships in the modern era',
 'clemson was a college football playoff finalist in 2015 2016 2018 and 2019 winning the championship game over alabama in 2016 and 2018',
 'clemson has had six undefeated seasons six consecutive playoff appearances 26 conference championships and eight divisional titles',
 'its alumni includes over 100 allamericans 17 academic allamericans and over 250 players in the national football league',
 'clemson has had seven members inducted into the college football hall of fame players banks mcfadden terry

In [110]:
l_1 = ['hello','how','are']
l_2 = ['hi','how','you']
tuple([x if x in l_1 else 'unk' for x in l_2 ])

('unk', 'how', 'unk')

In [129]:
def corpus_to_ngram_ct_mat(s_list, n=3, min_freq=2):
    """
    Creates the n-gram count matrix from the input corpus in a single pass through the corpus.
    
    Args:
        corpus: Pre-processed and tokenized corpus. 
    
    Returns:
        bigrams: list of all bigram prefixes, row index
        vocabulary: list of all found words, the column index
        count_matrix: pandas dataframe with bigram prefixes as rows, 
                      vocabulary words as columns 
                      and the counts of the bigram/word combinations (i.e. trigrams) as values
    """
    start = '<S>'
    end = '</S>'
    unknown = '<UNK>'
    pregrams = []
    vocabulary = []
    count_matrix_dict = defaultdict(dict)
    # set vocabulary where must appear at least min_freq
    words = re.sub(r"[^a-zA-Z0-9' ]+", "", ' '.join(s_list)).split()
    vocabulary = [i for i in set(words) if words.count(i) >= min_freq] + [unknown]
    vocabulary = vocabulary + [start] + [end]
    
    # process sentence by sentence
    for k,s in enumerate(s_list):
        if k%10 == 0:
            print(f'processing sentence {k}/{len(s_list)}') 
            
        ts = nltk.word_tokenize(s)
        # adds in the start and end
        ts = [start for i in range(n-1)] + ts + [end]
        
        # note that the last position of i is Nth to the end
        for i in range(len(ts) - n + 1):
            # the sliding window starts at position i and contains 3 words
            ngram = tuple(ts[i : i + n])
            
            # replace words not in vocab with unknown
            ngram = tuple([x if x in vocabulary else unknown for x in ngram])
            
            pgram = ngram[0 : -1]
            last_word = ngram[-1]
            
            # add pgram to pregrams (n-1 grams)
            if not pgram in pregrams:
                pregrams.append(pgram)

            if (pgram,last_word) not in count_matrix_dict:
                count_matrix_dict[pgram,last_word] = 0

            count_matrix_dict[pgram,last_word] += 1
    
    # convert the count_matrix to np.array to fill in the blanks
    count_matrix = np.zeros((len(pregrams), len(vocabulary)))
    for ngram_key, ngram_count in count_matrix_dict.items():
        count_matrix[pregrams.index(ngram_key[0]), vocabulary.index(ngram_key[1])] = ngram_count
    
    # np.array to pandas dataframe conversion
    count_matrix = pd.DataFrame(count_matrix, index=pregrams, columns=vocabulary)
    return pregrams, vocabulary, count_matrix

In [117]:
pgrams, vocabulary, count_matrix = corpus_to_ngram_ct_mat(t_proc, n)

count_matrix.head()

processing sentence 0/280
processing sentence 10/280
processing sentence 20/280
processing sentence 30/280
processing sentence 40/280
processing sentence 50/280
processing sentence 60/280
processing sentence 70/280
processing sentence 80/280
processing sentence 90/280
processing sentence 100/280
processing sentence 110/280
processing sentence 120/280
processing sentence 130/280
processing sentence 140/280
processing sentence 150/280
processing sentence 160/280
processing sentence 170/280
processing sentence 180/280
processing sentence 190/280
processing sentence 200/280
processing sentence 210/280
processing sentence 220/280
processing sentence 230/280
processing sentence 240/280
processing sentence 250/280
processing sentence 260/280
processing sentence 270/280


Unnamed: 0,o,retirement,united,valley,150,gave,what,k,4,allamerican,...,finish,previously,2020,wake,up,multiple,1991,<unk>,<s>,</s>
"(<s>, <s>)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0
"(<s>, the)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0
"(the, clemson)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
"(clemson, tigers)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(tigers, are)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [128]:
# create the probability matrix from the count matrix and include smoothing
ct_tmp = count_matrix.copy() +1
row_sums = ct_tmp.sum(axis=1)
# delete each row by its sum
prob_matrix = ct_tmp.div(row_sums, axis=0)
prob_matrix.head()

Unnamed: 0,o,retirement,united,valley,150,gave,what,k,4,allamerican,...,finish,previously,2020,wake,up,multiple,1991,<unk>,<s>,</s>
"(<s>, <s>)",0.002132,0.001066,0.001066,0.001066,0.001066,0.001066,0.001066,0.003198,0.003198,0.001066,...,0.001066,0.001066,0.001066,0.001066,0.001066,0.001066,0.001066,0.026652,0.001066,0.001066
"(<s>, the)",0.00141,0.00141,0.00141,0.00141,0.00141,0.00141,0.00141,0.00141,0.00141,0.00141,...,0.00141,0.00141,0.00141,0.00141,0.00141,0.00141,0.00141,0.012694,0.00141,0.00141
"(the, clemson)",0.001502,0.001502,0.001502,0.001502,0.001502,0.001502,0.001502,0.001502,0.001502,0.001502,...,0.001502,0.001502,0.001502,0.001502,0.001502,0.001502,0.001502,0.004505,0.001502,0.001502
"(clemson, tigers)",0.001513,0.001513,0.001513,0.001513,0.001513,0.001513,0.001513,0.001513,0.001513,0.001513,...,0.001513,0.001513,0.001513,0.001513,0.001513,0.001513,0.001513,0.001513,0.001513,0.001513
"(tigers, are)",0.001517,0.001517,0.001517,0.001517,0.001517,0.001517,0.001517,0.001517,0.001517,0.001517,...,0.001517,0.001517,0.001517,0.001517,0.001517,0.001517,0.001517,0.001517,0.001517,0.001517
