<a href="https://colab.research.google.com/github/fuat-arslan/NLP_Course/blob/main/Code_Final_NLP_Assignment3_Collocations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
"""NLP_ASS3v5_final.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1nEM3ad9fSTR7cAomsUBSeoRF3Kf2XVhR
"""

import os
from google.colab import files
if 'Fyodor Dostoyevski Processed.txt' not in os.listdir():
    print('Please upload the document Fyodor Dostoyevski Processed.txt')
    uploaded = files.upload()

import nltk
nltk.download('punkt') #required for tokenizer
nltk.download('averaged_perceptron_tagger') #required for pos_tag
nltk.download('universal_tagset')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import numpy as np
from collections import Counter
import math
import pandas as pd
pd.set_option('display.float_format', '{:.6f}'.format)


# Load the English stop-word list used later by filter_bigrams.
stopwords_list = stopwords.words('english')

"""Read the txt file and tokenize it with nltk"""

# Explicit encoding so the result does not depend on the platform's default
# text encoding.
with open('/content/Fyodor Dostoyevski Processed.txt', 'r', encoding='utf-8') as f:
    text = f.read()

tokens = nltk.tokenize.word_tokenize(text)

# Printed as a 1-tuple to keep the "(n,)" output format without copying every
# token into a NumPy array just to read its length.
print('number of tokens', (len(tokens),)) #total number of tokens

"""POS tags"""

# Universal tagset: the tags compared later are 'NOUN' and 'ADJ'.
pos_tags = nltk.tag.pos_tag(tokens, tagset='universal')

#Example tags
print('Example postags', pos_tags[0:10])

"""Custom Lemmatizer class"""

class custom_lemmatizer:
    """POS-aware wrapper around NLTK's ``WordNetLemmatizer``.

    Only tokens tagged ``ADJ`` or ``NOUN`` are lemmatized; tokens with any
    other tag (verbs and adverbs are deliberately disabled in ``tag_dict``)
    are only lower-cased.
    """

    # Maps universal-tagset POS tags to the WordNet POS constants passed to
    # the lemmatizer. Tags missing from this mapping are not lemmatized.
    tag_dict = {
        "ADJ": wordnet.ADJ,
        "NOUN": wordnet.NOUN,
        # "VERB": wordnet.VERB,
        # "ADV": wordnet.ADV
    }

    # Single lemmatizer instance shared by the whole class.
    lemmatizer = WordNetLemmatizer()

    def lemmatize(self, word_pos_tuple):
        """Return the lower-cased lemma for a ``(word, pos_tag)`` pair.

        Args:
            word_pos_tuple: Pair whose first item is the token and whose
                second item is its universal POS tag.

        Returns:
            The lower-cased lemma when the tag is in ``tag_dict``, otherwise
            the lower-cased token itself.
        """
        word, pos_tag = word_pos_tuple[0], word_pos_tuple[1]

        # Lower-case BEFORE the lookup: the WordNet lemmatizer is
        # case-sensitive, so a capitalized form such as "Dogs" would come
        # back unchanged and end up as "dogs" instead of "dog".
        word = word.lower()

        wordnet_pos = self.tag_dict.get(pos_tag)
        if wordnet_pos is None:
            return word
        return self.lemmatizer.lemmatize(word, wordnet_pos).lower()

lemmatizer = custom_lemmatizer()

# One lower-cased lemma per (token, tag) pair, in the same order as `tokens`.
lemmatized_tokens = [lemmatizer.lemmatize(pos_tag) for pos_tag in pos_tags]

# Side-by-side sample of lemmatized vs. raw tokens starting at index 100.
# NOTE(review): the recorded output shows 'obliged' unchanged at index 100,
# i.e. it is NOT mapped to 'oblige' (the VERB entry is disabled in
# custom_lemmatizer.tag_dict).
print('example lemmatized vs normal:', lemmatized_tokens[100:109], tokens[100:109])



"""#Candidate Collocation Generator



"""

def filter_bigrams(lemmatized_tokens, pos_tags, window_size,threshold):
    """Return candidate collocations that pass the POS and frequency filters.

    Every token is paired with each of the next ``window_size`` tokens. A pair
    ``(w1, w2)`` becomes a candidate when both words are purely alphabetic,
    neither is in the module-level ``stopwords_list``, and the tags form
    NOUN-NOUN or ADJ-NOUN. A candidate is kept when the pair occurs at least
    ``threshold`` times among ALL windowed pairs (the count ignores POS tags).

    Args:
        lemmatized_tokens: Sequence of lemmatized token strings.
        pos_tags: Sequence of ``(token, tag)`` pairs aligned with
            ``lemmatized_tokens``.
        window_size: Number of following tokens each token is paired with.
        threshold: Minimum pair count required to keep a candidate.

    Returns:
        List of ``(w1, w2)`` tuples, ordered by first qualifying occurrence.
    """
    n_tokens = len(lemmatized_tokens)
    # Pairs can only be POS-checked where a tag exists for both positions.
    n_tagged = min(n_tokens, len(pos_tags))
    # Set membership is O(1); the list scan ran once per pair before.
    stop_words = set(stopwords_list)

    pair_counts = Counter()
    candidates = {}  # used as an insertion-ordered set

    for i in range(n_tokens - 1):
        first = lemmatized_tokens[i]
        first_ok = (
            i < n_tagged
            and pos_tags[i][1] in ('NOUN', 'ADJ')
            and first.isalpha()
            and first not in stop_words
        )
        # Explicit bound instead of catching IndexError at the end of the
        # text (the old bare ``except`` also hid every unrelated error).
        last = min(i + window_size, n_tokens - 1)
        for k in range(i + 1, last + 1):
            second = lemmatized_tokens[k]
            pair = (first, second)
            pair_counts[pair] += 1
            if (first_ok
                    and k < n_tagged
                    and pos_tags[k][1] == 'NOUN'
                    and second.isalpha()
                    and second not in stop_words):
                candidates.setdefault(pair, None)

    return [pair for pair in candidates if pair_counts[pair] >= threshold]

# Candidate bigrams for window size 1 with a frequency threshold of 10.
# NOTE(review): the same call is repeated in the "part 1b" section before the
# result is first read, so this earlier computation looks redundant.
filtered_bigram_list1 = filter_bigrams(lemmatized_tokens,pos_tags,1,10)
#filtered_bigram_list3 = filter_bigrams(lemmatized_tokens,pos_tags,3,10)


"""#Collocation list"""

def generate_collocations(tokens, window_size):
    """Pair every token with each of the next ``window_size`` tokens.

    Args:
        tokens: Sequence of tokens.
        window_size: Number of following tokens each token is paired with.

    Returns:
        List of ``(tokens[i], tokens[k])`` tuples for ``i < k <= i +
        window_size``, ordered by ``i`` and then ``k``. The last token never
        starts a pair, and windows are cut off at the end of the sequence.
    """
    n_tokens = len(tokens)
    # The window is clamped explicitly instead of relying on a bare
    # ``except`` to swallow the IndexError at the end of the sequence.
    return [
        (tokens[i], tokens[k])
        for i in range(n_tokens - 1)
        for k in range(i + 1, min(i + window_size, n_tokens - 1) + 1)
    ]

"""#Probs

In this part, the marginal (single-word) probabilities and the bigram probabilities are estimated as count / total (frequentist approach).
"""

from scipy.stats import t as t_dist
from scipy.stats import norm
from scipy.stats import chi2, binom

class test_stats_calculator:
    """Collocation statistics over a token sequence.

    Tokens and windowed token pairs are counted once at construction time.
    Three association tests are offered (t-test, Pearson chi-square and
    likelihood ratio); each returns ``(is_collocation, score)``.
    """

    def __init__(self, tokens, window_size=3):
        """Count the tokens and all windowed pairs.

        Args:
            tokens: Sequence of (lemmatized) tokens.
            window_size: Number of following tokens each token is paired with.
        """
        self.tokens = tokens
        self.coll = self.generate_collocations(tokens,window_size)
        self.token_counter = Counter(self.tokens)
        self.coll_counter = Counter(self.coll)
        self.window_size = window_size
        # Nominal number of pair slots: every token is treated as opening
        # `window_size` pairs (the last few tokens actually open fewer).
        self.N = len(tokens)*self.window_size

    def generate_collocations(self, tokens, window_size):
        """Pair every token with each of the next ``window_size`` tokens.

        Returns:
            List of ``(tokens[i], tokens[k])`` tuples for ``i < k <= i +
            window_size``, ordered by ``i`` and then ``k``.
        """
        n_tokens = len(tokens)
        # Explicit window bound instead of a bare ``except`` around the
        # out-of-range lookup at the end of the sequence.
        return [
            (tokens[i], tokens[k])
            for i in range(n_tokens - 1)
            for k in range(i + 1, min(i + window_size, n_tokens - 1) + 1)
        ]

    def calc_prob(self, itm):
        """Return the relative frequency of a token or of a token pair.

        Args:
            itm: A ``str`` (single token) or a ``tuple`` (token pair).

        Returns:
            count / total as a float, or ``None`` (after printing a notice)
            when the item was never observed.

        Raises:
            TypeError: If ``itm`` is neither a ``str`` nor a ``tuple``.
        """
        if isinstance(itm, str):
            if itm in self.token_counter:
                # len(self.tokens) equals the sum of all token counts and
                # avoids re-summing the whole counter on every call.
                return self.token_counter[itm]/len(self.tokens)
            print('Input does not exist in the text!')
            return None

        if isinstance(itm, tuple):
            if itm in self.coll_counter:
                # Likewise len(self.coll) is the total number of pairs.
                return self.coll_counter[itm]/len(self.coll)
            print('Input collocation does not exist in the text!')
            return None

        raise TypeError('This type cannot be handeled')

    def t_dist_test(self, collocation, alpha=0.005):
        """One-sided t-test of independence for a token pair.

        N is very large, so the t distribution is replaced by the standard
        normal distribution (infinite degrees of freedom) for the critical
        value. The sample variance is approximated by ``p`` instead of
        ``p * (1 - p)``.

        Args:
            collocation: ``(w1, w2)`` tuple.
            alpha: Significance level of the one-sided test.

        Returns:
            ``(is_collocation, t_value)``.

        Raises:
            AssertionError: If either word or the pair was never observed.
        """
        prob0 = self.calc_prob(collocation[0])
        prob1 = self.calc_prob(collocation[1])

        assert prob0 is not None, 'input words does not exist'
        assert prob1 is not None, 'input words does not exist'
        # H0: the two words occur independently of each other.
        H0 = prob0 * prob1

        p = self.calc_prob(collocation)

        assert p is not None, 'collocation does not exist'

        S2 = p  # approximation of p*(1-p)

        t_val = (p - H0)/math.sqrt(S2/self.N)

        return bool(t_val > norm.ppf(1 - alpha)), t_val

    def chi_square_test(self,collocation, alpha=0.005):
        """Pearson chi-square test on the 2x2 table of a token pair.

        Args:
            collocation: ``(w1, w2)`` tuple.
            alpha: Significance level.

        Returns:
            ``(is_collocation, chi_square_value)``. For a pair that was never
            observed a notice is printed and ``(False, 0.0)`` is returned, so
            callers can always unpack the result (a bare ``0`` was returned
            before, which broke ``score = test(...)[1]``).

        Raises:
            TypeError: If ``collocation`` is not a tuple.
        """
        if not isinstance(collocation, tuple):
            raise TypeError('This type cannot be handeled')

        if collocation not in self.coll_counter:
            print('This collocation does not exist')
            return False, 0.0

        pair_count = self.coll_counter[collocation]
        first_slots = self.token_counter[collocation[0]]*self.window_size
        second_slots = self.token_counter[collocation[1]]*self.window_size

        # 2x2 table: [0,0] both words, [1,0] only w1, [0,1] only w2,
        # [1,1] neither word.
        observed = np.zeros((2,2))
        observed[0,0] = pair_count
        observed[1,0] = first_slots - pair_count
        observed[0,1] = second_slots - pair_count
        observed[1,1] = self.N - first_slots - observed[0,1]

        # Closed form of the chi-square statistic for a 2x2 table.
        chi_val = (self.N*(observed[0,0]*observed[1,1]-observed[0,1]*observed[1,0])**2)/(
            (observed[0,0]+observed[0,1])*(observed[0,0]+observed[1,0])*
             (observed[0,1]+observed[1,1])*(observed[1,0]+observed[1,1]))

        df = (observed.shape[0] - 1)*(observed.shape[1] - 1)

        return bool(chi_val > chi2.ppf(1 - alpha,df)), chi_val

    def loglikelihood_test(self,collocation, alpha=0.005):
        """Likelihood-ratio test (``-2 log lambda``) for a token pair.

        H0 uses one probability ``p`` for w2 regardless of w1; H1 uses ``p1``
        after w1 and ``p2`` elsewhere. The statistic is compared with the
        chi-square distribution with one degree of freedom.

        The binomial likelihoods are combined in log space. Multiplying raw
        ``pmf`` values underflows to 0 on a large corpus, which previously
        capped the score at ``-2 * log(5e-324)`` (about 1488.9) and made
        strong collocations indistinguishable.

        Args:
            collocation: ``(w1, w2)`` tuple.
            alpha: Significance level.

        Returns:
            ``(is_collocation, statistic)``.
        """
        c1 = self.token_counter[collocation[0]]*self.window_size
        c2 = self.token_counter[collocation[1]]*self.window_size
        c12 = self.coll_counter[collocation]
        rest = self.N - c1

        p = c2/self.N
        p1 = c12/c1
        p2 = (c2 - c12)/rest

        log_l_h0 = binom.logpmf(c12, c1, p) + binom.logpmf(c2 - c12, rest, p)
        log_l_h1 = binom.logpmf(c12, c1, p1) + binom.logpmf(c2 - c12, rest, p2)

        log_val = -2*(log_l_h0 - log_l_h1)

        return bool(log_val > chi2.ppf(1 - alpha,1)), log_val

"""#Report top k"""

def report_topk(k, lemmatized_tokens, pos_tags, test='t_test', window_size = 3, filter_freq=10):
    """Rank candidate collocations by one test statistic and return the top k.

    Args:
        k: Number of rows to return.
        lemmatized_tokens: Sequence of lemmatized tokens.
        pos_tags: ``(token, tag)`` pairs aligned with ``lemmatized_tokens``.
        test: ``'t_test'``, ``'chi_test'`` or ``'log_test'``.
        window_size: Window used both for candidate generation and counting.
        filter_freq: Minimum pair count passed to ``filter_bigrams``.

    Returns:
        A DataFrame with the columns ``Bigram``, the score column of the
        chosen test, ``c(w1w2)``, ``c(w1)`` and ``c(w2)``, sorted by score in
        descending order and cut to ``k`` rows. For an unknown ``test`` a
        notice is printed and ``None`` is returned.
    """
    # test name -> (score column, name of the tester method producing it)
    score_specs = {
        't_test': ('t-score', 't_dist_test'),
        'chi_test': ('chi-score', 'chi_square_test'),
        'log_test': ('loglikelihood-score', 'loglikelihood_test'),
    }
    spec = score_specs.get(test) if isinstance(test, str) else None
    if spec is None:
        # Checked first so that a typo does not cost a full pass over the
        # corpus before being reported.
        print('test does not exist')
        return None
    score_column, method_name = spec

    candidate_list = filter_bigrams(lemmatized_tokens,pos_tags,window_size,filter_freq)
    tester = test_stats_calculator(lemmatized_tokens,window_size)
    score_of = getattr(tester, method_name)

    # Collect plain rows and build the frame once; concatenating a one-row
    # frame per candidate copies the whole table on every iteration.
    rows = [
        {
            'Bigram': str(collocation[0]) + ' ' + str(collocation[1]),
            score_column: score_of(collocation)[1],
            'c(w1w2)': tester.coll_counter[collocation],
            'c(w1)': tester.token_counter[collocation[0]],
            'c(w2)': tester.token_counter[collocation[1]],
        }
        for collocation in candidate_list
    ]
    df = pd.DataFrame(rows, columns=['Bigram', score_column, 'c(w1w2)','c(w1)','c(w2)'])

    sorted_df = df.sort_values(by=score_column, ascending=False)

    return sorted_df.head(k).reset_index(drop=True)

# top_k = report_topk(20,lemmatized_tokens,pos_tags,'t_test', window_size=1)
# print(top_k)

# top_k = report_topk(20,lemmatized_tokens,pos_tags,'chi_test', window_size=1)
# print(top_k)

"""#Answers

part 1a
"""

# Part 1a: size of the tokenized corpus.
print('Total number of tokens: ',len(tokens))

"""part 1b"""

# Frequencies of individual lemmatized (lower-cased) tokens.
lemmatized_counter = Counter(lemmatized_tokens)

print('Number of "that": ', lemmatized_counter['that'] )
print('Number of "the": ', lemmatized_counter['the'] )
print('Number of "abject": ', lemmatized_counter['abject'] )
print('Number of "london": ', lemmatized_counter['london'] )
print('Number of ".": ', lemmatized_counter['.'] )

# Pair counts for window size 1 (adjacent tokens only).
tester1 = test_stats_calculator(lemmatized_tokens,1)
print('Number of (“magnificent”,“capital”) with window size 1 =', tester1.coll_counter[('magnificent','capital')])

# Pair counts for window size 3.
tester3 = test_stats_calculator(lemmatized_tokens,3)
print('Number of (“bright”,"fire") with window size 3 =', tester3.coll_counter[('bright','fire')])

# Candidate lists after the POS / stop-word / frequency (>= 10) filters.
filtered_bigram_list1 = filter_bigrams(lemmatized_tokens,pos_tags,1,10)
filtered_bigram_list3 = filter_bigrams(lemmatized_tokens,pos_tags,3,10)



# NOTE(review): filter_bigrams only keeps words for which str.isalpha() is
# true, so a pair containing 'mr.' (with the period) can never be in the list.
print('Is Mr skimpole exist: ', ('mr.', 'skimpole') in filtered_bigram_list1)
print('Is spontaneous combustion exist: ', ('spontaneous','combustion') in filtered_bigram_list3)

print('Number of occurance of spontaneous combustion:',tester3.coll_counter[('spontaneous','combustion')])

"""part 2"""

# Top-20 tables for each test with window size 1, then with window size 3.
# Each report_topk call rebuilds the candidate list and the pair counts.
top_k_t1 = report_topk(20,lemmatized_tokens,pos_tags,'t_test',window_size = 1)
print('t-test with window size 1')
print(top_k_t1)

top_k_chi1 = report_topk(20,lemmatized_tokens,pos_tags,'chi_test',window_size = 1)
print('chi-test with window size 1')
print(top_k_chi1)

top_k_log1 = report_topk(20,lemmatized_tokens,pos_tags,'log_test',window_size = 1)
print('likelihood ratio test with window size 1')
print(top_k_log1)

top_k_t3 = report_topk(20,lemmatized_tokens,pos_tags,'t_test',window_size = 3)
print('t-test with window size 3')
print(top_k_t3)

top_k_chi3 = report_topk(20,lemmatized_tokens,pos_tags,'chi_test',window_size = 3)
print('chi-test with window size 3')
print(top_k_chi3)

top_k_log3 = report_topk(20,lemmatized_tokens,pos_tags,'log_test',window_size = 3)
print('likelihood ratio test with window size 3')
print(top_k_log3)

"""Part 3"""

# NOTE(review): built with the same arguments as the tester1 of part 1b.
tester1 = test_stats_calculator(lemmatized_tokens,1)

# All three tests for ('head', 'clerk') at significance level alpha.
print('for Head Clerk')
part3coll = ('head', 'clerk')
alpha = 0.005
coll_bool_t, t_score = tester1.t_dist_test(part3coll,alpha)
coll_bool_chi, chi_score = tester1.chi_square_test(part3coll,alpha)
coll_bool_log, log_score = tester1.loglikelihood_test(part3coll,alpha)

print("T-test: ", t_score, 'Collocation?', coll_bool_t)
print("Chi-square Test: ", chi_score, 'Collocation?', coll_bool_chi)
print("Likelihood Ratio Test: ", log_score, 'Collocation?', coll_bool_log)

# Same three tests for ('great', 'man').
print('for great man')
part3coll2 = ('great', 'man')
alpha = 0.005
coll_bool_t, t_score = tester1.t_dist_test(part3coll2,alpha)
coll_bool_chi, chi_score = tester1.chi_square_test(part3coll2,alpha)
coll_bool_log, log_score = tester1.loglikelihood_test(part3coll2,alpha)

print("T-test: ", t_score, 'Collocation?', coll_bool_t)
print("Chi-square Test: ", chi_score, 'Collocation?', coll_bool_chi)
print("Likelihood Ratio Test: ", log_score, 'Collocation?', coll_bool_log)





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


number of tokens (1425758,)
Example postags [('part', 'NOUN'), ('i', 'VERB'), ('chapter', 'NOUN'), ('i', 'NOUN'), ('on', 'ADP'), ('an', 'DET'), ('exceptionally', 'ADV'), ('hot', 'ADJ'), ('evening', 'VERB'), ('early', 'ADJ')]
example lemmatized vs normal: ['obliged', 'to', 'pass', 'her', 'kitchen', ',', 'the', 'door', 'of'] ['obliged', 'to', 'pass', 'her', 'kitchen', ',', 'the', 'door', 'of']
Total number of tokens:  1425758
Number of "that":  19429
Number of "the":  48392
Number of "abject":  21
Number of "london":  2
Number of ".":  51738
Number of (“magnificent”,“capital”) with window size 1 = 1
Number of (“bright”,"fire") with window size 3 = 1
Is Mr skimpole exist:  False
Is spontaneous combustion exist:  False
Number of occurance of spontaneous combustion: 1
t-test with window size 1
                     Bigram   t-score c(w1w2) c(w1) c(w2)
0       stepan trofimovitch 22.619077     512   525   513
1        pyotr stepanovitch 22.547839     509   834   509
2          varvara petrovn

In [None]:
# -*- coding: utf-8 -*-
"""Code_Final_NLP_Ass3.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1ag0DiEqg3XwnmZ_H6hiyWm7PR4h65TDp
"""

# -*- coding: utf-8 -*-
"""NLP_ASS3v5_final.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1nEM3ad9fSTR7cAomsUBSeoRF3Kf2XVhR
"""

import os
from google.colab import files
if 'Fyodor Dostoyevski Processed.txt' not in os.listdir():
    print('Please upload the document Fyodor Dostoyevski Processed.txt')
    uploaded = files.upload()

import nltk
nltk.download('punkt') #required for tokenizer
nltk.download('averaged_perceptron_tagger') #required for pos_tag
nltk.download('universal_tagset')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import numpy as np
from collections import Counter
import math
import pandas as pd
pd.set_option('display.float_format', '{:.6f}'.format)


# Load the English stop-word list used later by filter_bigrams.
stopwords_list = stopwords.words('english')

"""Read the txt file and tokenize it with nltk"""

# Explicit encoding so the result does not depend on the platform's default
# text encoding.
with open('/content/Fyodor Dostoyevski Processed.txt', 'r', encoding='utf-8') as f:
    text = f.read()

tokens = nltk.tokenize.word_tokenize(text)

# Printed as a 1-tuple to keep the "(n,)" output format without copying every
# token into a NumPy array just to read its length.
print('number of tokens', (len(tokens),)) #total number of tokens

"""POS tags"""

# Universal tagset: the tags compared later are 'NOUN' and 'ADJ'.
pos_tags = nltk.tag.pos_tag(tokens, tagset='universal')

#Example tags
print('Example postags', pos_tags[0:10])

"""Custom Lemmatizer class"""

class custom_lemmatizer:
    """POS-aware wrapper around NLTK's ``WordNetLemmatizer``.

    Only tokens tagged ``ADJ`` or ``NOUN`` are lemmatized; tokens with any
    other tag (verbs and adverbs are deliberately disabled in ``tag_dict``)
    are only lower-cased.
    """

    # Maps universal-tagset POS tags to the WordNet POS constants passed to
    # the lemmatizer. Tags missing from this mapping are not lemmatized.
    tag_dict = {
        "ADJ": wordnet.ADJ,
        "NOUN": wordnet.NOUN,
        # "VERB": wordnet.VERB,
        # "ADV": wordnet.ADV
    }

    # Single lemmatizer instance shared by the whole class.
    lemmatizer = WordNetLemmatizer()

    def lemmatize(self, word_pos_tuple):
        """Return the lower-cased lemma for a ``(word, pos_tag)`` pair.

        Args:
            word_pos_tuple: Pair whose first item is the token and whose
                second item is its universal POS tag.

        Returns:
            The lower-cased lemma when the tag is in ``tag_dict``, otherwise
            the lower-cased token itself.
        """
        word, pos_tag = word_pos_tuple[0], word_pos_tuple[1]

        # Lower-case BEFORE the lookup: the WordNet lemmatizer is
        # case-sensitive, so a capitalized form such as "Dogs" would come
        # back unchanged and end up as "dogs" instead of "dog".
        word = word.lower()

        wordnet_pos = self.tag_dict.get(pos_tag)
        if wordnet_pos is None:
            return word
        return self.lemmatizer.lemmatize(word, wordnet_pos).lower()

lemmatizer = custom_lemmatizer()

# One lower-cased lemma per (token, tag) pair, in the same order as `tokens`.
lemmatized_tokens = [lemmatizer.lemmatize(pos_tag) for pos_tag in pos_tags]

# Side-by-side sample of lemmatized vs. raw tokens starting at index 100.
# NOTE(review): the recorded output shows 'obliged' unchanged at index 100,
# i.e. it is NOT mapped to 'oblige' (the VERB entry is disabled in
# custom_lemmatizer.tag_dict).
print('example lemmatized vs normal:', lemmatized_tokens[100:109], tokens[100:109])



"""#Candidate Collocation Generator



"""

def filter_bigrams(lemmatized_tokens, pos_tags, window_size,threshold):
    """Return candidate collocations that pass the POS and frequency filters.

    Every token is paired with each of the next ``window_size`` tokens. A pair
    ``(w1, w2)`` becomes a candidate when both words are purely alphabetic,
    neither is in the module-level ``stopwords_list``, and the tags form
    NOUN-NOUN or ADJ-NOUN. A candidate is kept when the pair occurs at least
    ``threshold`` times among ALL windowed pairs (the count ignores POS tags).

    Args:
        lemmatized_tokens: Sequence of lemmatized token strings.
        pos_tags: Sequence of ``(token, tag)`` pairs aligned with
            ``lemmatized_tokens``.
        window_size: Number of following tokens each token is paired with.
        threshold: Minimum pair count required to keep a candidate.

    Returns:
        List of ``(w1, w2)`` tuples, ordered by first qualifying occurrence.
    """
    n_tokens = len(lemmatized_tokens)
    # Pairs can only be POS-checked where a tag exists for both positions.
    n_tagged = min(n_tokens, len(pos_tags))
    # Set membership is O(1); the list scan ran once per pair before.
    stop_words = set(stopwords_list)

    pair_counts = Counter()
    candidates = {}  # used as an insertion-ordered set

    for i in range(n_tokens - 1):
        first = lemmatized_tokens[i]
        first_ok = (
            i < n_tagged
            and pos_tags[i][1] in ('NOUN', 'ADJ')
            and first.isalpha()
            and first not in stop_words
        )
        # Explicit bound instead of catching IndexError at the end of the
        # text (the old bare ``except`` also hid every unrelated error).
        last = min(i + window_size, n_tokens - 1)
        for k in range(i + 1, last + 1):
            second = lemmatized_tokens[k]
            pair = (first, second)
            pair_counts[pair] += 1
            if (first_ok
                    and k < n_tagged
                    and pos_tags[k][1] == 'NOUN'
                    and second.isalpha()
                    and second not in stop_words):
                candidates.setdefault(pair, None)

    return [pair for pair in candidates if pair_counts[pair] >= threshold]

# Candidate bigrams for window size 1 with a frequency threshold of 10.
# NOTE(review): the same call is repeated in the "part 1b" section before the
# result is first read, so this earlier computation looks redundant.
filtered_bigram_list1 = filter_bigrams(lemmatized_tokens,pos_tags,1,10)
#filtered_bigram_list3 = filter_bigrams(lemmatized_tokens,pos_tags,3,10)


"""#Collocation list"""

def generate_collocations(tokens, window_size):
    """Pair every token with each of the next ``window_size`` tokens.

    Args:
        tokens: Sequence of tokens.
        window_size: Number of following tokens each token is paired with.

    Returns:
        List of ``(tokens[i], tokens[k])`` tuples for ``i < k <= i +
        window_size``, ordered by ``i`` and then ``k``. The last token never
        starts a pair, and windows are cut off at the end of the sequence.
    """
    n_tokens = len(tokens)
    # The window is clamped explicitly instead of relying on a bare
    # ``except`` to swallow the IndexError at the end of the sequence.
    return [
        (tokens[i], tokens[k])
        for i in range(n_tokens - 1)
        for k in range(i + 1, min(i + window_size, n_tokens - 1) + 1)
    ]

"""#Probs

In this part, the marginal (single-word) probabilities and the bigram probabilities are estimated as count / total (frequentist approach).
"""

from scipy.stats import t as t_dist
from scipy.stats import norm
from scipy.stats import chi2, binom

class test_stats_calculator:
    """Collocation statistics over a token sequence.

    Tokens and windowed token pairs are counted once at construction time.
    Three association tests are offered (t-test, Pearson chi-square and
    likelihood ratio); each returns ``(is_collocation, score)``.
    """

    def __init__(self, tokens, window_size=3):
        """Count the tokens and all windowed pairs.

        Args:
            tokens: Sequence of (lemmatized) tokens.
            window_size: Number of following tokens each token is paired with.
        """
        self.tokens = tokens
        self.coll = self.generate_collocations(tokens,window_size)
        self.token_counter = Counter(self.tokens)
        self.coll_counter = Counter(self.coll)
        self.window_size = window_size
        # Nominal number of pair slots: every token is treated as opening
        # `window_size` pairs (the last few tokens actually open fewer).
        self.N = len(tokens)*self.window_size

    def generate_collocations(self, tokens, window_size):
        """Pair every token with each of the next ``window_size`` tokens.

        Returns:
            List of ``(tokens[i], tokens[k])`` tuples for ``i < k <= i +
            window_size``, ordered by ``i`` and then ``k``.
        """
        n_tokens = len(tokens)
        # Explicit window bound instead of a bare ``except`` around the
        # out-of-range lookup at the end of the sequence.
        return [
            (tokens[i], tokens[k])
            for i in range(n_tokens - 1)
            for k in range(i + 1, min(i + window_size, n_tokens - 1) + 1)
        ]

    def calc_prob(self, itm):
        """Return the relative frequency of a token or of a token pair.

        Args:
            itm: A ``str`` (single token) or a ``tuple`` (token pair).

        Returns:
            count / total as a float, or ``None`` (after printing a notice)
            when the item was never observed.

        Raises:
            TypeError: If ``itm`` is neither a ``str`` nor a ``tuple``.
        """
        if isinstance(itm, str):
            if itm in self.token_counter:
                # len(self.tokens) equals the sum of all token counts and
                # avoids re-summing the whole counter on every call.
                return self.token_counter[itm]/len(self.tokens)
            print('Input does not exist in the text!')
            return None

        if isinstance(itm, tuple):
            if itm in self.coll_counter:
                # Likewise len(self.coll) is the total number of pairs.
                return self.coll_counter[itm]/len(self.coll)
            print('Input collocation does not exist in the text!')
            return None

        raise TypeError('This type cannot be handeled')

    def t_dist_test(self, collocation, alpha=0.005):
        """One-sided t-test of independence for a token pair.

        N is very large, so the t distribution is replaced by the standard
        normal distribution (infinite degrees of freedom) for the critical
        value. The sample variance is approximated by ``p`` instead of
        ``p * (1 - p)``.

        Args:
            collocation: ``(w1, w2)`` tuple.
            alpha: Significance level of the one-sided test.

        Returns:
            ``(is_collocation, t_value)``.

        Raises:
            AssertionError: If either word or the pair was never observed.
        """
        prob0 = self.calc_prob(collocation[0])
        prob1 = self.calc_prob(collocation[1])

        assert prob0 is not None, 'input words does not exist'
        assert prob1 is not None, 'input words does not exist'
        # H0: the two words occur independently of each other.
        H0 = prob0 * prob1

        p = self.calc_prob(collocation)

        assert p is not None, 'collocation does not exist'

        S2 = p  # approximation of p*(1-p)

        t_val = (p - H0)/math.sqrt(S2/self.N)

        return bool(t_val > norm.ppf(1 - alpha)), t_val

    def chi_square_test(self,collocation, alpha=0.005):
        """Pearson chi-square test on the 2x2 table of a token pair.

        Args:
            collocation: ``(w1, w2)`` tuple.
            alpha: Significance level.

        Returns:
            ``(is_collocation, chi_square_value)``. For a pair that was never
            observed a notice is printed and ``(False, 0.0)`` is returned, so
            callers can always unpack the result (a bare ``0`` was returned
            before, which broke ``score = test(...)[1]``).

        Raises:
            TypeError: If ``collocation`` is not a tuple.
        """
        if not isinstance(collocation, tuple):
            raise TypeError('This type cannot be handeled')

        if collocation not in self.coll_counter:
            print('This collocation does not exist')
            return False, 0.0

        pair_count = self.coll_counter[collocation]
        first_slots = self.token_counter[collocation[0]]*self.window_size
        second_slots = self.token_counter[collocation[1]]*self.window_size

        # 2x2 table: [0,0] both words, [1,0] only w1, [0,1] only w2,
        # [1,1] neither word.
        observed = np.zeros((2,2))
        observed[0,0] = pair_count
        observed[1,0] = first_slots - pair_count
        observed[0,1] = second_slots - pair_count
        observed[1,1] = self.N - first_slots - observed[0,1]

        # Closed form of the chi-square statistic for a 2x2 table.
        chi_val = (self.N*(observed[0,0]*observed[1,1]-observed[0,1]*observed[1,0])**2)/(
            (observed[0,0]+observed[0,1])*(observed[0,0]+observed[1,0])*
             (observed[0,1]+observed[1,1])*(observed[1,0]+observed[1,1]))

        df = (observed.shape[0] - 1)*(observed.shape[1] - 1)

        return bool(chi_val > chi2.ppf(1 - alpha,df)), chi_val

    def loglikelihood_test(self,collocation, alpha=0.005):
        """Likelihood-ratio test (``-2 log lambda``) for a token pair.

        H0 uses one probability ``p`` for w2 regardless of w1; H1 uses ``p1``
        after w1 and ``p2`` elsewhere. The statistic is compared with the
        chi-square distribution with one degree of freedom.

        The binomial likelihoods are combined in log space. Multiplying raw
        ``pmf`` values underflows to 0 on a large corpus, which previously
        capped the score at ``-2 * log(5e-324)`` (about 1488.9) and made
        strong collocations indistinguishable.

        Args:
            collocation: ``(w1, w2)`` tuple.
            alpha: Significance level.

        Returns:
            ``(is_collocation, statistic)``.
        """
        c1 = self.token_counter[collocation[0]]*self.window_size
        c2 = self.token_counter[collocation[1]]*self.window_size
        c12 = self.coll_counter[collocation]
        rest = self.N - c1

        p = c2/self.N
        p1 = c12/c1
        p2 = (c2 - c12)/rest

        log_l_h0 = binom.logpmf(c12, c1, p) + binom.logpmf(c2 - c12, rest, p)
        log_l_h1 = binom.logpmf(c12, c1, p1) + binom.logpmf(c2 - c12, rest, p2)

        log_val = -2*(log_l_h0 - log_l_h1)

        return bool(log_val > chi2.ppf(1 - alpha,1)), log_val

"""#Report top k"""

def report_topk(k, lemmatized_tokens, pos_tags, test='t_test', window_size = 3, filter_freq=10):
    """Rank candidate collocations by one test statistic and return the top k.

    Args:
        k: Number of rows to return.
        lemmatized_tokens: Sequence of lemmatized tokens.
        pos_tags: ``(token, tag)`` pairs aligned with ``lemmatized_tokens``.
        test: ``'t_test'``, ``'chi_test'`` or ``'log_test'``.
        window_size: Window used both for candidate generation and counting.
        filter_freq: Minimum pair count passed to ``filter_bigrams``.

    Returns:
        A DataFrame with the columns ``Bigram``, the score column of the
        chosen test, ``c(w1w2)``, ``c(w1)`` and ``c(w2)``, sorted by score in
        descending order and cut to ``k`` rows. For an unknown ``test`` a
        notice is printed and ``None`` is returned.
    """
    # test name -> (score column, name of the tester method producing it)
    score_specs = {
        't_test': ('t-score', 't_dist_test'),
        'chi_test': ('chi-score', 'chi_square_test'),
        'log_test': ('loglikelihood-score', 'loglikelihood_test'),
    }
    spec = score_specs.get(test) if isinstance(test, str) else None
    if spec is None:
        # Checked first so that a typo does not cost a full pass over the
        # corpus before being reported.
        print('test does not exist')
        return None
    score_column, method_name = spec

    candidate_list = filter_bigrams(lemmatized_tokens,pos_tags,window_size,filter_freq)
    tester = test_stats_calculator(lemmatized_tokens,window_size)
    score_of = getattr(tester, method_name)

    # Collect plain rows and build the frame once; concatenating a one-row
    # frame per candidate copies the whole table on every iteration.
    rows = [
        {
            'Bigram': str(collocation[0]) + ' ' + str(collocation[1]),
            score_column: score_of(collocation)[1],
            'c(w1w2)': tester.coll_counter[collocation],
            'c(w1)': tester.token_counter[collocation[0]],
            'c(w2)': tester.token_counter[collocation[1]],
        }
        for collocation in candidate_list
    ]
    df = pd.DataFrame(rows, columns=['Bigram', score_column, 'c(w1w2)','c(w1)','c(w2)'])

    sorted_df = df.sort_values(by=score_column, ascending=False)

    return sorted_df.head(k).reset_index(drop=True)

# top_k = report_topk(20,lemmatized_tokens,pos_tags,'t_test', window_size=1)
# print(top_k)

# top_k = report_topk(20,lemmatized_tokens,pos_tags,'chi_test', window_size=1)
# print(top_k)

"""#Answers

part 1a
"""

# Part 1a: size of the tokenized corpus.
print('Total number of tokens: ',len(tokens))

"""part 1b"""

# Frequencies of individual lemmatized (lower-cased) tokens.
lemmatized_counter = Counter(lemmatized_tokens)

print('Number of "that": ', lemmatized_counter['that'] )
print('Number of "the": ', lemmatized_counter['the'] )
print('Number of "abject": ', lemmatized_counter['abject'] )
print('Number of "london": ', lemmatized_counter['london'] )
print('Number of ".": ', lemmatized_counter['.'] )

# Pair counts for window size 1 (adjacent tokens only).
tester1 = test_stats_calculator(lemmatized_tokens,1)
print('Number of (“magnificent”,“capital”) with window size 1 =', tester1.coll_counter[('magnificent','capital')])

# Pair counts for window size 3.
tester3 = test_stats_calculator(lemmatized_tokens,3)
print('Number of (“bright”,"fire") with window size 3 =', tester3.coll_counter[('bright','fire')])

# Candidate lists after the POS / stop-word / frequency (>= 10) filters.
filtered_bigram_list1 = filter_bigrams(lemmatized_tokens,pos_tags,1,10)
filtered_bigram_list3 = filter_bigrams(lemmatized_tokens,pos_tags,3,10)



# NOTE(review): filter_bigrams only keeps words for which str.isalpha() is
# true, so a pair containing 'mr.' (with the period) can never be in the list.
print('Is Mr skimpole exist: ', ('mr.', 'skimpole') in filtered_bigram_list1)
print('Is spontaneous combustion exist: ', ('spontaneous','combustion') in filtered_bigram_list3)

print('Number of occurance of spontaneous combustion:',tester3.coll_counter[('spontaneous','combustion')])

"""part 2"""

# Top-20 tables for each test with window size 1, then with window size 3.
# Each report_topk call rebuilds the candidate list and the pair counts.
top_k_t1 = report_topk(20,lemmatized_tokens,pos_tags,'t_test',window_size = 1)
print('t-test with window size 1')
print(top_k_t1)

top_k_chi1 = report_topk(20,lemmatized_tokens,pos_tags,'chi_test',window_size = 1)
print('chi-test with window size 1')
print(top_k_chi1)

top_k_log1 = report_topk(20,lemmatized_tokens,pos_tags,'log_test',window_size = 1)
print('likelihood ratio test with window size 1')
print(top_k_log1)

top_k_t3 = report_topk(20,lemmatized_tokens,pos_tags,'t_test',window_size = 3)
print('t-test with window size 3')
print(top_k_t3)

top_k_chi3 = report_topk(20,lemmatized_tokens,pos_tags,'chi_test',window_size = 3)
print('chi-test with window size 3')
print(top_k_chi3)

top_k_log3 = report_topk(20,lemmatized_tokens,pos_tags,'log_test',window_size = 3)
print('likelihood ratio test with window size 3')
print(top_k_log3)

"""Part 3"""

# NOTE(review): built with the same arguments as the tester1 of part 1b.
tester1 = test_stats_calculator(lemmatized_tokens,1)

# All three tests for ('head', 'clerk') at significance level alpha.
print('for Head Clerk')
part3coll = ('head', 'clerk')
alpha = 0.005
coll_bool_t, t_score = tester1.t_dist_test(part3coll,alpha)
coll_bool_chi, chi_score = tester1.chi_square_test(part3coll,alpha)
coll_bool_log, log_score = tester1.loglikelihood_test(part3coll,alpha)

print("T-test: ", t_score, 'Collocation?', coll_bool_t)
print("Chi-square Test: ", chi_score, 'Collocation?', coll_bool_chi)
print("Likelihood Ratio Test: ", log_score, 'Collocation?', coll_bool_log)

# Same three tests for ('great', 'man').
print('for great man')
part3coll2 = ('great', 'man')
alpha = 0.005
coll_bool_t, t_score = tester1.t_dist_test(part3coll2,alpha)
coll_bool_chi, chi_score = tester1.chi_square_test(part3coll2,alpha)
coll_bool_log, log_score = tester1.loglikelihood_test(part3coll2,alpha)

print("T-test: ", t_score, 'Collocation?', coll_bool_t)
print("Chi-square Test: ", chi_score, 'Collocation?', coll_bool_chi)
print("Likelihood Ratio Test: ", log_score, 'Collocation?', coll_bool_log)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


number of tokens (1425758,)
Example postags [('part', 'NOUN'), ('i', 'VERB'), ('chapter', 'NOUN'), ('i', 'NOUN'), ('on', 'ADP'), ('an', 'DET'), ('exceptionally', 'ADV'), ('hot', 'ADJ'), ('evening', 'VERB'), ('early', 'ADJ')]
example lemmatized vs normal: ['obliged', 'to', 'pass', 'her', 'kitchen', ',', 'the', 'door', 'of'] ['obliged', 'to', 'pass', 'her', 'kitchen', ',', 'the', 'door', 'of']
Total number of tokens:  1425758
Number of "that":  19429
Number of "the":  48392
Number of "abject":  21
Number of "london":  2
Number of ".":  51738
Number of (“magnificent”,“capital”) with window size 1 = 1
Number of (“bright”,"fire") with window size 3 = 1
Is Mr skimpole exist:  False
Is spontaneous combustion exist:  False
Number of occurance of spontaneous combustion: 1
t-test with window size 1
                     Bigram   t-score c(w1w2) c(w1) c(w2)
0       stepan trofimovitch 22.619077     512   525   513
1        pyotr stepanovitch 22.547839     509   834   509
2          varvara petrovn