In [1]:
# NgramCount / NgramModel: n-gram counting and interpolated n-gram language-model evaluation
import numpy as np
from nltk.util import ngrams
#from unigramI import gen_tok
from typing import Tuple, List, Callable

In [2]:
def gen_tok(file:str, encoding:str='utf-8'):
    """Yield the comma-separated tokens of each line of a text file.

    Parameters
    ----------
    file : str
        Path of the text file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to UTF-8 so that the
        tokens do not depend on the platform's locale encoding.

    Yields
    ------
    list of str
        The fields of one line, obtained with ``line.split(',')``. An empty
        line therefore yields ``['']``.
    """
    with open(file, mode='r', encoding=encoding) as handle:
        # splitlines() drops the line terminators, so the last token of a
        # line never carries a trailing newline.
        for sentence in handle.read().splitlines():
            yield sentence.split(',')

In [3]:
class NgramCount:
    """Count the n-grams of orders 1..max_ngram_length found in a corpus file.

    The file is tokenised with ``gen_tok``; each item it yields is treated as
    one sentence. After construction (or any later call to ``count``) these
    attributes are available:

    nb_sent : int
        Number of sentences read.
    nb_word : int
        Total number of tokens read.
    counts : dict
        Maps each n-gram to its number of occurrences.
    start_counts : dict
        Maps each n-gram to the number of sentences it begins.
    """

    def __init__(self,file_name:str, max_ngram_length:int=5)->None:
        """Store the corpus location and compute the counts immediately.

        Parameters
        ----------
        file_name : str
            Path of the corpus file passed to ``gen_tok``.
        max_ngram_length : int, optional
            Highest n-gram order to count (default 5).
        """
        self.file_name = file_name
        self.max_ngram_length = max_ngram_length
        self.count()

    def count(self):
        """(Re)compute ``nb_sent``, ``nb_word``, ``counts`` and ``start_counts``."""
        # A generator can be consumed only once: build a fresh token stream on
        # every call so that calling count() again recomputes the statistics
        # instead of silently resetting them to zero.
        self.texte = gen_tok(self.file_name)
        self.nb_sent=0
        self.nb_word=0
        self.counts = {}
        self.start_counts = {}

        for sent in self.texte:
            self.nb_sent +=1
            self.nb_word +=len(sent)
            for ngram_length in range(1, self.max_ngram_length + 1):
                for ngram_position, ngram in enumerate(ngrams(sent,ngram_length)):
                    # The first n-gram of each order is the one that opens the sentence.
                    if ngram_position == 0:
                        self.start_counts[ngram] = self.start_counts.get(ngram,0)+1
                    self.counts[ngram] = self.counts.get(ngram,0) + 1

In [4]:
class NgramModel:
    """Interpolated n-gram language model estimated from n-gram counts.

    Probabilities are relative frequencies computed by ``train``:

    * unigram: ``counts[w] / nb_word``; its sentence-start variant is
      ``start_counts[w] / nb_sent``;
    * longer n-gram: ``counts[ngram] / counts[ngram[:-1]]``; its
      sentence-start variant uses ``start_counts`` for both terms.

    ``train`` must be called before ``evaluate``, since it creates the
    ``probs`` and ``start_probs`` dictionaries that ``evaluate`` reads.
    """

    def __init__(self,train_counter:"NgramCount", max_ngram_length:int=5):
        """Keep references to the training counts and derive the uniform model.

        Parameters
        ----------
        train_counter : NgramCount
            Object exposing ``counts``, ``start_counts``, ``nb_sent`` and
            ``nb_word``.
        max_ngram_length : int, optional
            Highest n-gram order scored by ``evaluate`` (default 5). It should
            not exceed the highest order present in ``train_counter.counts``,
            otherwise the extra columns are all zeros.
        """
        self.counter = train_counter
        self.counts = train_counter.counts
        self.start_counts = train_counter.start_counts
        self.max_ngram_length = max_ngram_length
        # Vocabulary size = number of distinct unigrams seen in training.
        self.vocab_size = sum(1 for ngram in self.counts if len(ngram)==1)
        # The extra +1 in the denominator keeps the uniform probability defined
        # even when the vocabulary is empty.
        self.uniform_prob = 1/(self.vocab_size + 1)

    def calculate_unigram_prob(self,unigram:Tuple[str]):
        """Store the probability (and start probability, if any) of a unigram."""
        if unigram in self.start_counts:
            self.start_probs[unigram] = self.start_counts[unigram] / self.counter.nb_sent
        self.probs[unigram] = self.counts[unigram] / self.counter.nb_word

    def calculate_multigram_prob(self,ngram:Tuple[str, ...]):
        """Store the conditional probability of the last token of ``ngram``
        given the tokens before it (and the sentence-start variant, if any)."""
        prevgram = ngram[:-1]
        if ngram in self.start_counts:
            self.start_probs[ngram] = self.start_counts[ngram] / self.start_counts[prevgram]
        self.probs[ngram] = self.counts[ngram] / self.counts[prevgram]

    def train(self):
        """Compute ``probs`` and ``start_probs`` for every counted n-gram."""
        self.start_probs = {}
        self.probs = {}
        for ngram in self.counts:
            if len(ngram) == 1:
                self.calculate_unigram_prob(ngram)
            else:
                self.calculate_multigram_prob(ngram)

    def evaluate(self,eval_file:str) -> np.ndarray:
        """Score every token of ``eval_file`` under each n-gram order.

        Parameters
        ----------
        eval_file : str
            Path of the evaluation file, tokenised with ``gen_tok``.

        Returns
        -------
        np.ndarray
            Array of shape ``(number of tokens, max_ngram_length + 1)``.
            Column 0 holds the uniform probability; column ``k`` holds the
            probability of the ``k``-gram ending at the token. When fewer than
            ``k - 1`` tokens precede it in the sentence, the sentence prefix
            ending at the token is looked up in ``start_probs`` instead.
            N-grams absent from the trained tables get probability 0.
        """
        # Tokenise once and reuse the result for both sizing and scoring.
        sentences = list(gen_tok(eval_file))
        eval_token_count = sum(len(sentence) for sentence in sentences)
        n_columns = self.max_ngram_length + 1
        prob_matrix = np.zeros(shape = (eval_token_count, n_columns))
        prob_matrix[:,0] = self.uniform_prob
        row = 0
        for sentence in sentences:
            for token_position in range(len(sentence)):
                ngram_end = token_position + 1
                for ngram_length in range(1, n_columns):
                    ngram_start = ngram_end - ngram_length
                    if ngram_start < 0:
                        # Not enough history: back off to the sentence prefix.
                        ngram = tuple(sentence[0:ngram_end])
                        prob_matrix[row,ngram_length] = self.start_probs.get(ngram,0)
                    else:
                        ngram = tuple(sentence[ngram_start:ngram_end])
                        prob_matrix[row, ngram_length] = self.probs.get(ngram,0)
                row +=1
        return prob_matrix

    def loglik(self, prob_matrix:np.ndarray,weights: List[float] = None, log_function : Callable = np.log2)->float:
        """Average log-likelihood of the interpolated model.

        Parameters
        ----------
        prob_matrix : np.ndarray
            One row per token, one column per model, as returned by ``evaluate``.
        weights : sequence of float or np.ndarray, optional
            Interpolation weight of each column. Defaults to equal weights
            ``1 / n_columns``.
        log_function : callable, optional
            Logarithm applied to the interpolated probabilities (default
            ``np.log2``).

        Returns
        -------
        float
            Mean over the rows of ``log_function(sum_k weights[k] * prob[row, k])``.
        """
        n_models = prob_matrix.shape[1]
        # Identity test on purpose: `weights == None` is evaluated element-wise
        # for a NumPy array and makes the `if` raise ValueError.
        if weights is None:
            weights = np.ones(n_models) / n_models
        interpolated_probs = np.sum(prob_matrix*weights, axis = 1)
        avg_log_likelihood = log_function(interpolated_probs).mean()
        return  avg_log_likelihood

In [6]:

if __name__ == '__main__':
    # Fit the n-gram model on the training corpus.
    compter = NgramCount('dataset/train.txt')
    model = NgramModel(compter)
    model.train()

    # Score each development set with equal interpolation weights, then report both.
    dev1_loglik = model.loglik(model.evaluate('dataset/dev1.txt'))
    dev2_loglik = model.loglik(model.evaluate('dataset/dev2.txt'))
    report = 'average log_lik for dev1 : {0:.2f} and dev2 : {1:.2f}'
    print(report.format(dev1_loglik, dev2_loglik))

average log_lik for dev1 : -8.70 and dev2 : -10.05


In [1]:
import spacy

2023-10-08 09:57:58.777538: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-08 09:57:59.173343: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-08 09:58:00.156678: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-10-08 09:58:00.156777: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [6]:
# Load the spaCy pipeline named "en_core_web_sm" and keep it as `nlp`.
nlp = spacy.load("en_core_web_sm")

In [16]:
# Run the loaded pipeline on a short sample sentence; `doc` is iterated token by token below.
doc = nlp("I am Epiphane")

In [25]:
# Print the `dep_` attribute (spaCy's dependency-relation label) of every token in `doc`.
for token in doc:
    print(token.dep_)

nsubj
ROOT
attr
