# I- Première partie

In [1]:
import math
from collections import defaultdict, Counter
import numpy as np

class NgramLanguageModel:
    def __init__(self):
        self.trigram_counts = defaultdict(int)
        self.bigram_counts = defaultdict(int)
        self.unigram_counts = defaultdict(int)
        self.vocab = set()
        self.k = 0.01  # Smoothing parameter

    def prepare_data(self, data, ngram_size=2, is_file=True):
        if is_file:
            with open(data, 'r') as f:
                text = f.read().lower()
        else:
            text = data.lower()

        sentences = text.split('\n')
        preprocessed_sentences = []

        for sentence in sentences:
            tokens = sentence.split()
            if ngram_size == 2:
                tokens = ['<s>'] + tokens + ['</s>']
            elif ngram_size == 3:
                tokens = ['<s>', '<s>'] + tokens + ['</s>']
            preprocessed_sentences.append(' '.join(tokens))

        preprocessed_corpus = ' '.join(preprocessed_sentences)

        # Handle out-of-vocabulary words
        words = preprocessed_corpus.split()
        word_counts = Counter(words)

        self.vocab = {word for word in word_counts if word_counts[word] >= 1}
        self.vocab.add('<UNK>')

        def replace_oov(word):
            return word if word in self.vocab else '<UNK>'

        preprocessed_corpus = ' '.join(replace_oov(word) for word in words)

        return preprocessed_corpus

    def train(self, ngram_size=2, infile='ngramv1.train'):
        preprocessed_corpus = self.prepare_data(infile, ngram_size, is_file=True)
        tokens = preprocessed_corpus.split()
        
        for i in range(len(tokens)):
                    self.unigram_counts[(tokens[i])] +=1

        if ngram_size == 2:
            for i in range(len(tokens) - 1):
                self.bigram_counts[(tokens[i], tokens[i+1])] += 1
        elif ngram_size == 3:
            for i in range(len(tokens) - 2):
                self.trigram_counts[(tokens[i], tokens[i+1], tokens[i+2])] += 1
        
          
    def predict_ngram(self, sentence, ngram_size=2):
        preprocessed_sentence = self.prepare_data(sentence, ngram_size, is_file=False)
        tokens = preprocessed_sentence.split()
        log_prob = 0.0

        if ngram_size == 2:
            for i in range(len(tokens) - 1):
                log_prob += self.calculate_log_prob_bigram(tokens[i], tokens[i+1])
        elif ngram_size == 3:
            for i in range(len(tokens) - 2):
                log_prob += self.calculate_log_prob_trigram(tokens[i], tokens[i+1], tokens[i+2])

        return log_prob

    def calculate_log_prob_bigram(self, word1, word2):
        count_bigram = self.bigram_counts[(word1, word2)]
        count_unigram = sum(self.bigram_counts[(word1, w)] for w in self.vocab)
        vocab_size = len(self.vocab)
        prob = (count_bigram + self.k) / (count_unigram + self.k * vocab_size)
        return math.log(prob)

    def calculate_log_prob_trigram(self, word1, word2, word3):
        count_trigram = self.trigram_counts[(word1, word2, word3)]
        count_bigram = sum(self.trigram_counts[(word1, word2, w)] for w in self.vocab)
        vocab_size = len(self.vocab)
        prob = (count_trigram + self.k) / (count_bigram + self.k * vocab_size)
        return math.log(prob)

    def test_perplexity(self, test_file, ngram_size=2):
        total_log_prob = 0.0
        total_tokens = 0

        with open(test_file, 'r') as f:
            for line in f:
                sentence = line.strip().lower()
                total_log_prob += self.predict_ngram(sentence, ngram_size)
                total_tokens += len(sentence.split()) + 1  # Adding 1 for the end token

        avg_log_prob = total_log_prob / total_tokens
        perplexity = math.exp(-avg_log_prob)
        return perplexity



####  prepare_data:
This method preprocesses the input corpus by tokenizing, normalizing, and handling out-of-vocabulary words.

In [3]:
def test_prepare_data():
    model = NgramLanguageModel()
    infile = 'ngramv1.train'
    print(' Bigram : \n')
    preprocessed_sentences = model.prepare_data(infile, ngram_size=2)
    preprocessed_sentences
        
    print('\n Trigram : \n')
    preprocessed_sentences = model.prepare_data(infile, ngram_size=3)
    preprocessed_sentences


In [32]:
test_prepare_data()

 Bigram : 

<s> i am sam . </s>
<s> i am sam . </s>
<s> sam i am . </s>
<s> that sam i am ! </s>
<s> that sam i am ! </s>
<s> i do not like that sam i am ! </s>
<s> do would you like green eggs and ham ? </s>
<s> i do not like them , sam i am . </s>
<s> i do not like green eggs and ham . </s>
<s> would you like them here or there ? </s>
<s> i would not like them here or there . </s>
<s> i would not like them anywhere . </s>
<s> i do not like green eggs and ham . </s>
<s> i do not like them , sam i am . </s>
<s> would you like them in a house ? </s>
<s> would you like <UNK> with a mouse ? </s>
<s> i do not like them in a house . </s>
<s> i do not like them with a mouse . </s>
<s> i do not like them here or there . </s>
<s> i do not like them anywhere . </s>
<s> i do not like green eggs and ham . </s>
<s> i do not like them , sam i am . </s>
<s> would you eat them in a box ? </s>
<s> would you eat them with a fox ? </s>
<s> not in a box . </s>
<s> not with a fox . </s>
<s> not in a house

#### train:
This method trains the n-gram model by counting n-grams in the preprocessed corpus.

In [6]:
def test_train():
    model = NgramLanguageModel()
    infile = 'ngramv1.train'
    model.train(ngram_size=2, infile=infile)
    print("Unigram Counts:", dict(model.unigram_counts))
    print('\n')
    print("Bigram Counts:", dict(model.bigram_counts))
    print('\n')
    model.train(ngram_size=3, infile=infile)
    print("Trigram Counts:", dict(model.trigram_counts))

test_train()


Unigram Counts: {'<s>': 111, 'i': 68, 'am': 14, 'sam': 17, '.': 72, '</s>': 111, 'that': 3, '!': 29, 'do': 35, 'not': 82, 'like': 41, 'would': 25, 'you': 31, 'green': 8, 'eggs': 9, 'and': 11, 'ham': 9, '?': 16, 'them': 48, ',': 34, 'here': 10, 'or': 8, 'there': 8, 'anywhere': 7, 'in': 33, 'a': 47, 'house': 7, 'then': 1, 'with': 15, 'mouse': 6, 'eat': 14, 'box': 6, 'fox': 6, 'could': 14, 'car': 6, 'they': 1, 'are': 1, 'may': 4, 'will': 11, 'see': 3, 'tree': 5, 'let': 4, 'me': 4, 'be': 4, 'train': 7, 'on': 5, 'greem': 1, 'say': 3, 'the': 9, 'dark': 6, 'rain': 3, 'goat': 3, 'boat': 2, 'so': 1, 'try': 4, 'if': 1}


Bigram Counts: {('<s>', 'i'): 54, ('i', 'am'): 14, ('am', 'sam'): 2, ('sam', '.'): 2, ('.', '</s>'): 66, ('</s>', '<s>'): 110, ('<s>', 'sam'): 3, ('sam', 'i'): 12, ('am', '.'): 9, ('<s>', 'that'): 2, ('that', 'sam'): 3, ('am', '!'): 3, ('!', '</s>'): 29, ('i', 'do'): 32, ('do', 'not'): 34, ('not', 'like'): 35, ('like', 'that'): 1, ('<s>', 'do'): 1, ('do', 'would'): 1, ('would', 

#### predict_ngram: 
This method calculates the log probability of a given sentence using the trained n-gram model.

In [7]:
def test_predict_ngram():
    model = NgramLanguageModel()
    infile = 'ngramv1.train'
    
    # Train the model
    model.train(ngram_size=2, infile=infile)
    sentence = "I AM SAM ."
    log_prob_bigram = model.predict_ngram(sentence, ngram_size=2)
    print(f"Log Probability (Bigram) for '{sentence}': {log_prob_bigram}")
    
    model.train(ngram_size=3, infile=infile)
    log_prob_trigram = model.predict_ngram(sentence, ngram_size=3)
    print(f"Log Probability (Trigram) for '{sentence}': {log_prob_trigram}")
test_predict_ngram()

Log Probability (Bigram) for 'I AM SAM .': -3.7296761896025306
Log Probability (Trigram) for 'I AM SAM .': -1.8668256450475997


#### test_perplexity: 
This method calculates the perplexity of the model on a test corpus.

a lower perplexity value suggests that the language model is better at predicting the test corpus.

In [8]:
def test_test_perplexity():
    model = NgramLanguageModel()
    infile_train = 'ngramv1.train'
    infile_test = 'ngramv1.test'
    
    # Train the model
    model.train(ngram_size=2, infile=infile_train)
    perplexity_bigram = model.test_perplexity(infile_test, ngram_size=2)
    print(f"Perplexity (Bigram) on test file: {perplexity_bigram}")
    
    model.train(ngram_size=3, infile=infile_train)
    perplexity_trigram = model.test_perplexity(infile_test, ngram_size=3)
    print(f"Perplexity (Trigram) on test file: {perplexity_trigram}")

test_test_perplexity()

Perplexity (Bigram) on test file: 2.9618282961926368
Perplexity (Trigram) on test file: 4.377939131290907
