In [2]:
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np


sys.path.append('../utils/')
sys.path.append('..')
from preprocessing import load_preprocessed_dataframe

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Samy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Samy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Samy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Samy\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Chargement des données prétraitées
Ici on charge les ensembles de données d'entraînement, de validation et de test en utilisant une fonction personnalisée définie dans le module de prétraitement.


In [3]:
output_dir = 'data'
df_train, df_val, df_test = load_preprocessed_dataframe(output_dir=output_dir)

5 sec TO LOAD


# Affichage des exemples de commentaires
Ici on affiche quelques exemples de commentaires du jeu de données d'entraînement pour avoir un aperçu des données textuelles sur lesquelles nous travaillons.


In [5]:
print(df_train["comment_text_baseline"])

140030    Grandma Terri Should Burn in Trash \nGrandma T...
159124    , 9 May 2009 (UTC)\nIt would be easiest if you...
60006     "\n\nThe Objectivity of this Discussion is dou...
65432                 Shelly Shock\nShelly Shock is. . .( )
154979    I do not care. Refer to Ong Teng Cheong talk p...
                                ...                        
119879    REDIRECT Talk:John Loveday (experimental physi...
103694    Back it up. Post the line here with the refere...
131932    I won't stop that. Sometimes Germanic equals G...
146867    "\n\n British Bands?  \n\nI think you've mista...
121958    You are WRONG. \n\nJustin Thompson is mentione...
Name: comment_text_baseline, Length: 127656, dtype: object


# Concatenation des textes
Ici on concatène les commentaires des ensembles de données d'entraînement, de validation et de test en une seule liste de phrases, puis affiche la longueur totale de cette liste.


In [6]:
# Assuming df_train, df_val, df_test contain a column 'text' with the sentences.
all_texts = pd.concat([df_train['comment_text_baseline'], df_val['comment_text_baseline'], df_test['comment_text_baseline']])
sentences = all_texts.tolist()
len(sentences)

223549

# Création du modèle de bigrammes
Ici on utilise `CountVectorizer` pour créer un modèle de bigrammes (séquences de deux mots) à partir des phrases. Le modèle est ensuite ajusté sur les phrases.


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b', ngram_range=(2, 2))
X = vectorizer.fit_transform(sentences)


# Calcul et affichage de la fréquence des bigrammes
Ici on calcule la fréquence de chaque bigramme dans le corpus et définit une fonction pour prédire le mot suivant basé sur un mot précédent en utilisant les bigrammes les plus fréquents.


In [9]:

# Count the frequency of each bigram
bigram_frequency = np.asarray(X.sum(axis=0)).ravel()

# Map each bigram to its frequency
bigram_to_freq = dict(zip(vectorizer.get_feature_names_out(), bigram_frequency))

# Sort the bigram_to_freq dictionary by frequency in descending order and print the top 10 bigrams
#for bigram, freq in sorted(bigram_to_freq.items(), key=lambda item: item[1], reverse=True)[:10]:
#    print(f"Bigram: '{bigram}', Frequency: {freq}")

# Function to predict the next word
def predict_next_word(previous_word):
    candidates = {bigram: freq for bigram, freq in bigram_to_freq.items() if bigram.startswith(previous_word + ' ')}
    
    sorted_candidates = {k: v for k, v in sorted(candidates.items(), key=lambda item: item[1], reverse=True)}

    
    #for bigram, freq in sorted_candidates.items():
        #print(f"Bigram: '{bigram}', Frequency: {freq}")

    if not candidates:
        return "No prediction available"
    return max(candidates, key=candidates.get).split()[1]

# Example
print(predict_next_word("you"))


are


# Création du modèle de trigrammes
Ici on utilise `CountVectorizer` pour créer un modèle de trigrammes (séquences de trois mots) à partir des phrases. Le modèle est ensuite ajusté sur les phrases.


In [10]:
vectorizer_3gram = CountVectorizer(token_pattern=r'\b\w+\b', ngram_range=(3, 3))
X_3gram = vectorizer_3gram.fit_transform(sentences)

# Calcul et affichage de la fréquence des trigrammes
Ici on calcule la fréquence de chaque trigramme dans le corpus et définit une fonction pour prédire le mot suivant basé sur les deux mots précédents en utilisant les trigrammes les plus fréquents.


In [11]:
# Count the frequency of each bigram
trigram_frequency = np.asarray(X_3gram.sum(axis=0)).ravel()

# Map each bigram to its frequency
trigram_to_freq = dict(zip(vectorizer_3gram.get_feature_names_out(), trigram_frequency))

# Sort the bigram_to_frq dictionary by frequency in descending order and print the top 10 bigrams
#for trigram, freq in sorted(trigram_to_freq.items(), key=lambda item: item[1], reverse=True)[:10]:
    #print(f"Bigram: '{trigram}', Frequency: {freq}")

# Function to predict the next word based on the two previous words
def predict_next_word_trigram(previous_words):
    previous_words = previous_words.lower()
    candidates = {trigram: freq for trigram, freq in trigram_to_freq.items() if trigram.startswith(previous_words + ' ')}

    sorted_candidates = {k: v for k, v in sorted(candidates.items(), key=lambda item: item[1], reverse=True)}

    # Now, print the sorted candidates
    i = 0
    for trigram, freq in sorted_candidates.items():
        print(f"Trgram: '{trigram}', Frequency: {freq}")
        i += 1
        if (i == 10):
            break

    if not candidates:
        return "No prediction available"
    # Extracting the last word of the most frequent trigram following the previous_words
    return max(candidates, key=candidates.get).split()[2]

print(predict_next_word_trigram("upon a"))

Trgram: 'upon a time', Frequency: 26
Trgram: 'upon a non', Frequency: 3
Trgram: 'upon a person', Frequency: 3
Trgram: 'upon a fascinating', Frequency: 2
Trgram: 'upon a full', Frequency: 2
Trgram: 'upon a quick', Frequency: 2
Trgram: 'upon a scarlet', Frequency: 2
Trgram: 'upon a 19', Frequency: 1
Trgram: 'upon a bit', Frequency: 1
Trgram: 'upon a consent', Frequency: 1
time


# Complétion de phrase avec les bigrammes
Ici on définit une fonction pour compléter une phrase en utilisant les bigrammes les plus fréquents. Elle prédit les mots suivants jusqu'à atteindre la longueur maximale spécifiée.


In [12]:
def complete_the_phrase(starting_phrase, max_length=10):
    current_phrase = starting_phrase.strip()
    words = current_phrase.split()
    
    # Continue until reaching the maximum length
    for _ in range(max_length - len(words)):
        last_word = words[-1]
        # Find candidates that start with the last word of the current phrase
        candidates = {bigram: freq for bigram, freq in bigram_to_freq.items() if bigram.startswith(last_word + ' ')}
        if not candidates:
            break  # No candidates found, stop the loop
        # Pick the most frequent continuation (the second word in the bigram)
        next_word = sorted(candidates.items(), key=lambda item: item[1], reverse=True)[0][0].split()[1]
        words.append(next_word)
    
    return ' '.join(words)

# Example usage:
print(complete_the_phrase("Do you like playing"))

Do you like playing the article is a few days


# Complétion de phrase avec les trigrammes
Ici on définit une fonction pour compléter une phrase en utilisant les trigrammes les plus fréquents. Elle prédit les mots suivants jusqu'à atteindre la longueur maximale spécifiée.


In [13]:
def complete_the_phrase_trigram(starting_phrase, max_length=15):
    current_phrase = starting_phrase.strip()
    words = current_phrase.split()
    
    # Ensure there are enough words for trigram prediction
    if len(words) < 2:
        return "Please provide at least two words for the initial phrase."
    
    # Continue until reaching the maximum length
    for _ in range(max_length - len(words)):
        # Use the last two words for the trigram prediction
        last_two_words = ' '.join(words[-2:])
        # Find candidates that start with the last two words of the current phrase
        candidates = {trigram: freq for trigram, freq in trigram_to_freq.items() if trigram.startswith(last_two_words + ' ')}
        if not candidates:
            break  # No candidates found, stop the loop
        # Pick the most frequent continuation (the third word in the trigram)
        next_word = sorted(candidates.items(), key=lambda item: item[1], reverse=True)[0][0].split()[2]
        words.append(next_word)
    
    return ' '.join(words)

# Example usage:
print(complete_the_phrase_trigram("once upon"))


once upon a time when i was just a few days ago i m not


# Classe NGramPredictor
Ici on définit une classe `NGramPredictor` pour créer un modèle de n-grammes (séquences de n mots) avec des méthodes pour ajuster le modèle sur des phrases, prédire le mot suivant et compléter une phrase.


In [14]:
class NGramPredictor:
    def __init__(self, n=2):
        self.n = n
        self.vectorizer = CountVectorizer(token_pattern=r'\b\w+\b', ngram_range=(n, n))
        self.ngram_to_freq = {}

    def fit(self, sentences):
        X_ngram = self.vectorizer.fit_transform(sentences)
        ngram_frequency = np.asarray(X_ngram.sum(axis=0)).ravel()
        self.ngram_to_freq = dict(zip(self.vectorizer.get_feature_names_out(), ngram_frequency))

    def predict_next_word(self, previous_words):
        previous_words = ' '.join(previous_words.split()[-(self.n-1):]).lower()
        candidates = {ngram: freq for ngram, freq in self.ngram_to_freq.items() if ngram.startswith(previous_words + ' ')}
        
        if not candidates:
            return None  # Changed to return None for easier checking
        
        return max(candidates, key=candidates.get).split()[-1]

    def complete_the_phrase(self, starting_phrase, max_length=10):
        current_phrase = starting_phrase.strip()
        words = current_phrase.split()
        
        # Adjust for n-gram model
        for _ in range(max_length - len(words)):
            if len(words) < self.n - 1:
                return "Please provide more words for the initial phrase."
            
            # For n-grams, use the last n-1 words as context
            previous_words = ' '.join(words[-(self.n-1):])
            next_word = self.predict_next_word(previous_words)
            
            if not next_word:
                break  # No candidates found, stop the loop
            
            words.append(next_word)
        
        return ' '.join(words)
    
    def print_top_ngrams(self, top=10):
        for ngram, freq in sorted(self.ngram_to_freq.items(), key=lambda item: item[1], reverse=True)[:top]:
            print(f"N-Gram: '{ngram}', Frequency: {freq}")


# Exemple d'utilisation du NGramPredictor
Ici on crée une instance de la classe `NGramPredictor` pour les trigrammes et ajuste le modèle sur les phrases.


In [15]:
# Example usage:
ngram_predictor = NGramPredictor(n=4)  # For trigrams
ngram_predictor.fit(sentences)  # Assuming 'sentences' is a list of sentence strings

# Affichage des n-grammes les plus fréquents et complétion de phrase
Ici on affiche les n-grammes les plus fréquents et utilise la méthode `complete_the_phrase` de la classe `NGramPredictor` pour compléter une phrase donnée.


In [16]:
ngram_predictor.print_top_ngrams()
print(ngram_predictor.complete_the_phrase("once upon a", max_length=10))

N-Gram: 'nigger nigger nigger nigger', Frequency: 3411
N-Gram: 'on my talk page', Frequency: 3067
N-Gram: 'be blocked from editing', Frequency: 2761
N-Gram: 'fuck you fuck you', Frequency: 2681
N-Gram: 'you fuck you fuck', Frequency: 2623
N-Gram: 'i don t know', Frequency: 2621
N-Gram: 'you will be blocked', Frequency: 2549
N-Gram: 'if you continue to', Frequency: 2486
N-Gram: 'if you have any', Frequency: 2483
N-Gram: 'i don t think', Frequency: 2464
once upon a time in a steppe far far away


# Évaluation du modèle
Ici on définit une fonction pour évaluer la précision du modèle sur les phrases de test. Elle compare les prédictions du modèle avec les mots réels et calcule la précision.


In [17]:
def evaluate_model(model, test_sentences):
    print(len(test_sentences))
    correct_predictions = 0
    total_predictions = 0
    
    counter = 0
    for sentence in test_sentences:
        counter += 1
        words = sentence.split()
        # Ensure the sentence has more words than n-1 to make a prediction and have a target
        if len(words) >= model.n:
            for i in range(model.n - 1, len(words)):
                context = ' '.join(words[max(0, i - model.n + 1):i])
                actual_next_word = words[i]
                predicted_next_word = model.predict_next_word(context)
                
                if predicted_next_word is not None and predicted_next_word == actual_next_word:
                    correct_predictions += 1
                total_predictions += 1

                if (predicted_next_word is None):
                    print ("None ")
                else:
                    print("Context: " + context + " - Expected: " + words[i] + " Predicted: " + predicted_next_word + " " + str(counter))
        if (counter >= 100):
            break
    
    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    return accuracy

# Assuming you have an instance of NGramPredictor called 'ngram_predictor' and it's already fit
# And assuming 'sentences' is your list of test sentences:
test_sentences = df_test['comment_text_baseline'].to_list()
#print(sentences)
test_sentences = test_sentences[:100]
# Example usage:
tested_predictor = NGramPredictor(n=4)  # For trigrams
tested_predictor.fit(sentences)  # Assuming 'sentences' is a list of sentence strings
accuracy = evaluate_model(tested_predictor, test_sentences)
print(f"Accuracy: {accuracy:.4f}")

100
Context: Thank you for - Expected: understanding. Predicted: your 1
None 
None 
None 
Context: I think very - Expected: highly Predicted: early 1
Context: think very highly - Expected: of Predicted: of 1
Context: very highly of - Expected: you Predicted: the 1
Context: highly of you - Expected: and Predicted: and 1
Context: of you and - Expected: would Predicted: your 1
Context: you and would - Expected: not Predicted: not 1
Context: and would not - Expected: revert Predicted: be 1
Context: would not revert - Expected: without Predicted: a 1
Context: not revert without - Expected: discussion. Predicted: confirmation 1
None 
Context: god this site - Expected: is Predicted: is 2
Context: this site is - Expected: horrible. Predicted: a 2
None 
Context: Somebody will invariably - Expected: try Predicted: try 3
Context: will invariably try - Expected: to Predicted: to 3
Context: invariably try to - Expected: add Predicted: add 3
Context: try to add - Expected: Religion? Predicted: some 