# Importation

In [43]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score
import torch
import os

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score
import time

In [44]:
# Folder (relative to the working directory) from which the input CSV files are read.
Generation_folder = "generated"

# Fonctions d'embedding

In [45]:
def retrieve_simple(data, model=None, size_vocabulary=1000):
    """Embed texts as bag-of-words counts over unigrams and bigrams.

    Parameters
    ----------
    data : iterable of str
        Documents to vectorize.
    model : CountVectorizer, optional
        An already fitted vectorizer. When given it is only used to transform
        ``data``; when ``None`` a new vectorizer is created and fitted on
        ``data``.
    size_vocabulary : int, default 1000
        ``max_features`` of a newly created vectorizer. Ignored when ``model``
        is supplied.

    Returns
    -------
    tuple
        ``(X, model)`` where ``X`` is the dense result of ``toarray()`` and
        ``model`` is the vectorizer that produced it.
    """
    # Compare against None explicitly: relying on truthiness would silently
    # refit a new vectorizer for any model object that evaluates as falsy.
    if model is not None:
        X = model.transform(data)
    else:
        model = CountVectorizer(
            stop_words="english",
            max_features=size_vocabulary,
            ngram_range=(1, 2),
        )
        X = model.fit_transform(data)
    return X.toarray(), model

In [46]:
def retrieve_tf_idf(data, model=None, size_vocabulary=1000):
    """Embed texts as TF-IDF weights over unigrams and bigrams.

    Parameters
    ----------
    data : iterable of str
        Documents to vectorize.
    model : TfidfVectorizer, optional
        An already fitted vectorizer. When given it is only used to transform
        ``data``; when ``None`` a new vectorizer is created and fitted on
        ``data``.
    size_vocabulary : int, default 1000
        ``max_features`` of a newly created vectorizer. Ignored when ``model``
        is supplied.

    Returns
    -------
    tuple
        ``(X, model)`` where ``X`` is the dense result of ``toarray()`` and
        ``model`` is the vectorizer that produced it.
    """
    # Compare against None explicitly: relying on truthiness would silently
    # refit a new vectorizer for any model object that evaluates as falsy.
    if model is not None:
        X = model.transform(data)
    else:
        model = TfidfVectorizer(
            stop_words="english",
            max_features=size_vocabulary,
            ngram_range=(1, 2),
        )
        X = model.fit_transform(data)
    return X.toarray(), model

In [47]:
# Tokenizer for the 'bert-base-uncased' checkpoint; the "bert" entry of
# embedded_dict passes it to get_bert_latent_representation.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [48]:
def get_bert_latent_representation(textList, tokenizer, model, batch_size=32):
    """Return the first-token ([CLS] position) hidden state for every text.

    The texts are pushed through the model in chunks of ``batch_size`` rather
    than in a single forward pass, so memory use is bounded by the chunk size
    instead of growing with ``len(textList)``.

    Parameters
    ----------
    textList : list of str
        Texts to embed.
    tokenizer : callable
        Called as ``tokenizer(texts, return_tensors='pt', padding=True,
        truncation=True, max_length=512)``; its result is unpacked into the
        model call.
    model : callable
        Called as ``model(**inputs)``; the result must expose
        ``last_hidden_state``.
    batch_size : int, default 32
        Maximum number of texts per forward pass.

    Returns
    -------
    torch.Tensor
        One row per input text, in input order.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    if len(textList) == 0:
        # Nothing to encode; use the model's configured width when it exposes one.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return torch.empty((0, hidden_size))

    cls_chunks = []
    with torch.no_grad():
        for start in range(0, len(textList), batch_size):
            chunk = textList[start:start + batch_size]
            inputs = tokenizer(chunk, return_tensors='pt', padding=True, truncation=True, max_length=512)
            outputs = model(**inputs)
            # Index 0 along the sequence axis is the first token of each text.
            cls_chunks.append(outputs.last_hidden_state[:, 0, :])

    return torch.cat(cls_chunks, dim=0)


In [49]:
# Registry of embedding functions, keyed by embedding name.
# Every entry is called as ``fn(data, model)`` and must return a
# ``(matrix, model)`` pair, because PreProcess unpacks two values from it.
embedded_dict = {
    "simple": retrieve_simple,
    "tf_idf": retrieve_tf_idf,
    # The BERT helper returns only a tensor, so wrap it to honour the
    # (matrix, model) contract and hand back a NumPy array like the others.
    "bert": lambda data, model: (
        get_bert_latent_representation(data.values.tolist(), tokenizer, model).cpu().numpy(),
        model,
    ),
}

# Fonctions de modèle

In [50]:
# Registry of classifier factories, keyed by model name. Each value is a
# zero-argument callable so that every lookup builds a fresh, unfitted estimator.
model_dict = {
    "RandomForest": (lambda: RandomForestClassifier(n_estimators=100)),
    "GradientBoost": (lambda: GradientBoostingClassifier(n_estimators=100)),
    # Misspelt legacy key, kept so existing callers keep working.
    "GradienBoost": (lambda: GradientBoostingClassifier(n_estimators=100)),
}

In [51]:
def process_model(X, Y, model=None):
    """Fit a classifier on ``X``/``Y`` and return it.

    Parameters
    ----------
    X, Y
        Training features and targets, passed straight to ``model.fit``.
    model : str or estimator
        Either a key of ``model_dict`` (a new estimator is built from the
        registered factory) or an object exposing ``fit(X, Y)``, which is
        fitted as-is.

    Returns
    -------
    The fitted estimator.

    Raises
    ------
    ValueError
        If ``model`` is ``None`` or a name that is not registered in
        ``model_dict``.
    """
    if model is None:
        raise ValueError("model must be a model_dict key or an estimator, not None")

    # Only strings are looked up in the registry; estimator objects are never
    # hashed or compared against the registry keys.
    if isinstance(model, str):
        if model not in model_dict:
            raise ValueError(
                f"Unknown model name {model!r}; expected one of {sorted(model_dict)}"
            )
        model = model_dict[model]()

    model.fit(X, Y)
    return model

# Fonctions utilitaires

In [52]:
def expand_matrix_column(df, column_name):
    """Expand a column of equal-length sequences into one column per position.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to expand.
    column_name : str
        Name of a column whose cells are equal-length sequences.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``{column_name}_0``, ``{column_name}_1``, ...
        It carries the index of ``df`` so it stays row-aligned with the source
        when concatenated along ``axis=1``. An empty ``df`` yields an empty
        frame with no columns.
    """
    rows = df[column_name].tolist()
    if not rows:
        # No rows means the width is unknown, so there are no columns to name.
        return pd.DataFrame(index=df.index)

    # Stack the per-row sequences into a 2-D array.
    matrix = np.array(rows)
    column_names = [f"{column_name}_{i}" for i in range(matrix.shape[1])]
    return pd.DataFrame(matrix, columns=column_names, index=df.index)

In [53]:
def BatchDataframe(df, Batch_size):
    """
    Split a DataFrame into consecutive batches of at most ``Batch_size`` rows.

    The rows are NOT shuffled here: batches follow the current row order of
    ``df``, so shuffle beforehand if a random order is needed.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to split.
    Batch_size : int
        Maximum number of rows per batch; the last batch may be smaller.

    Returns
    -------
    list of pandas.DataFrame
        The batches, in row order. Empty when ``df`` has no rows.

    Raises
    ------
    ValueError
        If ``Batch_size`` is not strictly positive.
    """
    if Batch_size <= 0:
        # A negative step would silently yield no batches at all.
        raise ValueError("Batch_size must be a positive integer")

    return [df.iloc[start:start + Batch_size] for start in range(0, len(df), Batch_size)]

In [54]:
def PreProcess(data, embedding, summary_embedded_model, comment_embedded_model):
    """Embed the 'summary' and 'comment' columns and build the feature matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'summary', 'comment' and 'rating'. It is not
        modified.
    embedding : str
        Key of ``embedded_dict`` selecting the embedding function. That
        function is called as ``fn(series, model)`` and must return a
        ``(matrix, model)`` pair.
    summary_embedded_model, comment_embedded_model
        Embedding models to reuse for each column; what ``None`` means is up
        to the selected embedding function.

    Returns
    -------
    tuple
        ``(X, Y, summary_embedded_model, comment_embedded_model)`` where ``X``
        holds the columns ``summary_<i>`` followed by ``comment_<i>`` and ``Y``
        is ``data['rating']``. ``X`` shares the index of ``data``.

    Notes
    -----
    As before, only the text-derived features are returned; the other columns
    of ``data`` are not part of ``X``.
    NOTE(review): the caller names the result ``X_expanded``, so including the
    remaining columns may have been intended -- confirm.
    """
    embed = embedded_dict[embedding]
    summary_matrix, summary_embedded_model = embed(data['summary'], summary_embedded_model)
    comment_matrix, comment_embedded_model = embed(data['comment'], comment_embedded_model)

    def _to_frame(matrix, prefix):
        # One column per embedding dimension, row-aligned with ``data``.
        matrix = np.asarray(matrix)
        names = [f"{prefix}_{i}" for i in range(matrix.shape[1])]
        return pd.DataFrame(matrix, columns=names, index=data.index)

    X_add = pd.concat(
        [_to_frame(summary_matrix, 'summary'), _to_frame(comment_matrix, 'comment')],
        axis=1,
    )
    Y_simple = data['rating']

    return X_add, Y_simple, summary_embedded_model, comment_embedded_model

# Process complet

In [55]:
def BatchProcess(data, embedding="simple", model="RandomForest", BatchSize=3000, random_state=None):
    """Train a classifier on embedded text features and report test accuracy.

    The data is split 80/20 into train and test sets. Both sets are embedded
    batch by batch through ``PreProcess``; the embedding models returned by one
    batch are passed on to the next. The classifier is then fitted ONCE on the
    features of all training batches: calling ``fit`` separately for each batch
    would restart training every time and keep only the last batch.

    Parameters
    ----------
    data : pandas.DataFrame
        Input rows; must provide what ``PreProcess`` needs.
    embedding : str, default "simple"
        Key of ``embedded_dict``. For "bert" the 'bert-base-uncased' model is
        loaded once and shared by both text columns.
    model : str or estimator, default "RandomForest"
        Forwarded to ``process_model``.
    BatchSize : int, default 3000
        Number of rows embedded per batch.
    random_state : int or None, default None
        Forwarded to ``train_test_split``; set it for a reproducible split.

    Returns
    -------
    The fitted classifier.

    Notes
    -----
    All training features are held in memory at once for the single fit.
    """
    data_train, data_test = train_test_split(data, test_size=0.2, random_state=random_state)

    summary_embedded_model = None
    comment_embedded_model = None
    if embedding == "bert":
        summary_embedded_model = BertModel.from_pretrained('bert-base-uncased')
        comment_embedded_model = summary_embedded_model

    print("Training the model:")
    train_features = []
    train_targets = []
    for true_batch in tqdm(BatchDataframe(data_train, BatchSize), desc="Training Progress"):
        # Work on a copy so the caller's frame is never touched.
        X_batch, Y_batch, summary_embedded_model, comment_embedded_model = PreProcess(
            true_batch.copy(), embedding, summary_embedded_model, comment_embedded_model
        )
        train_features.append(X_batch)
        train_targets.append(Y_batch)

    # Positional concatenation: features and targets are stacked in the same
    # batch order, so the original index labels are irrelevant here.
    X_train = pd.concat(train_features, ignore_index=True)
    Y_train = pd.concat(train_targets, ignore_index=True)
    model_chosen = process_model(X_train, Y_train, model)

    print("\nEvaluating the model:")
    y_true = []
    y_pred = []
    for true_batch in tqdm(BatchDataframe(data_test, BatchSize), desc="Evaluation Progress"):
        X_batch, Y_batch, summary_embedded_model, comment_embedded_model = PreProcess(
            true_batch.copy(), embedding, summary_embedded_model, comment_embedded_model
        )
        y_pred.append(np.asarray(model_chosen.predict(X_batch)))
        y_true.append(np.asarray(Y_batch))

    # Score over all test rows at once; averaging per-batch accuracies would
    # over-weight a smaller final batch.
    accuracy = accuracy_score(np.concatenate(y_true), np.concatenate(y_pred))

    print(f'Model Accuracy: {accuracy:.2f}')
    return model_chosen

In [56]:
# Load the formatted ratings and keep only rows that have both text fields.
ratings = (
    pd.read_csv(os.path.join(Generation_folder, 'ratings_formatted.csv'))
    .dropna(subset=['summary', 'comment'])
)

In [None]:
# Run the full pipeline on the first 20000 rows with BERT embeddings, 2000 rows per batch.
BatchProcess(ratings.head(20000),embedding="bert",BatchSize=2000)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training the model:


Training Progress:   0%|          | 0/8 [00:00<?, ?it/s]