- Fichier de process général permettant la gestion par batch ainsi que le test de différentes combinaisons d'embeddings et de modèles
- Le traitement par batch permet d'exécuter le notebook sur les gros datasets, afin que tout le monde puisse l'exécuter (y compris le prof si nécessaire)
- Facilite le repérage des fonctions inutiles (et permet ainsi d'éviter de perdre des points)

# Importation

In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from transformers import GPT2Tokenizer, GPT2Model
from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score
import torch
import os
import pickle

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score
import time

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [2]:
# Directory holding the generated artifacts this notebook reads and writes
# (the formatted ratings CSV and the pickled model).
Generation_folder = "generated"

# Fonctions d'embedding

In [3]:
def retrieve_simple(data, model=None, size_vocabulary=1000):
    """Embed texts as bag-of-words counts over unigrams and bigrams.

    Parameters
    ----------
    data : iterable of str
        Raw documents to vectorize.
    model : fitted vectorizer, optional
        When provided it is reused as-is (``transform`` only), so that new
        batches are projected onto the vocabulary learned earlier. When
        ``None``, a new ``CountVectorizer`` is fitted on ``data``.
    size_vocabulary : int, default 1000
        Maximum vocabulary size of a newly created vectorizer. Ignored when
        ``model`` is supplied.

    Returns
    -------
    tuple
        ``(X, model)``: the dense array returned by ``toarray()`` and the
        vectorizer that produced it.
    """
    # Compare against None explicitly: an already-fitted vectorizer must be
    # reused even if the object happens to evaluate as falsy.
    if model is not None:
        X = model.transform(data)
    else:
        model = CountVectorizer(stop_words = "english", max_features = size_vocabulary,ngram_range=(1, 2))
        X = model.fit_transform(data)
    return X.toarray(),model

In [4]:
def retrieve_tf_idf(data, model=None, size_vocabulary=1000):
    """Embed texts as TF-IDF weights over unigrams and bigrams.

    Parameters
    ----------
    data : iterable of str
        Raw documents to vectorize.
    model : fitted vectorizer, optional
        When provided it is reused as-is (``transform`` only), so that new
        batches are projected onto the vocabulary learned earlier. When
        ``None``, a new ``TfidfVectorizer`` is fitted on ``data``.
    size_vocabulary : int, default 1000
        Maximum vocabulary size of a newly created vectorizer. Ignored when
        ``model`` is supplied.

    Returns
    -------
    tuple
        ``(X, model)``: the dense array returned by ``toarray()`` and the
        vectorizer that produced it.
    """
    # Compare against None explicitly: an already-fitted vectorizer must be
    # reused even if the object happens to evaluate as falsy.
    if model is not None:
        X = model.transform(data)
    else:
        model = TfidfVectorizer(stop_words = "english", max_features = size_vocabulary,ngram_range=(1, 2))
        X = model.fit_transform(data)
    return X.toarray(),model

In [5]:
# Tokenizers are loaded once at module level and shared by every embedding call.
Berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
GPTtokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 ships without a padding token. Reuse its end-of-text token for padding
# instead of registering a brand-new '[PAD]' token: a new token receives an id
# past the end of the pretrained embedding matrix, and the GPT2Model used with
# this tokenizer is never resized, so padded batches would index out of range.
GPTtokenizer.pad_token = GPTtokenizer.eos_token

1

In [6]:
def get_latent_representation(textList,tokenizer,model):
    """Encode texts with a transformer and keep the vector at position 0.

    Parameters
    ----------
    textList : list of str
        Texts to encode in a single forward pass.
    tokenizer : callable
        Called with the texts plus padding/truncation options (sequences are
        cut at 512 tokens) and must return the keyword inputs of ``model``.
    model : callable
        Called as ``model(**inputs)``; its result must expose
        ``last_hidden_state``.

    Returns
    -------
    tuple
        ``(embedding, model)`` where ``embedding`` is
        ``last_hidden_state[:, 0, :]`` and ``model`` is returned unchanged.
    """
    encoded = tokenizer(
        textList,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Inference only: no gradient bookkeeping during the forward pass.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Keep the first position of every sequence.
    # NOTE(review): presumably the [CLS] token for BERT; GPT-2 has no such
    # token, so confirm this is the intended pooling for that model.
    first_position = hidden_states[:, 0, :]
    return first_position, model


In [7]:
def _embed_with_bert(data, model):
    """Embed a pandas Series of texts using the shared BERT tokenizer."""
    return get_latent_representation(data.values.tolist(),Berttokenizer,model)


def _embed_with_gpt2(data, model):
    """Embed a pandas Series of texts using the shared GPT-2 tokenizer."""
    return get_latent_representation(data.values.tolist(),GPTtokenizer,model)


# Registry of embedding functions keyed by name. Every entry is called as
# ``f(data, model)`` and returns ``(matrix, model)``.
embedded_dict = dict(
    simple=retrieve_simple,
    tf_idf=retrieve_tf_idf,
    bert=_embed_with_bert,
    gpt2=_embed_with_gpt2,
)

# Fonctions de modèle

In [8]:
def Deepmodel():
    """Build the compiled Keras classifier together with its ``fit`` options.

    The network is a stack of two ReLU hidden layers (128 and 64 units)
    followed by a 5-unit softmax output, one unit per class.

    Returns
    -------
    tuple
        ``(model, training_arg)`` where ``training_arg`` holds the keyword
        arguments to forward to ``model.fit``.
    """
    layer_stack = [
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(5, activation='softmax'),  # 5 classes => 5 output units
    ]
    network = Sequential(layer_stack)

    # The sparse loss takes integer labels rather than one-hot vectors.
    # NOTE(review): the original comment stated labels are integers 1-5,
    # whereas this loss with 5 output units expects class indices 0-4;
    # confirm how the ratings are encoded before they reach fit().
    network.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )

    fit_options = dict(epochs=10, batch_size=32, verbose=0)
    return network, fit_options

In [24]:
# Registry of model factories keyed by name. Each value is a zero-argument
# callable returning ``(estimator, fit_kwargs)``; ``fit_kwargs`` is forwarded
# to ``estimator.fit`` by ``process_model``.
model_dict = {
    "RandomForest": (lambda : (RandomForestClassifier(n_estimators=100), {})),
    "GradienBoost": (lambda : (GradientBoostingClassifier(n_estimators=100), {})),
    "knn": (lambda :( KNeighborsClassifier(n_neighbors=5), {})),
    "naive": (lambda : (MultinomialNB(), {})),
    "logistic": (lambda : (LogisticRegression(max_iter=1000, multi_class='multinomial'), {})),
    # The parentheses must wrap the returned tuple: previously this entry was
    # a ``(lambda, {})`` tuple, so calling it raised "tuple is not callable".
    "decisiontree": (lambda : (DecisionTreeClassifier(), {})),
    "mlpClass": (lambda : (MLPClassifier(hidden_layer_sizes=(100,), max_iter=500), {})),
    "deep": Deepmodel
}

In [25]:
def process_model(X, Y, model=None, arg=None):
    """Fit a model on one batch and return it with its ``fit`` keyword arguments.

    Parameters
    ----------
    X, Y
        Features and labels forwarded to ``model.fit``.
    model : str or model instance
        A key of ``model_dict`` (a fresh model and its fit arguments are then
        created from the registry) or an existing object exposing ``fit``.
    arg : dict, optional
        Keyword arguments for ``fit`` when ``model`` is an instance. Replaced
        by the registry's arguments when ``model`` is a name.

    Returns
    -------
    tuple
        ``(model, arg)`` so the caller can pass both back in for the next batch.

    Raises
    ------
    ValueError
        If ``model`` is ``None`` or a name missing from ``model_dict``.

    Notes
    -----
    NOTE(review): ``fit`` is called on every invocation. scikit-learn
    estimators restart training from scratch on each ``fit`` call, so feeding
    successive batches this way presumably keeps only the last batch for those
    models; confirm this is acceptable or train them without batching.
    """
    if arg is None:
        arg = {}

    if isinstance(model, str):
        # Names are resolved through the registry; unknown names fail loudly
        # instead of surfacing later as "'str' object has no attribute 'fit'".
        if model not in model_dict:
            raise ValueError(
                f"Unknown model name {model!r}; available: {', '.join(model_dict)}"
            )
        model, arg = model_dict[model]()
    elif model is None:
        raise ValueError("process_model needs a model name or a model instance")

    model.fit(X, Y,**arg)
    return model, arg

# Prediction Transform

In [11]:
def _best_class_plus_one(data):
    """Return, for each row of scores, the index of the highest score plus one."""
    return np.argmax(data, axis=1) + 1


# Post-processing applied to the raw ``predict`` output, keyed by model name.
# Models without an entry have their predictions used as returned.
pred_transform_dict = {
    "deep": _best_class_plus_one,
}


# Fonctions utilitaires

In [12]:
def expand_matrix_column(df, column_name):
    """Spread a column of equal-length vectors into one column per component.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``column_name`` column stores one vector per row.
    column_name : str
        Column to expand; also used as prefix of the generated column names.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ``<column_name>_0 .. <column_name>_{k-1}``
        and a fresh default index (the index of ``df`` is not carried over).
    """
    # Stack the per-row vectors into a 2-D array (rows x components).
    stacked = np.array(df[column_name].tolist())
    n_components = stacked.shape[1]

    expanded_names = []
    for i in range(n_components):
        expanded_names.append(f"{column_name}_{i}")

    return pd.DataFrame(data=stacked, columns=expanded_names)

In [13]:
def BatchDataframe(df, Batch_size):
    """
    Split a DataFrame into consecutive batches of at most ``Batch_size`` rows.

    Rows are taken in their current order; this function does not shuffle.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to split.
    Batch_size : int
        Maximum number of rows per batch; must be strictly positive.

    Returns
    -------
    list of pandas.DataFrame
        The batches, in order. The last one may hold fewer rows. An empty
        ``df`` yields an empty list.

    Raises
    ------
    ValueError
        If ``Batch_size`` is zero or negative. A negative size would otherwise
        silently produce no batch at all and drop every row.
    """
    if Batch_size <= 0:
        raise ValueError(f"Batch_size must be a positive integer, got {Batch_size!r}")

    # Positional slicing keeps each batch contiguous and preserves the index.
    batches = [df.iloc[i:i + Batch_size] for i in range(0, len(df), Batch_size)]

    return batches

In [14]:
def PreProcess(data,embedding,summary_embedded_model,comment_embedded_model):
    """Turn the text columns of one batch into a numeric feature matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Batch with at least the ``summary``, ``comment`` and ``rating``
        columns. It is left untouched.
    embedding : str
        Key of ``embedded_dict`` selecting the embedding function.
    summary_embedded_model, comment_embedded_model
        Embedding models for each text column, or ``None``; they are passed
        to the embedding function, which returns the model to reuse next time.

    Returns
    -------
    tuple
        ``(X, Y, summary_embedded_model, comment_embedded_model)``. ``X`` holds
        the expanded summary columns followed by the expanded comment columns
        and has a fresh default index; ``Y`` is ``data['rating']`` with the
        original index. Rows of ``X`` and ``Y`` correspond by position.
    """
    embed = embedded_dict[embedding]
    summary_matrix, summary_embedded_model = embed(data['summary'],summary_embedded_model)
    comment_matrix, comment_embedded_model = embed(data['comment'],comment_embedded_model)

    # Store the embeddings in a scratch frame rather than writing them back
    # into the caller's DataFrame.
    embedded = pd.DataFrame({
        'summary': list(summary_matrix),
        'comment': list(comment_matrix),
    })

    X_add = pd.concat([
                    expand_matrix_column(embedded, 'summary'),
                    expand_matrix_column(embedded, 'comment')
                ], axis=1)

    Y_simple = data['rating']

    # The former X_simple / X_expanded frames were never returned (and their
    # concatenation mixed two different indexes), so they are no longer built.
    return X_add, Y_simple, summary_embedded_model, comment_embedded_model

# Process complet

In [15]:
def BatchProcess(data,embedding="simple",model="RandomForest",BatchSize=3000,enable_batch=True,random_state=None):
    """Train and evaluate one embedding/model combination, optionally by batch.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to use; 80% go to training and 20% to evaluation.
    embedding : str, default "simple"
        Key of ``embedded_dict``.
    model : str, default "RandomForest"
        Key of ``model_dict``.
    BatchSize : int, default 3000
        Rows per batch when ``enable_batch`` is true.
    enable_batch : bool, default True
        When false, the training set and the test set are each processed as a
        single batch.
    random_state : int, optional
        Seed forwarded to ``train_test_split`` to make the split reproducible.
        The default keeps the previous unseeded behaviour.

    Returns
    -------
    The trained model.

    Notes
    -----
    The embedding models fitted on the first training batch are reused for
    every later batch, including the evaluation batches.

    NOTE(review): ``process_model`` calls ``fit`` once per training batch.
    scikit-learn estimators restart from scratch on each ``fit`` call, so with
    several batches those models presumably retain only the last batch;
    confirm, or use ``enable_batch=False`` for them.

    NOTE(review): labels are passed to the model as found in ``rating``, while
    the "deep" predictions are shifted by +1 through ``pred_transform_dict``;
    confirm the label encoding matches what the Keras loss expects.
    """
    data_train, data_test = train_test_split(data, test_size=0.2, random_state=random_state)

    training_arg = {}

    if enable_batch:
        batch_train = BatchDataframe(data_train,BatchSize)
    else:
        batch_train = [data_train]

    # Transformer embeddings use one pretrained network shared by both text
    # columns; the vectorizer embeddings start unfitted (None).
    summary_embedded_model = None
    comment_embedded_model = None
    if(embedding=="bert"):
        summary_embedded_model = BertModel.from_pretrained('bert-base-uncased')
        comment_embedded_model = summary_embedded_model
    elif(embedding=="gpt2"):
        summary_embedded_model = GPT2Model.from_pretrained("gpt2")
        comment_embedded_model = summary_embedded_model
    model_chosen = model

    print("Training the model:")
    for true_batch in tqdm(batch_train, desc="Training Progress"):
        batch = true_batch.copy()

        X_expanded,Y_simple,summary_embedded_model,comment_embedded_model = PreProcess(batch,embedding,summary_embedded_model,comment_embedded_model)

        # After the first batch model_chosen is the model instance, which is
        # then passed back in together with its fit arguments.
        model_chosen, training_arg = process_model(X_expanded,Y_simple,model_chosen, training_arg)

    if enable_batch:
        batch_test = BatchDataframe(data_test,BatchSize)
    else:
        batch_test = [data_test]

    # Count correct predictions over all rows so a smaller last batch does not
    # weigh as much as a full one in the final accuracy.
    correct_predictions = 0
    evaluated_rows = 0

    print("\nEvaluating the model:")
    for true_batch in tqdm(batch_test, desc="Evaluation Progress"):
        batch = true_batch.copy()

        X_expanded,Y_simple,summary_embedded_model,comment_embedded_model = PreProcess(batch,embedding,summary_embedded_model,comment_embedded_model)

        y_pred = model_chosen.predict(X_expanded)

        if model in pred_transform_dict:
            y_pred = pred_transform_dict[model](y_pred)

        # normalize=False returns the number of correctly classified rows.
        correct_predictions += accuracy_score(Y_simple, y_pred, normalize=False)
        evaluated_rows += len(Y_simple)

    overall_accuracy = correct_predictions / evaluated_rows if evaluated_rows else float("nan")
    print(f'Model Accuracy: {overall_accuracy:.2f}')
    return model_chosen

In [16]:
# Load the formatted ratings and keep only the rows that have both a summary
# and a comment.
ratings_path = os.path.join(Generation_folder, 'ratings_formatted.csv')
ratings = pd.read_csv(ratings_path).dropna(subset=['summary', 'comment'])

# Embeddings disponibles

In [17]:
# One registered embedding name per line (iterating a dict yields its keys).
print(*embedded_dict, sep="\n")

simple
tf_idf
bert
gpt2


# Modèles disponibles

In [18]:
# One registered model name per line (iterating a dict yields its keys).
print(*model_dict, sep="\n")

RandomForest
GradienBoost
knn
naive
logistic
decisiontree
mlpClass
deep


# Execute Process

In [27]:
# Train and evaluate the "deep" model on TF-IDF features, using the first
# 10,000 rating rows processed in batches of 1,000 rows.
# NOTE(review): the recorded run reports an accuracy of 0.05, which looks
# suspiciously low; check the label encoding used for the "deep" model.
modelProcess = BatchProcess(ratings.head(10000),embedding="tf_idf",model="deep",BatchSize=1000,enable_batch=True)

Training the model:


Training Progress: 100%|██████████| 8/8 [00:12<00:00,  1.58s/it]



Evaluating the model:


Evaluation Progress:   0%|          | 0/2 [00:00<?, ?it/s]



Evaluation Progress:  50%|█████     | 1/2 [00:00<00:00,  2.98it/s]



Evaluation Progress: 100%|██████████| 2/2 [00:00<00:00,  3.10it/s]

Model Accuracy: 0.05





In [20]:
# Save the trained model to <Generation_folder>/model.pkl with pickle.
with open(os.path.join(Generation_folder,"model.pkl"), "wb") as f:
    pickle.dump(modelProcess, f)

In [21]:
# Load the model back from <Generation_folder>/model.pkl.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this notebook or come from a trusted source.
with open(os.path.join(Generation_folder,"model.pkl"), "rb") as f:
    modelPickle = pickle.load(f)

In [22]:
# Display the reloaded object to check that the save/load round trip worked.
modelPickle

<keras.src.engine.sequential.Sequential at 0x214991dc0d0>