# Importation

In [124]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score
import torch
import os

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score
import time

# Fonction d'embedding

In [125]:
def retrieve_simple(data,model=None):
    size_vocabulary = 1000
    X = None
    if model :
        X = model.transform(data)
    else:
        model = CountVectorizer(stop_words = "english", max_features = size_vocabulary,ngram_range=(1, 2))
        X = model.fit_transform(data)
    return X,model

In [126]:
def retrieve_tf_idf(data,model=None):
    size_vocabulary = 1000
    X = None
    if model :
        X = model.transform(data)
    else:
        model = TfidfVectorizer(stop_words = "english", max_features = size_vocabulary,ngram_range=(1, 2))
        X = model.fit_transform(data)
    return X,model

In [127]:
embedded_dict = {
    "simple": retrieve_simple,
    "tf_idf": retrieve_tf_idf
}

# Fonction de model

In [128]:
model_dict = {
    "RandomForest": (lambda : RandomForestClassifier(n_estimators=100)),
    "GradienBoost": (lambda : GradientBoostingClassifier(n_estimators=100))
}

In [129]:
def process_model(X,Y,model=None):
    if model in model_dict:
        model = model_dict[model]()
    
    model.fit(X, Y)
    return model

# Fonction utilitaires

In [130]:
def expand_matrix_column(df, column_name):
    # Convertir la liste de listes en array numpy
    matrix = np.array(df[column_name].tolist())
    # Créer des noms de colonnes
    column_names = [f"{column_name}_{i}" for i in range(matrix.shape[1])]
    # Retourner un nouveau DataFrame avec les colonnes expandées
    return pd.DataFrame(matrix, columns=column_names)

In [131]:
def BatchDataframe(df, Batch_size):
    """
    Renvoie une liste de sous-DataFrames en batchs de taille Batch_size, après mélange des données.

    Paramètres :
    df : pandas.DataFrame
        Le DataFrame à diviser en lots.
    Batch_size : int
        La taille de chaque lot.

    Retour :
    list of pandas.DataFrame
        Une liste de DataFrames contenant les lots mélangés.
    """
    
    # Diviser en batchs
    batches = [df.iloc[i:i + Batch_size] for i in range(0, len(df), Batch_size)]
    
    return batches

In [132]:
def PreProcess(data,embedding,summary_embedded_model,comment_embedded_model):
    summary_matrix, summary_embedded_model = embedded_dict[embedding](data['summary'],summary_embedded_model)
    comment_matrix, comment_embedded_model = embedded_dict[embedding](data['comment'],comment_embedded_model)
    
    data['summary'] = list(summary_matrix.toarray())
    data['comment'] = list(comment_matrix.toarray())

    data = data.drop(columns=['Titre']) # A faire plus tot

    X_simple = data.drop(columns=['rating'])
    Y_simple = data['rating'] 

    X_add = pd.concat([
                    expand_matrix_column(data, 'summary'),
                    expand_matrix_column(data, 'comment')
                ], axis=1)
    
    X_simple.drop(columns=['summary', 'comment'])

    X_expanded = pd.concat([
                    X_simple,
                    X_add
                ], axis=1)
    return X_add, Y_simple, summary_embedded_model, comment_embedded_model

# Process complet

In [135]:
def BatchProcess(data,embedding="simple",model="RandomForest",BatchSize=3000):
    data_train, data_test = train_test_split(data, test_size=0.2)

    batch_train = BatchDataframe(data_train,BatchSize)

    summary_embedded_model = None
    comment_embedded_model = None
    model_chosen = model

    print("Training the model:")
    for true_batch in tqdm(batch_train, desc="Training Progress"):
        batch = true_batch.copy()

        X_expanded,Y_simple,summary_embedded_model,comment_embedded_model = PreProcess(batch,embedding,summary_embedded_model,comment_embedded_model)
        
        model_chosen = process_model(X_expanded,Y_simple,model_chosen)
    
    accuracy_process = np.array([])


    batch_test = BatchDataframe(data_train,BatchSize)
    print("\nEvaluating the model:")
    for true_batch in tqdm(batch_test, desc="Evaluation Progress"):
        batch = true_batch.copy()

        X_expanded,Y_simple,summary_embedded_model,comment_embedded_model = PreProcess(batch,embedding,summary_embedded_model,comment_embedded_model)

        y_pred = model_chosen.predict(X_expanded)
        accuracy = accuracy_score(Y_simple, y_pred)
        accuracy_process = np.append(accuracy_process,accuracy)

    print(f'Model Accuracy: {np.mean(accuracy_process):.2f}')
    return model_chosen

In [120]:
ratings = pd.read_csv(os.path.join(Generation_folder, 'ratings_formatted.csv')).dropna(subset=['summary']).dropna(subset=['comment']).head(8000)

In [136]:
BatchProcess(ratings,BatchSize=2000)

Training the model:


Training Progress: 100%|██████████| 4/4 [00:07<00:00,  1.78s/it]



Evaluating the model:


Evaluation Progress:  25%|██▌       | 1/4 [00:00<00:01,  2.57it/s]

0.7685


Evaluation Progress:  50%|█████     | 2/4 [00:00<00:00,  2.54it/s]

0.767


Evaluation Progress: 100%|██████████| 4/4 [00:01<00:00,  3.13it/s]

0.764
0.9975
Model Accuracy: 0.82





RandomForestClassifier()