In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score
import torch
import os

# Avant apprentissage

In [23]:
def retrieve_simple(data, size_vocabulary=1000, ngram_range=(1, 2)):
    """Turn a collection of text documents into a TF-IDF feature matrix.

    A new ``TfidfVectorizer`` is fitted on ``data`` at every call and is not
    returned, so the columns produced by two separate calls refer to
    different vocabularies and the fitted vocabulary cannot be reused to
    transform other documents.

    Parameters
    ----------
    data : iterable of str
        Raw documents, one string per entry.
    size_vocabulary : int, default 1000
        Upper bound on the number of terms kept (passed as ``max_features``).
    ngram_range : tuple of (int, int), default (1, 2)
        Smallest and largest n-gram sizes extracted; the default keeps
        unigrams and bigrams.

    Returns
    -------
    sparse matrix
        One row per document and at most ``size_vocabulary`` columns.
    """
    vectorizer = TfidfVectorizer(
        stop_words="english",
        max_features=size_vocabulary,
        ngram_range=ngram_range,
    )
    return vectorizer.fit_transform(data)

In [24]:
# Registry of embedding strategies: method name -> function that vectorizes the text.
embedded_dict = dict(simple=retrieve_simple)

In [25]:
def retrieve_embedding(data, method="simple"):
    """Vectorize ``data`` with the embedding strategy registered under ``method``.

    Parameters
    ----------
    data
        Passed unchanged to the selected embedding function.
    method : str, default "simple"
        Key looked up in ``embedded_dict``.

    Returns
    -------
    Whatever the selected embedding function returns.

    Raises
    ------
    ValueError
        If ``method`` is not a key of ``embedded_dict``. ``ValueError`` is a
        subclass of ``Exception``, so handlers written for the previous bare
        ``Exception`` still catch it.
    """
    if method not in embedded_dict:
        # Name the offending value and the valid choices so the caller can
        # fix the call without opening the registry.
        available = ", ".join(map(str, embedded_dict))
        raise ValueError(
            f"Methode non implementé: {method!r} (disponibles: {available})"
        )
    return embedded_dict[method](data)

In [26]:
# Load only the first 10,000 rows. Passing nrows lets the parser stop early
# instead of reading the whole file and discarding the rest with .head().
# Note: column dtypes are therefore inferred from these rows only.
ratings = pd.read_csv('../V0/ratings_formatted.csv', nrows=10000)


In [27]:
# Replace each raw-text column with its TF-IDF vectors (one dense array per row).
# Each column gets its own freshly fitted vocabulary.
# NOTE(review): this overwrites the text in place, so the cell cannot be re-run
# without reloading `ratings` first.
for text_column in ('summary', 'comment'):
    tfidf_rows = retrieve_simple(ratings[text_column]).toarray()
    ratings[text_column] = list(tfidf_rows)
ratings['summary'].head(2)

0    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: summary, dtype: object

In [28]:
# errors='ignore' keeps this cell re-runnable: once 'Titre' has been dropped,
# running the cell again is a no-op instead of raising KeyError.
ratings = ratings.drop(columns=['Titre'], errors='ignore')


In [29]:
# Target: the 'rating' column. Features: every remaining column.
Y_simple = ratings['rating']
X_simple = ratings.drop(columns='rating')

In [30]:
# 1. Convert vector-valued columns into individual scalar columns
def expand_matrix_column(df, column_name):
    """Spread a column of vectors over one scalar column per component.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to expand.
    column_name : str
        Name of a column whose cells each hold a sequence of numbers.
        Assumes every row holds a sequence of the same length, so that the
        stacked values form a 2-D array.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` and one column per vector component, named
        ``f"{column_name}_{i}"``. The result carries a fresh default index,
        not the index of ``df``.

    Side effects
    ------------
    Prints the first two rows of the result as a quick preview.
    """
    # Stack the per-row vectors into a single array
    matrix = np.array(df[column_name].tolist())
    # One output column per vector component
    column_names = [f"{column_name}_{i}" for i in range(matrix.shape[1])]
    # Build the frame once and reuse it for both the preview and the return value
    expanded = pd.DataFrame(matrix, columns=column_names)
    print(expanded.head(2))
    return expanded

# 2. Expand every vector-valued column and place the results side by side
X_expanded = pd.concat(
    [expand_matrix_column(ratings, vector_column) for vector_column in ('summary', 'comment')],
    axis=1,
)


   summary_0  summary_1  summary_2  summary_3  summary_4  summary_5  \
0        0.0        0.0        0.0        0.0        0.0        0.0   
1        0.0        0.0        0.0        0.0        0.0        0.0   

   summary_6  summary_7  summary_8  summary_9  ...  summary_990  summary_991  \
0        0.0        0.0        0.0        0.0  ...          0.0          0.0   
1        0.0        0.0        0.0        0.0  ...          0.0          0.0   

   summary_992  summary_993  summary_994  summary_995  summary_996  \
0          0.0          0.0          0.0          0.0          0.0   
1          0.0          0.0          0.0          0.0          0.0   

   summary_997  summary_998  summary_999  
0          0.0          0.0          0.0  
1          0.0          0.0          0.0  

[2 rows x 1000 columns]
   comment_0  comment_1  comment_2  comment_3  comment_4  comment_5  \
0        0.0        0.0        0.0        0.0        0.0        0.0   
1        0.0        0.0        0.0    

# model

## RandomForestClassifier

In [35]:
# 3. The expanded features can now be used for training.
# random_state is fixed for both the split and the forest so that a fresh
# "Restart & Run All" reproduces the same accuracy.
X_simple_train, X_simple_test, y_simple_train, y_simple_test = train_test_split(
    X_expanded, Y_simple, test_size=0.2, random_state=42
)
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_simple_train, y_simple_train)

# Prediction and evaluation on the held-out 20 % of the rows
y_pred = model_rf.predict(X_simple_test)
accuracy = accuracy_score(y_simple_test, y_pred)
print(f'Random Forest Accuracy: {accuracy:.2f}')


Random Forest Accuracy: 0.59


## KNN

In [36]:
# 3. Train and evaluate a k-nearest-neighbours classifier on the expanded features.
# random_state is fixed so that the train/test split is reproducible.
X_simple_train, X_simple_test, y_simple_train, y_simple_test = train_test_split(
    X_expanded, Y_simple, test_size=0.2, random_state=42
)
# Stored under its own name so it does not overwrite a model held in `model_rf`.
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(X_simple_train, y_simple_train)

# Prediction and evaluation on the held-out 20 % of the rows
y_pred = model_knn.predict(X_simple_test)
accuracy = accuracy_score(y_simple_test, y_pred)
print(f'KNN Accuracy: {accuracy:.2f}')


Random Forest Accuracy: 0.57
