In [76]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score
import torch
import os

In [77]:
Generation_folder = 'generated' 

In [78]:
def retrieve_simple(data):
    size_vocabulary = 1000
    vectorizer = CountVectorizer(stop_words = "english", max_features = size_vocabulary)
    X = vectorizer.fit_transform(data)
    return X

In [79]:
embedded_dict = {
    "simple": retrieve_simple
}

In [80]:
def retrieve_embedding(data, method="simple"):
    if method in embedded_dict:
        return embedded_dict[method](data)
    else:
        raise Exception("Methode non implementé")

In [83]:
ratings = pd.read_csv(os.path.join(Generation_folder, 'ratings_formatted.csv')).dropna(subset=['summary']).dropna(subset=['comment']).head(10000)

In [84]:
ratings['summary_embedding'] = list(retrieve_simple(ratings['summary']).toarray())
ratings['comment_embedding'] = list(retrieve_simple(ratings['comment']).toarray())


In [85]:
ratings

Unnamed: 0,Titre,rating,summary,comment,utilite_num,auteur_note,auteur_nb,genre_note,genre_nb,editor_note,editor_nb,summary_embedding,comment_embedding
0,Its Only Art If Its Well Hung!,2,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...,1.000000,4.000000,1.0,4.335691,13149.0,2.500000,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Dr. Seuss: American Icon,2,Really Enjoyed It,I don't care much for Dr. Seuss but after read...,1.000000,4.600000,10.0,4.258888,107788.0,4.212751,2886.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Dr. Seuss: American Icon,2,Essential for every personal and Public Library,"If people become the books they read and if ""t...",0.909091,4.600000,10.0,4.258888,107788.0,4.212751,2886.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Dr. Seuss: American Icon,2,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D...",1.000000,4.600000,10.0,4.258888,107788.0,4.212751,2886.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Dr. Seuss: American Icon,2,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...,1.000000,4.600000,10.0,4.258888,107788.0,4.212751,2886.0,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9998,Fish Face,2,"best book ever, picture or otherwise",we bought this book first at the monterey aqua...,0.000000,4.339748,1351.0,4.453755,207535.0,4.188975,118095.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9999,"The Solar Home Book: Heating, Cooling and Desi...",2,kebin,A very good book. Nice clear pictures and diag...,1.000000,5.000000,2.0,4.333333,3.0,4.571429,49.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
10000,"The Solar Home Book: Heating, Cooling and Desi...",2,"Published a Long Time ago, but still relevant ...",The subject of solar energy is more topical to...,0.000000,5.000000,2.0,4.333333,3.0,4.571429,49.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
10001,Foundation,0,Not my style,"The book covers several generations, so with e...",0.000000,4.277138,2432.0,4.275266,19047.0,4.279051,2444.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [52]:
ratings = ratings.drop(columns=['summary','comment','Titre'])

In [53]:
X_simple = ratings.drop(columns=['rating'])
Y_simple = ratings['rating'] 

In [54]:
X_simple_train, X_simple_test, y_simple_train, y_simple_test = train_test_split(X_simple, Y_simple, test_size=0.2)
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_simple_train, y_simple_train)

# Predict on test data
y_pred = rf.predict(X_simple_test)

# Evaluate model
accuracy = accuracy_score(y_simple_test, y_pred)
print(f'Random Forest Accuracy: {accuracy:.2f}')

ValueError: setting an array element with a sequence.