In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score
import torch
import os

from transformers import BertTokenizer

In [2]:
# Base directory for generated data files; it is joined with a file name
# (via os.path.join) when the ratings CSV is loaded.
Generation_folder = "generated"

# Avant apprentissage

In [17]:
def get_bert_latent_representation(textList, tokenizer, model, batch_size=None):
    """Return, for each input text, the last hidden state at token position 0.

    With a BERT tokenizer, position 0 is the ``[CLS]`` token, so the result
    can be used as one fixed-size vector per text.

    Args:
        textList: A string or a list of strings to embed.
        tokenizer: Callable invoked as
            ``tokenizer(texts, return_tensors='pt', padding=True,
            truncation=True, max_length=512)``; its result is unpacked as
            keyword arguments into ``model``.
        model: Callable whose result exposes ``last_hidden_state``, indexed
            as ``[text, token, hidden]``.
        batch_size: Optional positive integer. When given, the texts are
            encoded in chunks of at most ``batch_size`` and the per-chunk
            results are concatenated, which bounds peak memory on large
            corpora. ``None`` (the default) encodes everything in a single
            forward pass.

    Returns:
        A tensor with one row per input text.

    Raises:
        ValueError: If ``batch_size`` is given but is not positive.
    """

    def _encode(texts):
        # Encode one batch and keep only the vector at token position 0.
        inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        # Inference only: skip gradient tracking to save memory and time.
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state[:, 0, :]

    if batch_size is None:
        return _encode(textList)

    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # A bare string must not be sliced character by character below.
    texts = [textList] if isinstance(textList, str) else list(textList)
    if len(texts) <= batch_size:
        # Nothing to split (this also covers empty input).
        return _encode(texts)

    chunks = [
        _encode(texts[start:start + batch_size])
        for start in range(0, len(texts), batch_size)
    ]
    return torch.cat(chunks, dim=0)



In [4]:
# Load the pretrained tokenizer and encoder from the same checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)


def _embed_simple(data):
    """Embed ``data`` with the module-level ``tokenizer`` and ``model``."""
    return get_bert_latent_representation(data, tokenizer, model)


# Registry of available embedding strategies, keyed by method name.
embedded_dict = {"simple": _embed_simple}

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
def retrieve_embedding(data, method="simple"):
    """Embed ``data`` with the strategy registered under ``method``.

    Args:
        data: Input forwarded unchanged to the selected embedding callable.
        method: Key into the module-level ``embedded_dict`` registry.

    Returns:
        Whatever the selected embedding callable returns for ``data``.

    Raises:
        NotImplementedError: If ``method`` is not a key of ``embedded_dict``.
            (Still an ``Exception`` subclass, so broad handlers keep working.)
    """
    if method not in embedded_dict:
        # Name the offending key and the valid ones so the error is actionable.
        raise NotImplementedError(
            f"Methode non implementé: {method!r} "
            f"(disponibles: {sorted(embedded_dict)})"
        )
    return embedded_dict[method](data)

In [8]:
# Read the formatted ratings table from the generation folder.
ratings_path = os.path.join(Generation_folder, 'ratings_formatted.csv')
ratings = pd.read_csv(ratings_path)

In [13]:
# Take an independent copy of the first 10 rows: columns of this sample are
# assigned to later on, and writing into a slice of `ratings` would trigger
# pandas' SettingWithCopyWarning.
sample_text = ratings.head(10).copy()

In [14]:
# Show the sample's summaries as a plain Python list.
summary_texts = sample_text['summary'].values.tolist()
summary_texts

['Nice collection of Julie Strain images',
 'Really Enjoyed It',
 'Essential for every personal and Public Library',
 'Phlip Nel gives silly Seuss a serious treatment',
 'Good academic overview',
 "One of America's greatest creative talents",
 "A memorably excellent survey of Dr. Seuss' many achievements",
 "Academia At It's Best",
 'And to think that I read it on the tram!',
 'Fascinating account of a genius at work']

In [21]:
# Embed the sample summaries with the "simple" strategy and report how many
# rows come back.
summary_texts = sample_text['summary'].values.tolist()
embed_simple = embedded_dict["simple"]
latent_representation = embed_simple(summary_texts)
len(latent_representation)

10

In [None]:
# Embed both text columns of the sample. The embedding call returns one 2-D
# tensor (one row per text), which cannot be stored directly in a single
# DataFrame column, so it is split into one 1-D NumPy vector per row.
summary_vectors = list(
    retrieve_embedding(sample_text['summary'].values.tolist(), method="simple")
    .detach().cpu().numpy()
)
comment_vectors = list(
    retrieve_embedding(sample_text['comment'].values.tolist(), method="simple")
    .detach().cpu().numpy()
)

# `assign` builds a new frame instead of writing into a slice of `ratings`.
# NOTE: this replaces the text columns with vectors, so the cell should be run
# only once per freshly created `sample_text`.
sample_text = sample_text.assign(summary=summary_vectors, comment=comment_vectors)
sample_text['summary'].head(2)

In [28]:
# Drop the title column. errors='ignore' keeps this cell re-runnable: on a
# second execution the column is already gone and a plain drop would raise
# KeyError.
ratings = ratings.drop(columns=['Titre'], errors='ignore')


In [29]:
# Separate the target column from the remaining columns.
target_column = 'rating'
Y_simple = ratings[target_column]
X_simple = ratings.drop(columns=[target_column])

In [30]:
# 1. Turn a matrix-valued column into individual numeric columns
def expand_matrix_column(df, column_name, verbose=True):
    """Expand a column of equal-length sequences into one column per element.

    Args:
        df: DataFrame containing ``column_name``.
        column_name: Name of a column whose cells are equal-length sequences
            (for example lists or 1-D arrays).
        verbose: When True (the default), print the first two rows of the
            expanded frame as a quick visual check.

    Returns:
        A new DataFrame with columns ``{column_name}_0 .. {column_name}_{k-1}``
        that shares the index of ``df``, so it aligns with the source rows.

    Raises:
        ValueError: If the column does not stack into a 2-D array (for
            example scalar cells, or an empty frame).
    """
    # Stack the per-row sequences into a single 2-D array (rows x elements).
    matrix = np.array(df[column_name].tolist())
    if matrix.ndim != 2:
        raise ValueError(
            f"Column {column_name!r} must contain equal-length sequences; "
            f"got an array of shape {matrix.shape}"
        )

    column_names = [f"{column_name}_{i}" for i in range(matrix.shape[1])]

    # Build the frame once and keep the source index so the result lines up
    # with `df` when concatenated or joined.
    expanded = pd.DataFrame(matrix, columns=column_names, index=df.index)
    if verbose:
        print(expanded.head(2))
    return expanded

# 2. Expand every matrix-valued column and place the results side by side
matrix_columns = ['summary', 'comment']
expanded_parts = [expand_matrix_column(ratings, name) for name in matrix_columns]
X_expanded = pd.concat(expanded_parts, axis=1)


   summary_0  summary_1  summary_2  summary_3  summary_4  summary_5  \
0        0.0        0.0        0.0        0.0        0.0        0.0   
1        0.0        0.0        0.0        0.0        0.0        0.0   

   summary_6  summary_7  summary_8  summary_9  ...  summary_990  summary_991  \
0        0.0        0.0        0.0        0.0  ...          0.0          0.0   
1        0.0        0.0        0.0        0.0  ...          0.0          0.0   

   summary_992  summary_993  summary_994  summary_995  summary_996  \
0          0.0          0.0          0.0          0.0          0.0   
1          0.0          0.0          0.0          0.0          0.0   

   summary_997  summary_998  summary_999  
0          0.0          0.0          0.0  
1          0.0          0.0          0.0  

[2 rows x 1000 columns]
   comment_0  comment_1  comment_2  comment_3  comment_4  comment_5  \
0        0.0        0.0        0.0        0.0        0.0        0.0   
1        0.0        0.0        0.0    

# model

## RandomForestClassifier

In [35]:
# 3. Train and evaluate a random forest on the expanded features.
# A fixed random_state makes both the train/test split and the forest
# reproducible, so the reported accuracy is the same on every run.
X_simple_train, X_simple_test, y_simple_train, y_simple_test = train_test_split(
    X_expanded, Y_simple, test_size=0.2, random_state=42
)
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_simple_train, y_simple_train)

# Prediction and evaluation
y_pred = model_rf.predict(X_simple_test)
accuracy = accuracy_score(y_simple_test, y_pred)
print(f'Random Forest Accuracy: {accuracy:.2f}')


Random Forest Accuracy: 0.59


## KNN

In [36]:
# Train and evaluate a k-nearest-neighbours classifier on the expanded
# features. A fixed random_state makes the train/test split reproducible.
X_simple_train, X_simple_test, y_simple_train, y_simple_test = train_test_split(
    X_expanded, Y_simple, test_size=0.2, random_state=42
)
# Use a dedicated name so this classifier does not overwrite `model_rf`.
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(X_simple_train, y_simple_train)

# Prediction and evaluation
y_pred = model_knn.predict(X_simple_test)
accuracy = accuracy_score(y_simple_test, y_pred)
# Label the score with the model that actually produced it.
print(f'KNN Accuracy: {accuracy:.2f}')


Random Forest Accuracy: 0.57
