# Lire les données

In [31]:
import numpy as np 
import pandas as pd
import os
from nltk.corpus import stopwords
import string
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read dataset
df = pd.read_csv("/kaggle/input/movie-review/movie_review.csv")
df.drop(["fold_id","cv_tag","html_id","sent_id"], axis=1, inplace=True)
df.head()

Unnamed: 0,text,tag
0,films adapted from comic books have had plenty...,pos
1,"for starters , it was created by alan moore ( ...",pos
2,to say moore and campbell thoroughly researche...,pos
3,"the book ( or "" graphic novel , "" if you will ...",pos
4,"in other words , don't dismiss this film becau...",pos


# Pre-processing des données textuelles

In [32]:
STOPWORDS = set(stopwords.words('english'))
PUNCT_TO_REMOVE = string.punctuation

# Prétraitement des données textuelles
df['text'] = df['text'].str.lower()
df['text'] = df['text'].apply(lambda x: " ".join([word for word in str(x).split() if word not in STOPWORDS]))
df['text'] = df['text'].apply(lambda text: text.translate(str.maketrans('', '', PUNCT_TO_REMOVE)))

# Affichage des premières lignes du DataFrame
df.head()

Unnamed: 0,text,tag
0,films adapted comic books plenty success whet...,pos
1,starters created alan moore eddie campbell ...,pos
2,say moore campbell thoroughly researched subje...,pos
3,book graphic novel 500 pages long include...,pos
4,words dismiss film source,pos


# Entraînement du modèle Word2Vec

In [33]:
from gensim.models import Word2Vec

# Tokenization des reviews
tokenized_reviews = [nltk.word_tokenize(review) for review in df['text']]

# Entraînement du modèle Word2Vec
word2vec_model = Word2Vec(tokenized_reviews, vector_size=100, window=5, min_count=1, workers=4)

# Vectorisation des reviews de movies

In [34]:
reviews_list = df['text'].apply(lambda x: x.split()).tolist()

def moyenne_Word2Vec(review, model, vector_size):
    vectors = []
    for token in review:
        if token in model.wv:
            vectors.append(model.wv[token])
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(vector_size)

review_vectors = [moyenne_Word2Vec(tokens, word2vec_model, vector_size=100) for tokens in reviews_list]

# Division des données

In [35]:
X = review_vectors
y = df['tag']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Construction d'un classificateur

In [36]:
# Initialisation du modèle Logistic Regression
model = LogisticRegression()

# Entraînement du modèle
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Évaluation du modèle

In [37]:
# Prédiction sur l'ensemble de test
y_pred = model.predict(X_test)

# Calcul des métriques d'évaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Affichage des métriques d'évaluation
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.5669035846724351
Precision: 0.5682093704083013
Recall: 0.5669035846724351
F1 Score: 0.561589659012406
