# Importation des bibliothèques nécessaires

In [33]:
import numpy as np 
import pandas as pd

# Importation et affichage des données

In [34]:
file_path = "movie_review.csv"
df = pd.read_csv(file_path)
df.head()


Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos


# Suppression des colonnes inutiles

In [35]:
df.drop(["fold_id","cv_tag","html_id","sent_id"],axis=1,inplace=True)
df.head()

Unnamed: 0,text,tag
0,films adapted from comic books have had plenty...,pos
1,"for starters , it was created by alan moore ( ...",pos
2,to say moore and campbell thoroughly researche...,pos
3,"the book ( or "" graphic novel , "" if you will ...",pos
4,"in other words , don't dismiss this film becau...",pos


# Prétraitement des données textuelles

In [36]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [54]:
def preprocess_text_data(data_frame, column_name):
    stop_words = set(stopwords.words('english'))
    punctuation = string.punctuation
    
    
    data_frame[column_name] = data_frame[column_name].str.lower()
    data_frame[column_name] = data_frame[column_name].apply(lambda text: " ".join([word for word in str(text).split() if word not in stop_words]))
    translator = str.maketrans('', '', punctuation)
    data_frame[column_name] = data_frame[column_name].apply(lambda text: text.translate(translator))

preprocess_text_data(df,'text')
df.head()


Unnamed: 0,text,tag
0,films adapted comic books plenty success wheth...,pos
1,starters created alan moore eddie campbell bro...,pos
2,say moore campbell thoroughly researched subje...,pos
3,book graphic novel 500 pages long includes nea...,pos
4,words dismiss film source,pos


# Entraînement du modèle Word2Vec sur les données textuelles

In [40]:
word_lists = df['text'].apply(lambda text: text.split()).tolist()
model = Word2Vec(word_lists, vector_size=100, window=5, min_count=1, workers=4)
print(model.wv)


KeyedVectors<vector_size=100, 47498 keys>


# Vectorisation des reviews de movies

In [42]:
listes_mots = df['text'].apply(lambda x: x.split()).tolist()
def moyenne_Word2Vec(critique, modele, taille_vecteur):
    vecteurs = []
    for token in critique:
        if token in modele.wv:
            vecteurs.append(modele.wv[token])
    if vecteurs:
        return np.mean(vecteurs, axis=0)
    else:
        return np.zeros(taille_vecteur)
    
    
vecteurs_critiques = [moyenne_Word2Vec(tokens, model, 100) for tokens in listes_mots]


# Division des données

In [43]:
from sklearn.model_selection import train_test_split

In [45]:
X = vecteurs_critiques
Y = df['tag']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

# Construction d'un classificateur

In [46]:
from sklearn.linear_model import LogisticRegression

In [47]:
reg = LogisticRegression()
reg.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [48]:
reg.predict(X_test)

array(['pos', 'pos', 'neg', ..., 'pos', 'pos', 'pos'], dtype=object)

In [50]:
reg.score(X_test,y_test)

0.5634270704573547

# Évaluation du modèle

In [52]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [53]:
y_pred=reg.predict(X_test)

accuracy = accuracy_score(y_test,y_pred)
print("Accuracy:",accuracy)

precision = precision_score(y_test,y_pred,pos_label='pos')  
print("Precision:",precision)

recall = recall_score(y_test,y_pred,pos_label='pos')
print("Recall:",recall)

f1 = f1_score(y_test, y_pred,pos_label='pos')
print("F1-score:",f1)

Accuracy: 0.5634270704573547
Precision: 0.5600595016734846
Recall: 0.682477341389728
F1-score: 0.6152379655477633
