#Pre-processing des données


In [20]:
#Librairies
from google.colab import drive
import numpy as np
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [5]:
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
file_path = '/content/drive/My Drive/nlp_data/movie_review.csv'
df = pd.read_csv(file_path)
df.head()


Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos


In [7]:
#Normalisation en minuscules les données textuelles
df['text'] = df['text'].str.lower()

In [8]:
#Afficher dataset à nouveau
df.head()

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos


In [9]:
#Suppression de la Ponctuation


def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    text_without_punctuation = text.translate(translator)
    return text_without_punctuation


df['text'] = df['text'].apply(remove_punctuation)
df.head()


Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,for starters it was created by alan moore an...,pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,the book or graphic novel if you will is ...,pos
4,0,cv000,29590,4,in other words dont dismiss this film because...,pos


In [10]:
#Télécharger la liste des stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
#Supprimer stopwords

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Applique la fonction remove_stopwords à la colonne "text"
df['text'] = df['text'].apply(remove_stopwords)
print(df['text'].head())

0    films adapted comic books plenty success wheth...
1    starters created alan moore eddie campbell bro...
2    say moore campbell thoroughly researched subje...
3    book graphic novel 500 pages long includes nea...
4                       words dont dismiss film source
Name: text, dtype: object


In [12]:
#télécharger les données nécessaires pour utiliser le tokenizer (tokeniseur) de NLTK
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [13]:
#Tokenization
def tokenize_text(text):

    tokens = word_tokenize(text)
    return tokens

df['tokens'] = df['text'].apply(tokenize_text)

# Afficher le résultat
print(df[['text', 'tokens']].head())

                                                text  \
0  films adapted comic books plenty success wheth...   
1  starters created alan moore eddie campbell bro...   
2  say moore campbell thoroughly researched subje...   
3  book graphic novel 500 pages long includes nea...   
4                     words dont dismiss film source   

                                              tokens  
0  [films, adapted, comic, books, plenty, success...  
1  [starters, created, alan, moore, eddie, campbe...  
2  [say, moore, campbell, thoroughly, researched,...  
3  [book, graphic, novel, 500, pages, long, inclu...  
4               [words, dont, dismiss, film, source]  


#Entraînement du modèle Word2Vec

In [15]:
model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=1, workers=4)


#Vectorisation des reviews de movies

In [17]:
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0.
    for word in words:
        if word in vocabulary:
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model.wv[word])

    if nwords:
        feature_vector = np.divide(feature_vector, nwords)

    return feature_vector

df['Vector'] = df['tokens'].apply(lambda x: average_word_vectors(x, model, model.wv.index_to_key, 100))


#Division des données



In [23]:
X_train, X_test, y_train, y_test = train_test_split(df['Vector'].tolist(), df['tag'], test_size=0.2, random_state=42)

#Construction d'un classificateur

In [24]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#Évaluation du modèle

In [25]:
#predictions
predictions = model.predict(X_test)

In [26]:
#calcul de accuracy
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")


Accuracy: 0.570225587144623, Precision: 0.571958391346066, Recall: 0.570225587144623, F1 Score: 0.5647957482083156
