In [1]:
import numpy as np
import pandas as pd

In [2]:
# load the processed reviews from the Excel file and preview the first rows
reviews = pd.read_excel('../data/processed/reviews.xlsx')
reviews.head()

Unnamed: 0,review_id,bank_id,review_date,review_stars,review_text,reviewer_name,reviewer_num_reviews
0,0,608978182545380096,2023-02-27,5.0,Une très belle équipe .\nJe recommande spécial...,anonymous,12.0
1,1,608978182545380096,2022-09-28,5.0,,anonymous,0.0
2,2,608978182545380096,2022-05-27,2.0,,anonymous,49.0
3,3,608978182545380096,2022-04-08,1.0,,anonymous,2.0
4,4,608978182545380096,2022-01-08,1.0,Mauvais service client,anonymous,2.0


In [3]:
from transformers import pipeline

In [4]:
# build the sentiment-analysis pipeline around the multilingual nlptown BERT model
# (the labels it returns are parsed as star ratings further down in this notebook)
sentiment_pipeline = pipeline('sentiment-analysis', model='nlptown/bert-base-multilingual-uncased-sentiment')

Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


The approach we'll take here is the following:

- We know that for a review to be published, it must have either a number of stars or a text comment.
- The model for our sentiment analysis gives us an equivalent number of stars for the comment that we give to it
- We are faced with the following cases:
  - If we have both a text and stars, we take the average of the stars the model gives to the text and the stars the user entered
  - If we only have a text, we only use the stars the model derives from it
  - If we only have stars, we use them directly for the sentiment analysis
- When the final star score is 4 or more, the review is positive; when it is 2 or less, it is negative; otherwise it is neutral.
- For the missing values of the text, we are going to fill them with the value 'no text'
- For the missing values of stars, we are going to fill them with the value of 0

In [5]:
# count the reviews that have no star rating
reviews['review_stars'].isna().sum()

34

In [6]:
# fill missing values
# (assign the result back instead of calling fillna(..., inplace=True) on a
# column selection: that form operates on an intermediate object and is not
# guaranteed to update `reviews` in recent pandas versions)
reviews['review_stars'] = reviews['review_stars'].fillna(0)
reviews['review_text'] = reviews['review_text'].fillna('no text')

In [7]:
# define a function for performing the model on the reviews texts
def analyse_text(text):
  """Return the number of stars the sentiment model assigns to a review text.

  Parameters
  ----------
  text : str
    The review comment. The placeholder 'no text', an empty or
    whitespace-only string, and any non-string value (e.g. None or NaN)
    are all treated as "no comment".

  Returns
  -------
  int
    0 when there is no comment; otherwise the leading digit of the label
    returned by `sentiment_pipeline` (labels are expected to look like
    '4 stars' -- TODO confirm against the model card).
  """
  # the case of a non-existent comment
  if not isinstance(text, str) or not text.strip() or text == 'no text':
    return 0

  # the model's limit is expressed in tokens, not characters, so let the
  # tokenizer truncate the input instead of slicing the string
  result = sentiment_pipeline(text, truncation=True, max_length=512)
  return int(result[0]['label'][0])

In [8]:
# testing the model
analyse_text("j'adore ce produit")

5

In [9]:
# create stars_from_comment column
# (analyse_text is called once for every review text)
reviews['stars_from_comment'] = reviews['review_text'].apply(analyse_text)

In [10]:
reviews.head()

Unnamed: 0,review_id,bank_id,review_date,review_stars,review_text,reviewer_name,reviewer_num_reviews,stars_from_comment
0,0,608978182545380096,2023-02-27,5.0,Une très belle équipe .\nJe recommande spécial...,anonymous,12.0,5
1,1,608978182545380096,2022-09-28,5.0,no text,anonymous,0.0,0
2,2,608978182545380096,2022-05-27,2.0,no text,anonymous,49.0,0
3,3,608978182545380096,2022-04-08,1.0,no text,anonymous,2.0,0
4,4,608978182545380096,2022-01-08,1.0,Mauvais service client,anonymous,2.0,1


In [11]:
def get_review_score(row):
  """Combine the user's stars and the model's stars into one score.

  `row` must expose 'review_stars' and 'stars_from_comment'; a value of 0
  in either field means that source is missing. When both are present the
  score is their mean, otherwise it is whichever one is available.
  """
  user_stars, model_stars = row['review_stars'], row['stars_from_comment']

  # both sources available: average them
  if user_stars != 0 and model_stars != 0:
    return (user_stars + model_stars) / 2

  # at least one source is missing: fall back to the other one
  return model_stars if user_stars == 0 else user_stars

In [12]:
# create review_score column
reviews['review_score'] = reviews.apply(get_review_score, axis=1)

In [13]:
reviews.head()

Unnamed: 0,review_id,bank_id,review_date,review_stars,review_text,reviewer_name,reviewer_num_reviews,stars_from_comment,review_score
0,0,608978182545380096,2023-02-27,5.0,Une très belle équipe .\nJe recommande spécial...,anonymous,12.0,5,5.0
1,1,608978182545380096,2022-09-28,5.0,no text,anonymous,0.0,0,5.0
2,2,608978182545380096,2022-05-27,2.0,no text,anonymous,49.0,0,2.0
3,3,608978182545380096,2022-04-08,1.0,no text,anonymous,2.0,0,1.0
4,4,608978182545380096,2022-01-08,1.0,Mauvais service client,anonymous,2.0,1,1.0


In [14]:
def get_sentiment(score):
  """Map a star score to a sentiment label.

  Scores of 4 or more are 'POSITIVE', scores of 2 or less are 'NEGATIVE',
  and everything else is 'NEUTRAL'.
  """
  if score >= 4:
    return 'POSITIVE'
  if score <= 2:
    return 'NEGATIVE'
  return 'NEUTRAL'

In [15]:
# create review_sentiment column
reviews['review_sentiment'] = reviews['review_score'].apply(get_sentiment)

In [16]:
reviews.head()

Unnamed: 0,review_id,bank_id,review_date,review_stars,review_text,reviewer_name,reviewer_num_reviews,stars_from_comment,review_score,review_sentiment
0,0,608978182545380096,2023-02-27,5.0,Une très belle équipe .\nJe recommande spécial...,anonymous,12.0,5,5.0,POSITIVE
1,1,608978182545380096,2022-09-28,5.0,no text,anonymous,0.0,0,5.0,POSITIVE
2,2,608978182545380096,2022-05-27,2.0,no text,anonymous,49.0,0,2.0,NEGATIVE
3,3,608978182545380096,2022-04-08,1.0,no text,anonymous,2.0,0,1.0,NEGATIVE
4,4,608978182545380096,2022-01-08,1.0,Mauvais service client,anonymous,2.0,1,1.0,NEGATIVE


What is left for us to do is to delete the temporary columns we created, and then save the new dataframe as an Excel file.

In [17]:
# drop temporary columns
# (rebinding instead of inplace=True, together with errors='ignore', keeps
# this cell safe to re-run once the columns are already gone)
reviews = reviews.drop(columns=['stars_from_comment', 'review_score'], errors='ignore')

In [18]:
# save the new dataframe to an Excel file, without writing the index
reviews.to_excel("../data/processed/reviews_cleaned.xlsx", index=False)