In [4]:
import os
from dotenv import load_dotenv
from transformers import pipeline
import pandas as pd

In [2]:
# Load environment variables via python-dotenv (by default from a .env file); YEAR is read from the environment below
load_dotenv()

True

In [8]:
# Reporting year, taken from the YEAR environment variable.
# Fail with an explicit message when YEAR is missing: int(None) would otherwise
# raise an opaque TypeError.
year_raw = os.getenv("YEAR")
if year_raw is None:
    raise RuntimeError("Environment variable YEAR is not set (expected e.g. YEAR=2020 in .env)")
year = int(year_raw)
year

2020

In [9]:
# Build the sentiment pipeline backed by a multilingual BERT model that rates text from 1 to 5 stars
classifier = pipeline(
    task='sentiment-analysis',
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)

In [10]:
# Smoke test on two short French remarks before running the full dataset
sample_comments = ['Aucun effort', "Très bon ensemble"]
classifier(sample_comments)

[{'label': '1 star', 'score': 0.7614656686782837},
 {'label': '5 stars', 'score': 0.6455514430999756}]

In [11]:
# Load the cleaned data for the configured year
clean_csv = f'./out/{year}/data_clean.csv'
df = pd.read_csv(clean_csv)

In [12]:
# Sanity check: (rows, columns) of the loaded data
df.shape

(7114, 5)

## Sentiment analysis

In [13]:
# Run the classifier on every comment; each result is a dict such as
# {'label': '5 stars', 'score': 0.7564778923988342}.
# Slow step: about 45 minutes on the full dataset.
comments = df['commentaire'].tolist()
labels = classifier(comments)

In [14]:
# Turn the list of result dicts into a two-column table (label, score)
df_labels = pd.DataFrame.from_records(data=labels)
df_labels.shape

(7114, 2)

In [15]:
# Preview the first rows of the label/score table
df_labels.head()

Unnamed: 0,label,score
0,5 stars,0.756478
1,4 stars,0.489843
2,3 stars,0.775267
3,4 stars,0.485216
4,4 stars,0.47274


### Clean the generated dataframe

In [16]:
# Only keep the number of stars in the label ("5 stars" -> "5").
# Raw string: "\s" in a normal string literal is an invalid escape sequence
# (DeprecationWarning, SyntaxWarning from Python 3.12); the pattern itself is unchanged.
df_labels['label'] = df_labels['label'].str.replace(r"\s.*", "", regex=True)
df_labels.head()

Unnamed: 0,label,score
0,5,0.756478
1,4,0.489843
2,3,0.775267
3,4,0.485216
4,4,0.47274


In [17]:
# Convert the star count from text to a numeric dtype, then show the resulting column types
stars = pd.to_numeric(df_labels['label'])
df_labels['label'] = stars
df_labels.dtypes

label      int64
score    float64
dtype: object

### Attach labels to the data

In [18]:
# Put the original columns and the predicted label/score side by side (aligned on the row index)
frames = [df, df_labels]
df_complete = pd.concat(frames, axis="columns")
df_complete.head()

Unnamed: 0,eleve,classe,min,max,commentaire,label,score
0,17.8,14.28,8.3,17.8,"Un excellent début d'année, poursuivez ainsi !",5,0.756478
1,15.0,14.51,11.0,19.5,Bon ensemble.,4,0.489843
2,10.6,10.87,5.47,16.7,Résultats corrects mais le comportement en cla...,3,0.775267
3,15.5,14.24,6.0,19.0,Bon trimestre. Continuez ainsi !,4,0.485216
4,14.75,12.67,6.0,18.25,Un bon trimestre.,4,0.47274


### Clean labels

In [19]:
# Summary statistics of the numeric columns, including the predicted label and its score
df_complete.describe()
# Mean score of about 0.56 looks acceptable

Unnamed: 0,classe,min,max,label,score
count,7114.0,7114.0,7114.0,7114.0,7114.0
mean,12.19769,5.590322,17.806354,3.590104,0.560866
std,2.088382,3.550014,1.709161,1.006847,0.110026
min,6.07,0.0,11.0,1.0,0.228054
25%,10.91,3.08,16.8,3.0,0.488777
50%,12.13,5.41,18.0,4.0,0.538139
75%,13.34,7.82,19.08,4.0,0.618923
max,20.0,20.0,20.0,5.0,0.95933


In [20]:
# 5th percentile of the score: the highest score among the 5% lowest-scoring labels
df_complete['score'].quantile(0.05)

0.4215182900428772

In [21]:
# Remove the 5% of labels with the lowest score.
# The cut-off comes from df_labels (the complete, never-filtered score column)
# rather than from df_complete itself: recomputing the quantile on df_complete
# made every re-run of this cell silently drop a further 5% of the rows.
# With a fixed cut-off the first run is unchanged and later runs are no-ops.
# NOTE(review): assumes df_labels['score'] still holds the same scores as the
# unfiltered df_complete, which is the case after the concat above.
score_cutoff = df_labels['score'].quantile(0.05)
mask = df_complete['score'] > score_cutoff
df_complete = df_complete[mask]

In [22]:
# Write the labelled, filtered data to the year's output folder (row index not written)
labelled_csv = f'./out/{year}/data_clean_bert_labels.csv'
df_complete.to_csv(labelled_csv, index=False)

In [23]:
# Final (rows, columns) after dropping the low-confidence labels
df_complete.shape

(6758, 7)