In [6]:
from transformers import pipeline
import pandas as pd

In [None]:
# Build the Hugging Face sentiment-analysis pipeline backed by the nlptown
# multilingual BERT checkpoint. As the sample output further down shows, it
# returns a star-rating label (e.g. '1 star', '5 stars') plus a confidence score.
classifier = pipeline(
    task='sentiment-analysis',
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)

In [16]:
# Quick sanity check on one clearly negative and one clearly positive comment.
sample_comments = [
    'Aucun effort',
    "Très bon ensemble",
]
classifier(sample_comments)

[{'label': '1 star', 'score': 0.7614656686782837},
 {'label': '5 stars', 'score': 0.6455514430999756}]

In [14]:
# Load the cleaned dataset; the 'commentaire' column holds the text to classify.
df = pd.read_csv(filepath_or_buffer='../../data_clean.csv')

## Sentiment analysis

In [48]:
# Run the classifier over every comment.
# Result: one dict per comment, e.g. {'label': '5 stars', 'score': 0.7564778923988342}
# WARNING: long-running cell (about 45 minutes according to the original author).
comments = df['commentaire'].to_list()
labels = classifier(comments)

In [49]:
# Turn the list of prediction dicts into a DataFrame (one row per comment,
# one column per dict key) and display its shape as a quick check.
df_labels = pd.DataFrame.from_records(data=labels)
df_labels.shape

(7493, 2)

In [50]:
# Preview the first rows of the raw predictions (label text and confidence score).
df_labels.head()

Unnamed: 0,label,score
0,5 stars,0.756478
1,4 stars,0.489843
2,3 stars,0.775267
3,4 stars,0.485216
4,4 stars,0.47274


### Clean the generated dataframe

In [51]:
# Only keep the number of stars in the label: drop everything from the first
# whitespace onwards (e.g. '5 stars' -> '5').
# The pattern is a raw string so that `\s` reaches the regex engine as-is;
# in a plain string literal `\s` is an invalid escape sequence and triggers a
# DeprecationWarning / SyntaxWarning on recent Python versions.
df_labels['label'] = df_labels['label'].str.replace(r"\s.*", "", regex=True)
df_labels.head()

Unnamed: 0,label,score
0,5,0.756478
1,4,0.489843
2,3,0.775267
3,4,0.485216
4,4,0.47274


In [52]:
# Convert the label column from text to a numeric dtype, then display the
# resulting column dtypes to confirm the conversion.
numeric_label = pd.to_numeric(df_labels['label'])
df_labels['label'] = numeric_label
df_labels.dtypes

label      int64
score    float64
dtype: object

### Save labels

In [55]:
# Persist the cleaned labels on their own (without the index column).
df_labels.to_csv(path_or_buf='./data_labels.csv', index=False)

In [60]:
# Put the predictions next to the original rows. The concatenation is
# column-wise and aligns the two frames on their index.
df_complete = pd.concat(objs=[df, df_labels], axis='columns')
df_complete.head()

Unnamed: 0,eleve,classe,min,max,commentaire,label,score
0,17.8,14.28,8.3,17.8,"Un excellent début d'année, poursuivez ainsi !",5,0.756478
1,15.0,14.51,11.0,19.5,Bon ensemble.,4,0.489843
2,10.6,10.87,5.47,16.7,Résultats corrects mais le comportement en cla...,3,0.775267
3,15.5,14.24,6.0,19.0,Bon trimestre. Continuez ainsi !,4,0.485216
4,14.75,12.67,6.0,18.25,Un bon trimestre.,4,0.47274


In [62]:
# Persist the original data together with its labels (without the index column).
df_complete.to_csv(path_or_buf='./data_clean_with_labels.csv', index=False)

### Analyse labels

In [65]:
# Summary statistics of the numeric columns, including the predicted label and
# its confidence score.
# NOTE(review): the original author judged the mean score of about 0.55 to be
# acceptable; the criterion for that judgement is not stated here.
df_complete.describe()

Unnamed: 0,classe,min,max,label,score
count,7493.0,7493.0,7493.0,7493.0,7493.0
mean,12.129732,5.467023,17.829442,3.571867,0.555552
std,2.093264,3.572483,1.703716,1.020384,0.108942
min,6.07,0.0,11.0,1.0,0.228054
25%,10.83,3.0,16.83,3.0,0.484488
50%,12.09,5.25,18.0,4.0,0.534053
75%,13.32,7.69,19.14,4.0,0.610181
max,20.0,20.0,20.0,5.0,0.944432


In [70]:
# Confidence score below which the 5% least confident labels fall
# (i.e. the 5th percentile of the score column).
df_complete['score'].quantile(q=0.05)

0.4131753027439118

In [79]:
# Remove the 5% worst labels, i.e. the rows with the lowest confidence scores.
# The cutoff is computed from the untouched `df_labels` scores rather than from
# `df_complete` itself: `df_complete` is overwritten below, so deriving the
# quantile from it would trim a further 5% every time this cell is re-run.
# On the first run both frames hold the same scores, so the result is unchanged.
score_cutoff = df_labels['score'].quantile(0.05)
mask = df_complete['score'] > score_cutoff
df_complete = df_complete[mask]

In [91]:
# Persist the filtered dataset (without the index column).
df_complete.to_csv(path_or_buf='./data_with_clean_labels.csv', index=False)