In [90]:
import pandas as pd
import numpy as np

from numpy.lib.scimath import sqrt

from scipy import pi
from scipy.stats import skewnorm
from scipy.optimize import fsolve

In [3]:
# Read the source CSV into `df`; the path is relative to the notebook's working directory.
DATA_PATH = '../bert/data_with_clean_labels.csv'
df = pd.read_csv(DATA_PATH)

In [64]:
# Remove rows whose student grade is not numeric (e.g. "Disp", "Abs").
# `na=True` makes a missing grade count as non-numeric. Without it, `str.contains`
# returns NaN for missing values and negating the resulting object Series with `~`
# raises a TypeError.
non_numeric = df['eleve'].str.contains("[a-zA-Z]", regex=True, na=True)
df = df[~non_numeric]

In [70]:
# Convert the grade column to float, then print a summary of the frame
df = df.assign(eleve=df['eleve'].astype('float'))
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7055 entries, 0 to 7117
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   eleve        7055 non-null   float64
 1   classe       7055 non-null   float64
 2   min          7055 non-null   float64
 3   max          7055 non-null   float64
 4   commentaire  7055 non-null   object 
 5   label        7055 non-null   int64  
 6   score        7055 non-null   float64
dtypes: float64(5), int64(1), object(1)
memory usage: 699.0+ KB


In [80]:
# Create skew column: offset of `classe` from the mid-range (max + min) / 2,
# divided by an assumed standard deviation (2, chosen from experience).
STD = 2
midrange = (df['max'] + df['min']) / 2
df['skew'] = (df['classe'] - midrange) / STD
df

Unnamed: 0,eleve,classe,min,max,commentaire,label,score,skew
0,17.80,14.28,8.30,17.80,"Un excellent début d'année, poursuivez ainsi !",5,0.756478,0.6150
1,15.00,14.51,11.00,19.50,Bon ensemble.,4,0.489843,-0.3700
2,10.60,10.87,5.47,16.70,Résultats corrects mais le comportement en cla...,3,0.775267,-0.1075
3,15.50,14.24,6.00,19.00,Bon trimestre. Continuez ainsi !,4,0.485216,0.8700
4,14.75,12.67,6.00,18.25,Un bon trimestre.,4,0.472740,0.2725
...,...,...,...,...,...,...,...,...
7113,6.00,9.06,0.00,19.50,"L'ensemble, en recul, est fragile. Beaucoup de...",2,0.438044,-0.3450
7114,11.50,9.06,0.00,19.50,L'ensemble demeure honorable. Toujours beaucou...,5,0.522465,-0.3450
7115,6.00,9.06,0.00,19.50,L'ensemble demeure fragile malgré le sérieux.,2,0.501147,-0.3450
7116,17.00,9.06,0.00,19.50,L'ensemble demeure solide.,4,0.487701,-0.3450


In [81]:
# Create delta column
def make_func(sk):
  """Build the residual function whose root is the delta matching skewness `sk`.

  The returned callable takes `delta` and evaluates
  sk - (4 - pi)/2 * (delta * sqrt(2/pi))**3 / (1 - 2*delta**2/pi)**(3/2).
  """
  def residual(delta):
    coefficient = (4 - pi) / 2
    cubed_term = (delta * sqrt(2 / pi)) ** 3
    spread_term = (1 - 2 * delta**2 / pi) ** (3 / 2)
    # Same left-to-right evaluation as the one-line formula: (coef * cube) / spread
    return sk - coefficient * cubed_term / spread_term
  return residual

# Invert the skewness equation
#   skew = (4 - pi)/2 * (delta * sqrt(2/pi))**3 / (1 - 2*delta**2/pi)**(3/2)
# in closed form rather than running one numerical root search per row:
#   |delta| = sqrt(pi/2 * |skew|**(2/3) / (|skew|**(2/3) + ((4 - pi)/2)**(2/3)))
# with delta taking the sign of skew. This is exact, vectorised, and cannot
# stall or return a non-converged iterate the way a per-row solver started
# from a fixed guess can.
skew_pow = df['skew'].abs() ** (2 / 3)
coef_pow = ((4 - np.pi) / 2) ** (2 / 3)
df['delta'] = np.sign(df['skew']) * np.sqrt(np.pi / 2 * skew_pow / (skew_pow + coef_pow))
df



Unnamed: 0,eleve,classe,min,max,commentaire,label,score,skew,delta
0,17.80,14.28,8.30,17.80,"Un excellent début d'année, poursuivez ainsi !",5,0.756478,0.6150,0.937612
1,15.00,14.51,11.00,19.50,Bon ensemble.,4,0.489843,-0.3700,-0.864044
2,10.60,10.87,5.47,16.70,Résultats corrects mais le comportement en cla...,3,0.775267,-0.1075,-0.668329
3,15.50,14.24,6.00,19.00,Bon trimestre. Continuez ainsi !,4,0.485216,0.8700,0.983377
4,14.75,12.67,6.00,18.25,Un bon trimestre.,4,0.472740,0.2725,0.816924
...,...,...,...,...,...,...,...,...,...
7113,6.00,9.06,0.00,19.50,"L'ensemble, en recul, est fragile. Beaucoup de...",2,0.438044,-0.3450,-0.853420
7114,11.50,9.06,0.00,19.50,L'ensemble demeure honorable. Toujours beaucou...,5,0.522465,-0.3450,-0.853420
7115,6.00,9.06,0.00,19.50,L'ensemble demeure fragile malgré le sérieux.,2,0.501147,-0.3450,-0.853420
7116,17.00,9.06,0.00,19.50,L'ensemble demeure solide.,4,0.487701,-0.3450,-0.853420


In [82]:
# Create scale column
# STD (2, chosen from experience) is divided by sqrt(1 - 2 * delta**2 / pi)
def _scale_from_delta(delta):
  return STD / sqrt(1 - 2 * delta**2 / pi)

df['scale'] = df['delta'].apply(_scale_from_delta)
df

Unnamed: 0,eleve,classe,min,max,commentaire,label,score,skew,delta,scale
0,17.80,14.28,8.30,17.80,"Un excellent début d'année, poursuivez ainsi !",5,0.756478,0.6150,0.937612,3.013959
1,15.00,14.51,11.00,19.50,Bon ensemble.,4,0.489843,-0.3700,-0.864044,2.761004
2,10.60,10.87,5.47,16.70,Résultats corrects mais le comportement en cla...,3,0.775267,-0.1075,-0.668329,2.364184
3,15.50,14.24,6.00,19.00,Bon trimestre. Continuez ainsi !,4,0.485216,0.8700,0.983377,3.225933
4,14.75,12.67,6.00,18.25,Un bon trimestre.,4,0.472740,0.2725,0.816924,2.637196
...,...,...,...,...,...,...,...,...,...,...
7113,6.00,9.06,0.00,19.50,"L'ensemble, en recul, est fragile. Beaucoup de...",2,0.438044,-0.3450,-0.853420,2.730942
7114,11.50,9.06,0.00,19.50,L'ensemble demeure honorable. Toujours beaucou...,5,0.522465,-0.3450,-0.853420,2.730942
7115,6.00,9.06,0.00,19.50,L'ensemble demeure fragile malgré le sérieux.,2,0.501147,-0.3450,-0.853420,2.730942
7116,17.00,9.06,0.00,19.50,L'ensemble demeure solide.,4,0.487701,-0.3450,-0.853420,2.730942


In [94]:
# Create location column: `classe` minus the offset scale * delta * sqrt(2 / pi)
mean_offset = df['scale'] * df['delta'] * sqrt(2 / pi)
df['location'] = df['classe'] - mean_offset
df

Unnamed: 0,eleve,classe,min,max,commentaire,label,score,skew,delta,scale,location,shape,stars
0,17.80,14.28,8.30,17.80,"Un excellent début d'année, poursuivez ainsi !",5,0.756478,0.6150,0.937612,3.013959,12.025239,2.696742+0.000000j,5.0
1,15.00,14.51,11.00,19.50,Bon ensemble.,4,0.489843,-0.3700,-0.864044,2.761004,16.413456,-1.716358+0.000000j,3.0
2,10.60,10.87,5.47,16.70,Résultats corrects mais le comportement en cla...,3,0.775267,-0.1075,-0.668329,2.364184,12.130700,-0.898454+0.000000j,2.0
3,15.50,14.24,6.00,19.00,Bon trimestre. Continuez ainsi !,4,0.485216,0.8700,0.983377,3.225933,11.708865,5.415736+0.000000j,4.0
4,14.75,12.67,6.00,18.25,Un bon trimestre.,4,0.472740,0.2725,0.816924,2.637196,10.951046,1.416440+0.000000j,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7113,6.00,9.06,0.00,19.50,"L'ensemble, en recul, est fragile. Beaucoup de...",2,0.438044,-0.3450,-0.853420,2.730942,10.919582,-1.637337+0.000000j,0.0
7114,11.50,9.06,0.00,19.50,L'ensemble demeure honorable. Toujours beaucou...,5,0.522465,-0.3450,-0.853420,2.730942,10.919582,-1.637337+0.000000j,4.0
7115,6.00,9.06,0.00,19.50,L'ensemble demeure fragile malgré le sérieux.,2,0.501147,-0.3450,-0.853420,2.730942,10.919582,-1.637337+0.000000j,0.0
7116,17.00,9.06,0.00,19.50,L'ensemble demeure solide.,4,0.487701,-0.3450,-0.853420,2.730942,10.919582,-1.637337+0.000000j,5.0


In [182]:
# Create shape column
def find_shape(d):
  """Return the shape value d / sqrt(|1 - d**2|), which carries the sign of `d`.

  For |d| < 1 this is d / sqrt(1 - d**2). Delta is sometimes larger than 1 in
  absolute value; the radicand is then negative, so its magnitude is used,
  which keeps the result real and with the same sign as `d`.

  Edge cases:
    * |d| == 1 returns +inf or -inf (signed like `d`), the limit of the formula.
    * NaN input returns NaN instead of being turned into a zero shape.
  """
  gap = 1 - d**2
  if gap == 0:
    # Explicit signed infinity avoids a division-by-zero warning.
    return np.copysign(np.inf, d)
  return d / np.sqrt(np.abs(gap))
def _shape_for_row(row):
  # Row-wise adapter: only the `delta` value of the row is used
  return find_shape(row['delta'])

df['shape'] = df.apply(_shape_for_row, axis=1)
df

Unnamed: 0,eleve,classe,min,max,commentaire,label,score,skew,delta,scale,location,shape,stars
0,17.80,14.28,8.30,17.80,"Un excellent début d'année, poursuivez ainsi !",5,0.756478,0.6150,0.937612,3.013959,12.025239,2.696742,5
1,15.00,14.51,11.00,19.50,Bon ensemble.,4,0.489843,-0.3700,-0.864044,2.761004,16.413456,-1.716358,3
2,10.60,10.87,5.47,16.70,Résultats corrects mais le comportement en cla...,3,0.775267,-0.1075,-0.668329,2.364184,12.130700,-0.898454,3
3,15.50,14.24,6.00,19.00,Bon trimestre. Continuez ainsi !,4,0.485216,0.8700,0.983377,3.225933,11.708865,5.415736,4
4,14.75,12.67,6.00,18.25,Un bon trimestre.,4,0.472740,0.2725,0.816924,2.637196,10.951046,1.416440,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7113,6.00,9.06,0.00,19.50,"L'ensemble, en recul, est fragile. Beaucoup de...",2,0.438044,-0.3450,-0.853420,2.730942,10.919582,-1.637337,1
7114,11.50,9.06,0.00,19.50,L'ensemble demeure honorable. Toujours beaucou...,5,0.522465,-0.3450,-0.853420,2.730942,10.919582,-1.637337,5
7115,6.00,9.06,0.00,19.50,L'ensemble demeure fragile malgré le sérieux.,2,0.501147,-0.3450,-0.853420,2.730942,10.919582,-1.637337,1
7116,17.00,9.06,0.00,19.50,L'ensemble demeure solide.,4,0.487701,-0.3450,-0.853420,2.730942,10.919582,-1.637337,5


In [184]:
# Create stars column
def count_stars(student, a, e, w):
    """Map a grade to a star count from 1 to 5.

    The skew-normal CDF with shape `a`, location `e` and scale `w` is evaluated
    at `student`, multiplied by 4, rounded to the nearest integer and shifted by 1.
    """
    percentile = skewnorm.cdf(student, a, loc=e, scale=w)
    return 1 + int(np.rint(percentile * 4))

def _stars_for_row(row):
    # Row-wise adapter: grade, shape, location and scale feed count_stars in that order
    return count_stars(row['eleve'], row['shape'], row['location'], row['scale'])

df['stars'] = df.apply(_stars_for_row, axis=1)
df

Unnamed: 0,eleve,classe,min,max,commentaire,label,score,skew,delta,scale,location,shape,stars
0,17.80,14.28,8.30,17.80,"Un excellent début d'année, poursuivez ainsi !",5,0.756478,0.6150,0.937612,3.013959,12.025239,2.696742,5
1,15.00,14.51,11.00,19.50,Bon ensemble.,4,0.489843,-0.3700,-0.864044,2.761004,16.413456,-1.716358,3
2,10.60,10.87,5.47,16.70,Résultats corrects mais le comportement en cla...,3,0.775267,-0.1075,-0.668329,2.364184,12.130700,-0.898454,3
3,15.50,14.24,6.00,19.00,Bon trimestre. Continuez ainsi !,4,0.485216,0.8700,0.983377,3.225933,11.708865,5.415736,4
4,14.75,12.67,6.00,18.25,Un bon trimestre.,4,0.472740,0.2725,0.816924,2.637196,10.951046,1.416440,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7113,6.00,9.06,0.00,19.50,"L'ensemble, en recul, est fragile. Beaucoup de...",2,0.438044,-0.3450,-0.853420,2.730942,10.919582,-1.637337,1
7114,11.50,9.06,0.00,19.50,L'ensemble demeure honorable. Toujours beaucou...,5,0.522465,-0.3450,-0.853420,2.730942,10.919582,-1.637337,5
7115,6.00,9.06,0.00,19.50,L'ensemble demeure fragile malgré le sérieux.,2,0.501147,-0.3450,-0.853420,2.730942,10.919582,-1.637337,1
7116,17.00,9.06,0.00,19.50,L'ensemble demeure solide.,4,0.487701,-0.3450,-0.853420,2.730942,10.919582,-1.637337,5


## Conclusion

In [229]:
# Absolute gap, row by row, between the `label` and `stars` columns
diff = (df['label'] - df['stars']).abs()
diff

0       0
1       1
2       0
3       0
4       0
       ..
7113    1
7114    0
7115    1
7116    1
7117    0
Length: 7012, dtype: int64

In [233]:
# Remove comments when all grades are close (max - min <= 4) and abs(stars - label) >= 2
# Remove comments when there is too much disagreement: abs(stars - label) == 4
#
# The gap is recomputed here from the current `df` instead of reusing a `diff`
# built in another cell. A stale `diff` can have a different index than `df`,
# which makes pandas reindex the boolean mask (with a warning) and filter on
# out-of-date values.
star_gap = (df['label'] - df['stars']).abs()
mouchoir = df['max'] - df['min'] <= 4  # "mouchoir": grades bunched closely together
disagree = star_gap >= 2
mask = (mouchoir & disagree) | (star_gap == 4)
df = df[~mask]
df.shape

  df = df[~mask]


(6964, 13)

In [234]:
# Rows where `label` and `stars` agree (True) versus differ (False);
# they agree about 37% of the time
agreement = df['label'].eq(df['stars'])
agreement.value_counts()

False    4375
True     2589
dtype: int64

In [235]:
# Percentage of rows where `label` equals `stars`.
# value_counts() orders its entries by frequency, not by key, so the counts
# are looked up by key: positional unpacking would swap them whenever matches
# are the majority, and would fail if only one of True/False is present.
match_counts = (df['label'] == df['stars']).value_counts()
f, t = match_counts.get(False, 0), match_counts.get(True, 0)
t / (f + t) * 100

37.176909821941415

In [236]:
# Correlation between the two star ratings (it comes out positive)
labels, stars = df['label'], df['stars']
labels.corr(stars)

0.6439313032422188

In [238]:
# Recompute the absolute gap on the filtered frame and summarise its distribution
diff = (df['label'] - df['stars']).abs()
diff.describe()

count    6964.000000
mean        0.827829
std         0.773555
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         3.000000
dtype: float64

In [239]:
# About 82% of the ratings are at most 1 star apart; the mean gap is 0.83 stars
tuple(diff.quantile(q) for q in (0.82, 0.83))

(1.0, 2.0)

In [241]:
# Write the filtered frame (index included) to a CSV next to the notebook
OUTPUT_PATH = './bert_and_skew_stars_clean.csv'
df.to_csv(OUTPUT_PATH)