In [1]:
import os
from dotenv import load_dotenv

import pandas as pd
import numpy as np

from numpy.lib.scimath import sqrt

from scipy import pi
from scipy.stats import skewnorm
from scipy.optimize import fsolve

In [2]:
# Load environment variables from a .env file (python-dotenv);
# YEAR is read from the environment in the next cell
load_dotenv()

True

In [3]:
# Year of the data set to process, taken from the YEAR environment variable
year_env = os.getenv("YEAR")
if year_env is None:
    # int(None) would raise an opaque TypeError; name the missing variable instead
    raise RuntimeError("Environment variable YEAR is not set (expected in .env)")
year = int(year_env)
year

2020

In [4]:
# Load the labelled comments produced for the selected year
input_csv = f'./out/{year}/data_clean_bert_labels.csv'
df = pd.read_csv(input_csv)

In [5]:
# Drop rows whose 'eleve' value contains a letter (Disp, Abs, etc.),
# i.e. rows where the student grade is not numeric
has_letters = df['eleve'].str.contains("[a-zA-Z]")
df = df[~has_letters]

In [6]:
# Convert the 'eleve' column to float, then show the resulting dtypes
eleve_dtype = {'eleve': 'float'}
df = df.astype(eleve_dtype)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6696 entries, 0 to 6757
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   eleve        6696 non-null   float64
 1   classe       6696 non-null   float64
 2   min          6696 non-null   float64
 3   max          6696 non-null   float64
 4   commentaire  6696 non-null   object 
 5   label        6696 non-null   int64  
 6   score        6696 non-null   float64
dtypes: float64(5), int64(1), object(1)
memory usage: 418.5+ KB


In [7]:
# Create skew column: offset of 'classe' from the midpoint of 'min' and 'max',
# expressed in units of STD (a std of 2 is used by experience)
STD = 2
midrange = (df['max'] + df['min'])/2
df['skew'] = (df['classe'] - midrange) / STD
df

Unnamed: 0,eleve,classe,min,max,commentaire,label,score,skew
0,17.80,14.28,8.30,17.80,"Un excellent début d'année, poursuivez ainsi !",5,0.756478,0.6150
1,15.00,14.51,11.00,19.50,Bon ensemble.,4,0.489843,-0.3700
2,10.60,10.87,5.47,16.70,Résultats corrects mais le comportement en cla...,3,0.775267,-0.1075
3,15.50,14.24,6.00,19.00,Bon trimestre. Continuez ainsi !,4,0.485216,0.8700
4,14.75,12.67,6.00,18.25,Un bon trimestre.,4,0.472740,0.2725
...,...,...,...,...,...,...,...,...
6753,14.16,13.29,8.79,19.05,Un bon trimestre et une bonne implication en c...,4,0.593034,-0.3150
6754,10.94,12.35,5.56,19.42,Ensemble juste convenable. Il faut approfondir...,4,0.512499,-0.0700
6755,10.68,11.34,8.50,18.55,Un trimestre correcte.,3,0.475545,-1.0925
6756,15.38,13.78,9.75,18.75,Bonne implication dans le travail mené en clas...,4,0.565291,-0.2350


In [8]:
# Create delta column
def make_func(sk):
  """Build the residual function whose root is the delta matching skewness `sk`.

  The returned callable maps delta to
  sk - (4 - pi)/2 * (delta * sqrt(2/pi))**3 / (1 - 2 * delta**2 / pi)**(3/2),
  which is zero when delta reproduces the target skewness.
  """
  coeff = (4 - pi)/2

  def residual(delta):
    numer = (delta * sqrt(2/pi))**3
    denom = (1 - 2 * delta**2 / pi)**(3/2)
    return sk - coeff * numer / denom

  return residual

# The skewness equation solved by make_func has an exact inverse, so no
# per-row fsolve call (started from a fixed guess of 1) is needed:
#   u = |2 * skew / (4 - pi)| ** (2/3)
#   delta = sign(skew) * sqrt(pi/2 * u / (1 + u))
# This is the same root, computed for the whole column at once; it does not
# depend on a start guess and always satisfies delta**2 < pi/2.
skew_ratio = np.abs(2 * df['skew'] / (4 - pi)) ** (2/3)
df['delta'] = np.sign(df['skew']) * np.sqrt(pi / 2 * skew_ratio / (1 + skew_ratio))
df



Unnamed: 0,eleve,classe,min,max,commentaire,label,score,skew,delta
0,17.80,14.28,8.30,17.80,"Un excellent début d'année, poursuivez ainsi !",5,0.756478,0.6150,0.937612
1,15.00,14.51,11.00,19.50,Bon ensemble.,4,0.489843,-0.3700,-0.864044
2,10.60,10.87,5.47,16.70,Résultats corrects mais le comportement en cla...,3,0.775267,-0.1075,-0.668329
3,15.50,14.24,6.00,19.00,Bon trimestre. Continuez ainsi !,4,0.485216,0.8700,0.983377
4,14.75,12.67,6.00,18.25,Un bon trimestre.,4,0.472740,0.2725,0.816924
...,...,...,...,...,...,...,...,...,...
6753,14.16,13.29,8.79,19.05,Un bon trimestre et une bonne implication en c...,4,0.593034,-0.3150,-0.839461
6754,10.94,12.35,5.56,19.42,Ensemble juste convenable. Il faut approfondir...,4,0.512499,-0.0700,-0.600919
6755,10.68,11.34,8.50,18.55,Un trimestre correcte.,3,0.475545,-1.0925,-1.011129
6756,15.38,13.78,9.75,18.75,Bonne implication dans le travail mené en clas...,4,0.565291,-0.2350,-0.793593


In [9]:
# Create scale column
# We choose a std of 2 by experience (STD); each row's scale is STD
# divided by sqrt(1 - 2 * delta**2 / pi)
def scale_for(delta):
  return STD / sqrt(1 - 2 * delta**2 / pi)

df['scale'] = df['delta'].map(scale_for)
df

Unnamed: 0,eleve,classe,min,max,commentaire,label,score,skew,delta,scale
0,17.80,14.28,8.30,17.80,"Un excellent début d'année, poursuivez ainsi !",5,0.756478,0.6150,0.937612,3.013959
1,15.00,14.51,11.00,19.50,Bon ensemble.,4,0.489843,-0.3700,-0.864044,2.761004
2,10.60,10.87,5.47,16.70,Résultats corrects mais le comportement en cla...,3,0.775267,-0.1075,-0.668329,2.364184
3,15.50,14.24,6.00,19.00,Bon trimestre. Continuez ainsi !,4,0.485216,0.8700,0.983377,3.225933
4,14.75,12.67,6.00,18.25,Un bon trimestre.,4,0.472740,0.2725,0.816924,2.637196
...,...,...,...,...,...,...,...,...,...,...
6753,14.16,13.29,8.79,19.05,Un bon trimestre et une bonne implication en c...,4,0.593034,-0.3150,-0.839461,2.693428
6754,10.94,12.35,5.56,19.42,Ensemble juste convenable. Il faut approfondir...,4,0.512499,-0.0700,-0.600919,2.279042
6755,10.68,11.34,8.50,18.55,Un trimestre correcte.,3,0.475545,-1.0925,-1.011129,3.384822
6756,15.38,13.78,9.75,18.75,Bonne implication dans le travail mené en clas...,4,0.565291,-0.2350,-0.793593,2.584007


In [10]:
# Create location column: 'classe' shifted back by scale * delta * sqrt(2/pi)
mean_offset = df['scale'] * df['delta'] * sqrt(2 / pi)
df['location'] = df['classe'] - mean_offset
df

Unnamed: 0,eleve,classe,min,max,commentaire,label,score,skew,delta,scale,location
0,17.80,14.28,8.30,17.80,"Un excellent début d'année, poursuivez ainsi !",5,0.756478,0.6150,0.937612,3.013959,12.025239
1,15.00,14.51,11.00,19.50,Bon ensemble.,4,0.489843,-0.3700,-0.864044,2.761004,16.413456
2,10.60,10.87,5.47,16.70,Résultats corrects mais le comportement en cla...,3,0.775267,-0.1075,-0.668329,2.364184,12.130700
3,15.50,14.24,6.00,19.00,Bon trimestre. Continuez ainsi !,4,0.485216,0.8700,0.983377,3.225933,11.708865
4,14.75,12.67,6.00,18.25,Un bon trimestre.,4,0.472740,0.2725,0.816924,2.637196,10.951046
...,...,...,...,...,...,...,...,...,...,...,...
6753,14.16,13.29,8.79,19.05,Un bon trimestre et une bonne implication en c...,4,0.593034,-0.3150,-0.839461,2.693428,15.094038
6754,10.94,12.35,5.56,19.42,Ensemble juste convenable. Il faut approfondir...,4,0.512499,-0.0700,-0.600919,2.279042,13.442718
6755,10.68,11.34,8.50,18.55,Un trimestre correcte.,3,0.475545,-1.0925,-1.011129,3.384822,14.070755
6756,15.38,13.78,9.75,18.75,Bonne implication dans le travail mené en clas...,4,0.565291,-0.2350,-0.793593,2.584007,15.416181


In [11]:
# Create shape column
def find_shape(d):
  """Return the shape parameter for delta `d`, i.e. d / sqrt(1 - d**2).

  delta is sometimes greater than 1 in absolute value. The complex-aware
  sqrt then makes the ratio purely imaginary instead of purely real, and
  minus its imaginary part is returned so that shape keeps the sign of delta.
  """
  ratio = d / sqrt(1 - d**2)
  if np.abs(d) < 1:
    return np.real(ratio)
  return -np.imag(ratio)
# Only the 'delta' column is needed, so map over it directly
df['shape'] = df['delta'].map(find_shape)
df

Unnamed: 0,eleve,classe,min,max,commentaire,label,score,skew,delta,scale,location,shape
0,17.80,14.28,8.30,17.80,"Un excellent début d'année, poursuivez ainsi !",5,0.756478,0.6150,0.937612,3.013959,12.025239,2.696742
1,15.00,14.51,11.00,19.50,Bon ensemble.,4,0.489843,-0.3700,-0.864044,2.761004,16.413456,-1.716358
2,10.60,10.87,5.47,16.70,Résultats corrects mais le comportement en cla...,3,0.775267,-0.1075,-0.668329,2.364184,12.130700,-0.898454
3,15.50,14.24,6.00,19.00,Bon trimestre. Continuez ainsi !,4,0.485216,0.8700,0.983377,3.225933,11.708865,5.415736
4,14.75,12.67,6.00,18.25,Un bon trimestre.,4,0.472740,0.2725,0.816924,2.637196,10.951046,1.416440
...,...,...,...,...,...,...,...,...,...,...,...,...
6753,14.16,13.29,8.79,19.05,Un bon trimestre et une bonne implication en c...,4,0.593034,-0.3150,-0.839461,2.693428,15.094038,-1.544771
6754,10.94,12.35,5.56,19.42,Ensemble juste convenable. Il faut approfondir...,4,0.512499,-0.0700,-0.600919,2.279042,13.442718,-0.751797
6755,10.68,11.34,8.50,18.55,Un trimestre correcte.,3,0.475545,-1.0925,-1.011129,3.384822,14.070755,-6.758492
6756,15.38,13.78,9.75,18.75,Bonne implication dans le travail mené en clas...,4,0.565291,-0.2350,-0.793593,2.584007,15.416181,-1.304288


In [12]:
# Create stars column
def count_stars(student, a, e, w):
    """Turn a grade into a 1-5 star rating from its skew-normal CDF value.

    `student` is the grade; `a`, `e` and `w` are the shape, location and
    scale of the skew-normal distribution. The CDF value (0..1) is multiplied
    by 4, rounded to the nearest integer and shifted by 1.
    """
    percentile = skewnorm.cdf(student, a, loc=e, scale=w)
    return 1 + int((percentile * 4).round())

# Columns are selected in the order count_stars expects: grade, shape, location, scale
star_inputs = df[['eleve', 'shape', 'location', 'scale']]
df['stars'] = star_inputs.apply(lambda row: count_stars(*row), axis=1)
df

Unnamed: 0,eleve,classe,min,max,commentaire,label,score,skew,delta,scale,location,shape,stars
0,17.80,14.28,8.30,17.80,"Un excellent début d'année, poursuivez ainsi !",5,0.756478,0.6150,0.937612,3.013959,12.025239,2.696742,5
1,15.00,14.51,11.00,19.50,Bon ensemble.,4,0.489843,-0.3700,-0.864044,2.761004,16.413456,-1.716358,3
2,10.60,10.87,5.47,16.70,Résultats corrects mais le comportement en cla...,3,0.775267,-0.1075,-0.668329,2.364184,12.130700,-0.898454,3
3,15.50,14.24,6.00,19.00,Bon trimestre. Continuez ainsi !,4,0.485216,0.8700,0.983377,3.225933,11.708865,5.415736,4
4,14.75,12.67,6.00,18.25,Un bon trimestre.,4,0.472740,0.2725,0.816924,2.637196,10.951046,1.416440,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6753,14.16,13.29,8.79,19.05,Un bon trimestre et une bonne implication en c...,4,0.593034,-0.3150,-0.839461,2.693428,15.094038,-1.544771,4
6754,10.94,12.35,5.56,19.42,Ensemble juste convenable. Il faut approfondir...,4,0.512499,-0.0700,-0.600919,2.279042,13.442718,-0.751797,2
6755,10.68,11.34,8.50,18.55,Un trimestre correcte.,3,0.475545,-1.0925,-1.011129,3.384822,14.070755,-6.758492,2
6756,15.38,13.78,9.75,18.75,Bonne implication dans le travail mené en clas...,4,0.565291,-0.2350,-0.793593,2.584007,15.416181,-1.304288,4


## Conclusion

In [13]:
# Absolute gap between 'label' and 'stars', and how often each gap occurs
diff = (df['label'] - df['stars']).abs()
diff.value_counts()

1    3014
0    2462
2     986
3     192
4      42
dtype: int64

In [14]:
# Remove comments when all grades are close (max - min <= 4) and abs(stars - label) >= 2
# Remove comments when there is too much disagreement (abs(stars - label) >= 4)
grade_spread = df['max'] - df['min']
mouchoir = grade_spread <= 4
disagree = diff >= 2
mask = (diff >= 4) | (mouchoir & disagree)
df = df[~mask]
df.shape

(6614, 13)

In [15]:
## We get the same number of stars about 37% of the time
df['label'].eq(df['stars']).value_counts()

False    4152
True     2462
dtype: int64

In [16]:
# Percentage of comments where 'label' and 'stars' agree.
# The matches are counted explicitly: value_counts() orders its result by
# frequency, so unpacking it as (False, True) swaps the two counts once
# matches outnumber mismatches, and fails when only one value is present.
same = df['label'] == df['stars']
t = same.sum()
f = (~same).sum()
t / (f + t) * 100

37.224070154218325

In [17]:
# The (Pearson) correlation between label and stars is positive
df['label'].corr(df['stars'], method='pearson')

0.6398017747078772

In [18]:
# Recompute the absolute gap on the filtered frame and summarise it
diff = (df['label'] - df['stars']).abs()
diff.describe()

count    6614.000000
mean        0.828243
std         0.774702
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         3.000000
dtype: float64

In [19]:
# About 83% of the ratings are at most 1 star apart (the 86th percentile is
# already 2 stars); the mean gap is 0.83 stars
tuple(diff.quantile(q) for q in (0.86, 0.87))

(2.0, 2.0)

In [20]:
# Save the filtered frame with the computed columns
# (to_csv also writes the index as the first CSV column by default)
output_csv = f'./out/{year}/data_clean_bert_skew_distrib_stars.csv'
df.to_csv(output_csv)