In [None]:
import pandas as pd
import seaborn as sns
import random
import scipy.stats as ss
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Labels passed as `nom_bias` to the plotting helpers below, where they become
# the column name given to the values loaded from each CSV file.
BIAIS_SN="biais de somme nulle"
SOMME_ECART_ABSOLU="somme des écarts absolus"

In [None]:
# Empty summary table: one row per processed file is appended to it by the
# helpers defined below (file name, then mean / std / min / max / median).
meta_df = pd.DataFrame(
    columns=[
        "file",
        "biais_sn_moyen",
        "biais_sn_std",
        "biais_sn_min",
        "biais_sn_max",
        "biais_sn_med",
    ]
)

In [None]:
def afficher_histogramme_et_enregistrer_stat(file,meta_df,nom_bias,color=None):
    """Plot the distribution of a one-column CSV and append its summary statistics.

    Parameters
    ----------
    file : path or buffer accepted by ``pandas.read_csv``
        Header-less CSV file holding a single column of values.
    meta_df : pandas.DataFrame
        Summary table accumulated so far. It is not modified in place.
    nom_bias : str
        Column name given to the loaded values.
    color : optional
        Forwarded unchanged to ``seaborn.displot``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``meta_df`` followed by one new row
        (file, mean, std, min, max, median), with the index renumbered
        0..n-1 so that every row has a distinct label.

    Raises
    ------
    ValueError
        Raised by the column assignment when the CSV does not contain
        exactly one column.
    """
    # The statistic columns keep their "biais_sn_*" names whatever `nom_bias` is,
    # so that every row fits the same summary table.
    colonnes_stats=["file", "biais_sn_moyen", "biais_sn_std","biais_sn_min","biais_sn_max","biais_sn_med"]

    df = pd.read_csv(file,header=None)
    df.columns=[nom_bias]
    valeurs=df[nom_bias]

    # Title each figure with its source file: the helper is called in loops and
    # the resulting histograms are otherwise indistinguishable.
    grille=sns.displot(valeurs,kde=True,color=color)
    grille.set(title=str(file))

    new_row_meta_df=pd.DataFrame(
        [(file,valeurs.mean(),valeurs.std(),valeurs.min(),valeurs.max(),valeurs.median())],
        columns=colonnes_stats,
    )
    # ignore_index=True: without it every appended row keeps index label 0.
    return pd.concat([meta_df,new_row_meta_df],ignore_index=True)

In [None]:
def afficher_histogramme_et_enregistrer_stat_pour_liste(liste,meta_df,nom_bias,color=None):
    """Run ``afficher_histogramme_et_enregistrer_stat`` on every file of ``liste``.

    The summary table is threaded through the successive calls: the table
    returned for one file is the one handed over for the next file.

    Parameters
    ----------
    liste : iterable
        Files to process, in order.
    meta_df
        Summary table to start from.
    nom_bias
        Forwarded unchanged to each per-file call.
    color : optional
        Forwarded unchanged to each per-file call.

    Returns
    -------
    The table returned by the last per-file call, or ``meta_df`` itself when
    ``liste`` is empty.
    """
    cumul=meta_df
    for chemin in liste:
        cumul=afficher_histogramme_et_enregistrer_stat(chemin,cumul,nom_bias,color)
    return cumul

# Simulations Monte-Carlo

## 1. Scénarios d'ajouts cumulés et loi uniforme sur les scores, sur un sous-échantillon de taille n tiré aléatoirement parmi m entités

In [None]:
# Illustration of the uniform score law: 100 000 integers drawn with
# random.randint(-10, 10), whose two bounds are both included.
l=[random.randint(-10,10) for _ in range(100000)]
df=pd.DataFrame({'tirage':l})

# discrete=True gives one bar per integer value; bar heights are percentages of the draws.
sns.histplot(data=df,x="tirage",stat="percent",discrete=True,color="red")

### 1.1 n=10 ajouts et m=20 entités

In [None]:
DIRECTORY="20_entities_10_samples"
# One entry per family of files: (path template, label of the value column, histogram colour).
# Files numbered 1 to 5 of each family are plotted and summarised into meta_df.
for modele,etiquette,couleur in (
    ("{}/score_20_10_uniform_220920_clean_{}.csv",BIAIS_SN,None),
    ("{}/score_20_10_uniform_220920_mea_{}.csv",SOMME_ECART_ABSOLU,"red"),
):
    liste=[modele.format(DIRECTORY,i) for i in range(1,6)]
    meta_df=afficher_histogramme_et_enregistrer_stat_pour_liste(liste,meta_df,etiquette,color=couleur)

### 1.2 n=100 ajouts et m=20 entités

In [None]:
DIRECTORY="20_entities_100_samples"
# One entry per family of files: (path template, label of the value column, histogram colour).
# range(1,2) only yields 1, so a single file per family is processed here.
for modele,etiquette,couleur in (
    ("{}/score_20_100_uniform_220918_clean_{}.csv",BIAIS_SN,None),
    ("{}/score_20_100_uniform_220918_mea_{}.csv",SOMME_ECART_ABSOLU,"red"),
):
    liste=[modele.format(DIRECTORY,i) for i in range(1,2)]
    meta_df=afficher_histogramme_et_enregistrer_stat_pour_liste(liste,meta_df,etiquette,color=couleur)

In [None]:
# Files 1 to 5 of the "100_entities_10_samples" directory; only the "clean"
# family is loaded here, labelled with BIAIS_SN.
# NOTE(review): the directory name suggests m=100 entities and n=10 additions,
# which does not match the "1.2" heading just above this cell — confirm.
liste=["100_entities_10_samples/score_100_10_{}_clean.csv".format(i) for i in range(1,6)]
meta_df=afficher_histogramme_et_enregistrer_stat_pour_liste(liste,meta_df,BIAIS_SN)

## 2. Scénarios d'ajouts cumulés et loi gaussienne discrète sur les scores, sur un sous-échantillon de taille n tiré aléatoirement parmi m entités

In [None]:
# Illustration of the discrete Gaussian score law on the integers -10..10:
# each integer k receives the mass that a centred normal law of standard
# deviation 3 puts on the interval [k - 0.5, k + 0.5].
x = np.arange(-10, 11)
xU, xL = x + 0.5, x - 0.5
prob = ss.norm.cdf(xU, scale = 3) - ss.norm.cdf(xL, scale = 3)
prob = prob / prob.sum() # normalize the probabilities so their sum is 1
nums = np.random.choice(x, size = 10000, p = prob)
# Explicit half-integer edges (-10.5, -9.5, ..., 10.5) give exactly one bin per
# integer. With bins=len(x) the 21 bins are spread over the observed min..max,
# so they drift off the integers whenever -10 or 10 happens not to be drawn.
plt.hist(nums, bins = np.append(xL, xU[-1]))

### 2.1 n=10 ajouts et m=20 entités

In [None]:
DIRECTORY="20_entities_10_samples"
# This cell sits in the Gaussian section (2.1) but used to reload the *uniform*
# files of section 1.1, which duplicated those rows in meta_df. It now reads the
# Gaussian files instead.
# NOTE(review): the "220920" date stamp of the Gaussian files is an assumption
# (same stamp as the uniform and invgaussian 10-sample runs) — confirm against
# the actual file names in the directory.
liste=["{}/score_20_10_gaussian_220920_clean_{}.csv".format(DIRECTORY,i) for i in range(1,6)]
meta_df=afficher_histogramme_et_enregistrer_stat_pour_liste(liste,meta_df,BIAIS_SN)
liste=["{}/score_20_10_gaussian_220920_mea_{}.csv".format(DIRECTORY,i) for i in range(1,6)]
meta_df=afficher_histogramme_et_enregistrer_stat_pour_liste(liste,meta_df,SOMME_ECART_ABSOLU,color="red")

### 2.2 n=100 ajouts et m=20 entités

In [None]:
DIRECTORY="20_entities_100_samples"
# One entry per family of files: (path template, label of the value column, histogram colour).
# range(1,2) only yields 1, so a single file per family is processed here.
for modele,etiquette,couleur in (
    ("{}/score_20_100_gaussian_220916_clean_{}.csv",BIAIS_SN,None),
    ("{}/score_20_100_gaussian_220916_mea_{}.csv",SOMME_ECART_ABSOLU,"red"),
):
    liste=[modele.format(DIRECTORY,i) for i in range(1,2)]
    meta_df=afficher_histogramme_et_enregistrer_stat_pour_liste(liste,meta_df,etiquette,color=couleur)

## 3. Scénarios d'ajouts cumulés et deux lois gaussiennes discrètes centrées sur les bornes pour les scores, sur un sous-échantillon de taille n tiré aléatoirement parmi m entités

In [None]:
# Illustration of the score law with most of its mass near the two bounds:
# 21 hard-coded weights, one per integer of x, symmetric about 0 and largest
# at -10 and 10.
x = np.arange(-10, 11)
prob= np.array([0.13242928, 0.12533694,
 0.1062586,  0.08069342, 0.05489084, 0.03344625, 0.01825486, 0.00892463,
 0.00390822, 0.00153299,
0.00053861, 0.00153299, 0.00390822, 0.00892463, 0.01825486, 0.03344625,
 0.05489084, 0.08069342, 0.1062586,  0.12533694, 0.13242928])
prob = prob / prob.sum() # rescale the weights so that they sum to 1
nums = np.random.choice(x, size = 10000, p = prob)
# Explicit half-integer edges (-10.5, -9.5, ..., 10.5) give exactly one bin per
# integer whatever values were drawn; bins=len(x) relied on the sample
# spanning the whole -10..10 range.
plt.hist(nums, bins = np.append(x - 0.5, x[-1] + 0.5))

### 3.1 n=10 ajouts et m=20 entités

In [None]:
DIRECTORY="20_entities_10_samples"
# One entry per family of files: (path template, label of the value column, histogram colour).
# Files numbered 1 to 5 of each family are plotted and summarised into meta_df.
for modele,etiquette,couleur in (
    ("{}/score_20_10_invgaussian_220920_clean_{}.csv",BIAIS_SN,None),
    ("{}/score_20_10_invgaussian_220920_mea_{}.csv",SOMME_ECART_ABSOLU,"red"),
):
    liste=[modele.format(DIRECTORY,i) for i in range(1,6)]
    meta_df=afficher_histogramme_et_enregistrer_stat_pour_liste(liste,meta_df,etiquette,color=couleur)

### 3.2 n=100 ajouts et m=20 entités

In [None]:
DIRECTORY="20_entities_100_samples"
# One entry per family of files: (path template, label of the value column, histogram colour).
# range(1,2) only yields 1, so a single file per family is processed here.
for modele,etiquette,couleur in (
    ("{}/score_20_100_invgaussian_220911_clean_{}.csv",BIAIS_SN,None),
    ("{}/score_20_100_invgaussian_220911_mea_{}.csv",SOMME_ECART_ABSOLU,"red"),
):
    liste=[modele.format(DIRECTORY,i) for i in range(1,2)]
    meta_df=afficher_histogramme_et_enregistrer_stat_pour_liste(liste,meta_df,etiquette,color=couleur)

In [None]:
# Display the summary table accumulated by the cells above (one row per processed file).
meta_df

In [None]:
def print_histo(name):
    """Plot, on a figure of its own, the three ``mea_score*`` columns of a CSV file.

    Parameters
    ----------
    name : str
        Path of a header-less CSV file read as seven columns: ``mea_score``,
        ``mea_score_sub``, ``mea_score_on``, ``mea_unc``, ``mea_unc_sub``,
        ``mea_unc_on`` and ``nombre``. It is also used as the figure title.

    Returns
    -------
    None
        The function is called for the figure it draws.
    """
    df=pd.read_csv(name,
               header=None,names=["mea_score","mea_score_sub","mea_score_on","mea_unc","mea_unc_sub","mea_unc_on","nombre"])
    df_mea=df[["mea_score","mea_score_sub","mea_score_on"]]
    # Create the figure *before* plotting and hand its axes to seaborn. Opening
    # a new figure after the plot drew the first histogram on whatever figure
    # was current and left one empty figure behind after every call.
    _fig,ax=plt.subplots()
    # melt() stacks the three columns into (variable, value) pairs so that
    # `hue` can draw one histogram per original column, side by side.
    sns.histplot(data=df_mea.melt(),x="value",hue="variable",multiple="dodge",kde=True,ax=ax)
    ax.set(title=name)

In [None]:
BASE_DIR="20_entities_10_samples"

# One "meaplus" family per score law; files numbered 1 to 5 of each family are
# plotted in turn, in the order uniform, gaussian, invgaussian.
for modele in (
    "{}/score_20_10_uniform_220926_meaplus_{}.csv",
    "{}/score_20_10_gaussian_220926_meaplus_{}.csv",
    "{}/score_20_10_invgaussian_220925_meaplus_{}.csv",
):
    for i in range(1,6):
        print_histo(modele.format(BASE_DIR,i))

In [None]:
# The global `df` at this point is the single-column 'tirage' frame built in the
# uniform illustration cell (the `df` inside print_histo is local to it), so
# selecting the mea_unc* columns from it raised a KeyError. Load a "meaplus"
# file explicitly instead.
# NOTE(review): the file chosen here (the last one plotted by the previous cell)
# is an assumption — confirm which run this histogram is meant to show.
df=pd.read_csv("{}/score_20_10_invgaussian_220925_meaplus_{}.csv".format(BASE_DIR,5),
               header=None,names=["mea_score","mea_score_sub","mea_score_on","mea_unc","mea_unc_sub","mea_unc_on","nombre"])
df_mea=df[["mea_unc","mea_unc_sub","mea_unc_on"]]
# melt() stacks the three columns into (variable, value) pairs so that `hue`
# draws one histogram per original column.
sns.histplot(data=df_mea.melt(),x="value",hue="variable")