In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re

**Reproduce the results**

Simply run this bash script.

```bash
for model in miaou reference-simple reference-experience reference-expert; do
    time python3 gpt.py evaluate $model v0 --csv
done
```

In [48]:
# Evaluation runs to summarise; each one is read from ../_data/x/<target>/res.csv.
targets = ["albert-light-rag-v0", "albert-light-simple-v0"]

# Per-target summaries: one Series (indexed by feature name) is appended per
# target, in the same order as `targets`.
mean_dfs = []
std_dfs = []
bin_dfs = []

# Columns that get the "positive only" / "presence" treatment below.
# Every other column is aggregated on its raw values.
columns = ["number_artefacts", "prompt_artefacts", "emails", "urls", "phones", "dates", "hours", "prices_",]

for target in targets:
    df = pd.read_csv(f"../_data/x/{target}/res.csv")
    df = df.drop("id", axis=1)
    features = df.columns
    n_doc = df.shape[0]

    # Single vectorised mask reused by both statistics (replaces the nested
    # element-wise `apply(lambda ...)` calls, which ran Python code per cell).
    is_positive = df[columns] > 0

    # Mean/std of each "feature" count among documents where it is positive:
    # non-positive entries become NaN, which mean()/std() skip by default.
    _df = df.copy()
    _df[columns] = df[columns].where(is_positive)
    mean_dfs.append(_df.mean(0))
    std_dfs.append(_df.std(0))

    # Ratio of documents that have the "feature": positive -> 1, otherwise 0,
    # so the column mean is the presence ratio.
    _df = df.copy()
    _df[columns] = is_positive.astype(int)
    bin_dfs.append(_df.mean(0))


# NOTE: `features` and `n_doc` hold the values of the LAST target processed;
# the count printed here is therefore that of the last entry in `targets`.
print("Number of document: %d" % n_doc)

Number of document: 500


The results show the mean value of each feature in the generated text, over the evaluation set:

* **words**: number of words in the text
* **ttr**: lexical diversity (unique words / words)
* **artefacts**: *number* of prompt artefacts (^---, ^<\w, ^###)
* **emails**: *number* of emails
* **urls**: *number* of URLs/websites
* **phones**: *number* of phone numbers
* **dates**: *number* of dates (at least a month given, optionally a day or a year)
* **hours**: *number* of hours
* **prices**: *number* of prices or ratios
* **repetition**: tries to catch whether sentences are repeated in an answer. It could mean the model needs more epochs (or a variation of the fine-tuning parameters?)
* **3words_repetition**: tries to catch whether a word is repeated three times (or more) in a row. It could mean the model needs more epochs (or a variation of the fine-tuning parameters?)
* **idk**: when the LLM says it doesn't know!

In [49]:
# Presence ratio per target: for the binarised columns, the share of documents
# in which the feature appears at least once.

# Row labels are the target names with any trailing "-test" removed.
row_labels = [re.sub(r"-test$", "", name) for name in targets]
bin_df = pd.DataFrame(bin_dfs, columns=features, index=row_labels)
bin_df.round(2)

Unnamed: 0,words,ttr,emails,urls,phones,dates,hours,prices_,number_artefacts,prompt_artefacts,loop,idk
albert-light-rag-v0,120.35,0.61,0.0,0.04,0.02,0.02,0.0,0.07,0.22,0.0,0.02,0.08
albert-light-simple-v0,209.47,0.44,0.0,0.01,0.02,0.02,0.0,0.04,0.23,0.0,0.05,0.0


In [50]:
# Mean of the count values per target; for the masked columns the mean is
# taken over positive entries only.

# Row labels are the target names with any trailing "-test" removed.
row_labels = [re.sub(r"-test$", "", name) for name in targets]
mean_df = pd.DataFrame(mean_dfs, columns=features, index=row_labels)
mean_df.round(2)

Unnamed: 0,words,ttr,emails,urls,phones,dates,hours,prices_,number_artefacts,prompt_artefacts,loop,idk
albert-light-rag-v0,120.35,0.61,,1.53,1.91,1.58,4.0,1.76,1.93,,0.02,0.08
albert-light-simple-v0,209.47,0.44,,2.0,3.12,4.12,18.0,4.48,4.42,,0.05,0.0


In [12]:
# Combined table: each cell is rendered as the text "mean ± std",
# both rounded to two decimals.

# Row labels are the target names with any trailing "-test" removed.
row_labels = [re.sub(r"-test$", "", name) for name in targets]
mean_df = pd.DataFrame(mean_dfs, columns=features, index=row_labels)
std_df = pd.DataFrame(std_dfs, columns=features, index=row_labels)

# Convert to strings first so the "+" below concatenates text cell by cell.
mean_text = mean_df.round(2).astype(str)
std_text = std_df.round(2).astype(str)
df = mean_text + " ± " + std_text
df

Unnamed: 0,words,ttr,emails,urls,phones,dates,hours,prices_,number_artefacts,prompt_artefacts,repetition,3word_repetition,idk
albert-light-rag-v0,120.35 ± 94.14,0.61 ± 0.19,0.0 ± 0.0,0.06 ± 0.35,0.04 ± 0.32,0.04 ± 0.26,0.01 ± 0.18,0.12 ± 0.62,0.86 ± 1.91,0.0 ± 0.0,0.05 ± 0.21,0.0 ± 0.0,0.08 ± 0.27
albert-light-simple-v0,209.47 ± 145.98,0.44 ± 0.17,0.0 ± 0.0,0.01 ± 0.19,0.05 ± 0.5,0.07 ± 0.71,0.04 ± 0.8,0.19 ± 1.49,4.41 ± 6.41,0.0 ± 0.0,0.08 ± 0.27,0.0 ± 0.06,0.0 ± 0.0
