In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import norm

In [None]:
# Raise pandas' display limits (total width, number of columns/rows, and
# per-cell text width) for the frames rendered in this notebook.
display_options = {
    "display.width": 500,
    "display.max_columns": 500,
    "display.max_rows": 500,
    "display.max_colwidth": 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Paramètres

In [None]:
# Significance level used for the error bars of every figure below.
ALPHA = 0.05

# Two-sided critical value of the standard normal distribution for ALPHA
# (about 1.96 when ALPHA is 0.05).
upper_quantile_level = 1 - ALPHA / 2
Z_NORM = norm.ppf(upper_quantile_level)

# Theme labels, as they read once the raw labels have been renamed.
THEMES = [
    "Finance",
    "IT",
    "RH",
]

# Complexity levels; this order drives both the trace order and the colour
# assigned to each level in the bar charts.
COMPLEXITIES = [
    "simple",
    "intermédiaire",
    "difficile",
    "inappropriée",
]

# Maille réponse

In [None]:
# Read the answer-level dataset from its JSON file (a list of records).
file_path = "../data/05_anonymised/answers_anonymised.json"
answers = pd.read_json(
    path_or_buf=file_path,
    orient="records",
)

## Nettoyage

#### Filtrage des thèmes

In [None]:
# Keep only the answers whose theme is one of the raw labels listed below.
# NOTE: these are the labels as found in the file; once the renaming cell has
# run, re-running this cell would drop the rows renamed to "Finance" and "RH".
themes_to_keep = ["Financial", "HR", "IT"]
where = answers["thème"].isin(themes_to_keep)
# .copy() turns the filtered result into an independent frame, so the column
# assignments made by the following cells do not raise SettingWithCopyWarning.
answers = answers.loc[where].copy()

#### Renommage des thèmes

In [None]:
# Map the raw theme labels to the names used in the figures; labels absent
# from the mapping (e.g. "IT") are left unchanged.
theme_labels = {"Financial": "Finance", "HR": "RH"}
answers["thème"] = answers["thème"].replace(theme_labels)

#### Imputation des complexités manquantes

In [None]:
# Answers without a complexity label are assigned the "inappropriée" level.
answers_complexity_filled = answers["complexité"].fillna(value="inappropriée")
answers["complexité"] = answers_complexity_filled

# Maille phrase

In [None]:
# Read the sentence-level dataset from its JSON file (a list of records).
file_path = "../data/05_anonymised/sentences_anonymised.json"
sentences = pd.read_json(
    path_or_buf=file_path,
    orient="records",
)

#### Filtrage des thèmes

In [None]:
# Keep only the sentences whose theme is one of the raw labels listed below.
# NOTE: these are the labels as found in the file; once the renaming cell has
# run, re-running this cell would drop the rows renamed to "Finance" and "RH".
themes_to_keep = ["Financial", "HR", "IT"]
where = sentences["thème"].isin(themes_to_keep)
# .copy() turns the filtered result into an independent frame, so the column
# assignments made by the following cells do not raise SettingWithCopyWarning.
sentences = sentences.loc[where].copy()

#### Renommage des thèmes

In [None]:
# Map the raw theme labels to the names used in the figures; labels absent
# from the mapping (e.g. "IT") are left unchanged.
theme_labels = {"Financial": "Finance", "HR": "RH"}
sentences["thème"] = sentences["thème"].replace(theme_labels)

#### Imputation des complexités manquantes

In [None]:
# Sentences without a complexity label are assigned the "inappropriée" level.
sentences_complexity_filled = sentences["complexité"].fillna(value="inappropriée")
sentences["complexité"] = sentences_complexity_filled

## Taux de langue correcte

In [None]:
# Per (complexity, theme) group: number of non-null observations and mean of
# the `answer_is_not_english` flag over the answers.
column = "answer_is_not_english"
grouped = answers.groupby(["complexité", "thème"])[column]
sizes = grouped.count()
proportions = grouped.mean()

# Half-width of the normal-approximation confidence interval of a proportion:
# z * sqrt(p * (1 - p) / n).
standard_errors = np.sqrt(proportions * (1 - proportions) / sizes)
uncertainties = Z_NORM * standard_errors

In [None]:
# One shade of the sequential "Reds" palette per complexity level, taken in
# the order of COMPLEXITIES.
palette = px.colors.sequential.Reds
colors = {level: palette[position] for position, level in enumerate(COMPLEXITIES)}

# One bar trace per complexity level; `proportions` and `uncertainties` are the
# (complexity, theme)-indexed Series computed in the preceding cell, so
# selecting a level leaves a Series indexed by theme.
fig = go.Figure()
for complexity in COMPLEXITIES:
    rates = proportions[complexity]
    half_widths = uncertainties[complexity]
    bar = go.Bar(
        x=rates.index,
        y=rates,
        name=complexity,
        marker=dict(color=colors[complexity]),
        error_y=dict(type="data", array=half_widths, visible=True),
    )
    fig.add_trace(bar)

# update_layout returns the figure, so keeping it as the last expression of
# the cell is what renders the chart.
layout_options = dict(
    xaxis_title="Thème",
    yaxis_title="Taux de langue correcte",
    font=dict(family="Computer Modern", size=20, color="#7f7f7f"),
)
fig.update_layout(**layout_options)

## Taux de réponse

In [None]:
# Per (complexity, theme) group: number of non-null observations and mean of
# the `answers_has_source` flag over the answers.
column = "answers_has_source"
grouped = answers.groupby(["complexité", "thème"])[column]
sizes = grouped.count()
proportions = grouped.mean()

# Half-width of the normal-approximation confidence interval of a proportion:
# z * sqrt(p * (1 - p) / n).
standard_errors = np.sqrt(proportions * (1 - proportions) / sizes)
uncertainties = Z_NORM * standard_errors

In [None]:
# One shade of the sequential "Reds" palette per complexity level, taken in
# the order of COMPLEXITIES.
palette = px.colors.sequential.Reds
colors = {level: palette[position] for position, level in enumerate(COMPLEXITIES)}

# One bar trace per complexity level; `proportions` and `uncertainties` are the
# (complexity, theme)-indexed Series computed in the preceding cell, so
# selecting a level leaves a Series indexed by theme.
fig = go.Figure()
for complexity in COMPLEXITIES:
    rates = proportions[complexity]
    half_widths = uncertainties[complexity]
    bar = go.Bar(
        x=rates.index,
        y=rates,
        name=complexity,
        marker=dict(color=colors[complexity]),
        error_y=dict(type="data", array=half_widths, visible=True),
    )
    fig.add_trace(bar)

# update_layout returns the figure, so keeping it as the last expression of
# the cell is what renders the chart.
layout_options = dict(
    xaxis_title="Thème",
    yaxis_title="Taux de réponse",
    font=dict(family="Computer Modern", size=20, color="#7f7f7f"),
)
fig.update_layout(**layout_options)

## Taux de citations fonctionnelles

In [None]:
# Per (complexity, theme) group over the sentences: number of non-null
# observations, and the complement of the mean of the
# `source_ids_in_sentences_hallucinated` flag.
column = "source_ids_in_sentences_hallucinated"
grouped = sentences.groupby(["complexité", "thème"])[column]
sizes = grouped.count()
proportions = 1 - grouped.mean()

# Half-width of the normal-approximation confidence interval of a proportion:
# z * sqrt(p * (1 - p) / n).
standard_errors = np.sqrt(proportions * (1 - proportions) / sizes)
uncertainties = Z_NORM * standard_errors

In [None]:
# One shade of the sequential "Reds" palette per complexity level, taken in
# the order of COMPLEXITIES.
palette = px.colors.sequential.Reds
colors = {level: palette[position] for position, level in enumerate(COMPLEXITIES)}

# One bar trace per complexity level; `proportions` and `uncertainties` are the
# (complexity, theme)-indexed Series computed in the preceding cell, so
# selecting a level leaves a Series indexed by theme.
fig = go.Figure()
for complexity in COMPLEXITIES:
    rates = proportions[complexity]
    half_widths = uncertainties[complexity]
    bar = go.Bar(
        x=rates.index,
        y=rates,
        name=complexity,
        marker=dict(color=colors[complexity]),
        error_y=dict(type="data", array=half_widths, visible=True),
    )
    fig.add_trace(bar)

# update_layout returns the figure, so keeping it as the last expression of
# the cell is what renders the chart.
layout_options = dict(
    xaxis_title="Thème",
    yaxis_title="Taux de citations fonctionnelles",
    font=dict(family="Computer Modern", size=20, color="#7f7f7f"),
)
fig.update_layout(**layout_options)