# Notebook to plot metrics 

This notebook plots:
- metrics per PDF from `data/output/all_metrics_per_pdf.csv`
- metrics per indicator from `data/output/all_metrics_per_indic.csv`   

and computes the average metrics.  

The metrics under study are:
- the accuracy 
- the precision
- the recall
- the number of true positives (TP)   
    A predicted value is counted as a TP when it is a number equal to the true value
- the number of true negatives (TN) 
    A predicted value is counted as a TN when it is np.nan (i.e. "je ne trouve pas") and the true value is also np.nan (i.e. no answer in the PDF)
- the number of false positives of kind 1 (FP1) 
    A predicted value is counted as a FP1 when it is a number while the true value is np.nan (i.e. no answer in the PDF)
- the number of false positives of kind 2 (FP2) 
    A predicted value is counted as a FP2 when it is a number while the true value is a different number
- the number of false negatives (FN) 
    A predicted value is counted as a FN when it is np.nan (i.e. "je ne trouve pas") while the true value is a number

All metrics can be calculated in principle with respect to "true" PDF values or SISPEA values (if SISPEA values are available).  
It turns out that true SISPEA values differ significantly from true PDF values, hence it is not relevant to use SISPEA values as the true reference for computing metrics.

### Import modules

In [1]:
import pandas as pd
import plotly.express as px
import sys
sys.path.append("../")    # Add the path to the root directory (where we can find the folder narval/)

%load_ext autoreload
%autoreload 2 

from narval.utils import get_data_dir, FileSystem
from narval.metrics import MetricsCalculator

  machar = _get_machar(dtype)


In [2]:
# Create a FileSystem helper and store the value returned by get_data_dir().
# NOTE(review): `dir` shadows the Python builtin of the same name; neither `fs`
# nor `dir` is referenced again in the cells visible here - confirm before renaming.
fs = FileSystem()
dir = get_data_dir()

### Import the metrics calculator

In [3]:
# Single MetricsCalculator instance shared by the cells below, which use its
# compute_average_metrics() method and its df_per_pdf / df_per_indic attributes.
metrics_calc = MetricsCalculator()

### Compute average metrics

In [4]:
# Columns shown in the summary table, in display order (only the "vs_pdf" metrics are kept)
ordered_cols_to_keep = [
    'benchmark_version',
    'pdf_list_file',
    'no_value_indic_rate_in_pdf',
    'avg_accuracy_vs_pdf',
    'avg_precision_vs_pdf',
    'avg_recall_vs_pdf',
    'tp_rate_vs_pdf',
    'tn_rate_vs_pdf',
    'fp1_rate_vs_pdf',
    'fp2_rate_vs_pdf',
    'fn_rate_vs_pdf',
]

# Compute the averages, order the rows by the integer that ends the benchmark name
# (so that "benchmark_10" comes after "benchmark_9"), and keep the columns above
mean_df = (
    metrics_calc.compute_average_metrics()
    .sort_values(
        by="benchmark_version",
        key=lambda versions: [int(name.rpartition("_")[2]) for name in versions],
    )
    [ordered_cols_to_keep]
)

# Show
mean_df



Unnamed: 0,benchmark_version,pdf_list_file,no_value_indic_rate_in_pdf,avg_accuracy_vs_pdf,avg_precision_vs_pdf,avg_recall_vs_pdf,tp_rate_vs_pdf,tn_rate_vs_pdf,fp1_rate_vs_pdf,fp2_rate_vs_pdf,fn_rate_vs_pdf
35,benchmark_4,rpqs_eval_list_1.csv,0.382456,0.414035,0.315271,0.363636,0.224561,0.189474,0.192982,0.294737,0.098246
36,benchmark_5,rpqs_eval_list_1.csv,0.382456,0.768421,0.786982,0.755682,0.466667,0.301754,0.080702,0.045614,0.105263
37,benchmark_6,rpqs_eval_list_1.csv,0.382456,0.824561,0.813187,0.840909,0.519298,0.305263,0.077193,0.042105,0.05614
38,benchmark_7,rpqs_eval_list_1.csv,0.382456,0.775439,0.736842,0.875,0.540351,0.235088,0.147368,0.045614,0.031579
39,benchmark_8,rpqs_eval_list_1.csv,0.382456,0.757895,0.716981,0.863636,0.533333,0.224561,0.157895,0.052632,0.031579
40,benchmark_9,rpqs_eval_list_1.csv,0.382456,0.814035,0.78,0.886364,0.547368,0.266667,0.115789,0.038596,0.031579
0,benchmark_10,rpqs_eval_list_1.csv,0.382456,0.85614,0.84153,0.875,0.540351,0.315789,0.066667,0.035088,0.042105
1,benchmark_11,rpqs_eval_list_1.csv,0.382456,0.835088,0.834254,0.857955,0.529825,0.305263,0.077193,0.02807,0.059649
2,benchmark_12,rpqs_eval_list_1.csv,0.382456,0.831579,0.818681,0.846591,0.522807,0.308772,0.073684,0.042105,0.052632
3,benchmark_13,rpqs_eval_list_1.csv,0.382456,0.810526,0.769608,0.892045,0.550877,0.259649,0.122807,0.042105,0.024561


In [6]:
# Benchmarks to compare side by side
benchmark_list = ["benchmark_27", "benchmark_32", "benchmark_table_32"]

# Show their average metrics, grouped by benchmark and then by PDF list
selected_rows = mean_df["benchmark_version"].isin(benchmark_list)
mean_df.loc[selected_rows].sort_values(by=["benchmark_version", "pdf_list_file"])

Unnamed: 0,benchmark_version,pdf_list_file,no_value_indic_rate_in_pdf,avg_accuracy_vs_pdf,avg_precision_vs_pdf,avg_recall_vs_pdf,tp_rate_vs_pdf,tn_rate_vs_pdf,fp1_rate_vs_pdf,fp2_rate_vs_pdf,fn_rate_vs_pdf
17,benchmark_27,rpqs_eval_list_1+2.csv,0.492398,0.860819,0.841346,0.806452,0.409357,0.451462,0.040936,0.036257,0.061988
18,benchmark_27,rpqs_eval_list_1.csv,0.382456,0.929825,0.952096,0.903409,0.557895,0.37193,0.010526,0.017544,0.042105
19,benchmark_27,rpqs_eval_list_2.csv,0.547368,0.826316,0.767068,0.74031,0.335088,0.491228,0.05614,0.045614,0.07193
32,benchmark_32,rpqs_eval_list_1+2.csv,0.492398,0.899415,0.907731,0.83871,0.425731,0.473684,0.018713,0.024561,0.05731
33,benchmark_32,rpqs_eval_list_1.csv,0.382456,0.936842,0.958333,0.914773,0.564912,0.37193,0.010526,0.014035,0.038596
34,benchmark_32,rpqs_eval_list_2.csv,0.547368,0.880702,0.871245,0.786822,0.35614,0.524561,0.022807,0.029825,0.066667
47,benchmark_table_32,rpqs_eval_list_1+2.csv,0.492398,0.734503,1.0,0.476959,0.242105,0.492398,0.0,0.0,0.265497
48,benchmark_table_32,rpqs_eval_list_1.csv,0.382456,0.740351,1.0,0.579545,0.357895,0.382456,0.0,0.0,0.259649
49,benchmark_table_32,rpqs_eval_list_2.csv,0.547368,0.731579,1.0,0.406977,0.184211,0.547368,0.0,0.0,0.268421


In [None]:
# Pick the PDF list and the benchmark versions to compare
pdf_list_file = "rpqs_eval_list_1+2.csv"
benchmark_list = ["benchmark_32", "benchmark_table_32"]

# Legend labels for the five outcome rates
new_names_dict = {
    "tp_rate_vs_pdf": "True positives",
    "tn_rate_vs_pdf": "True negatives",
    "fp1_rate_vs_pdf": "False positives of kind 1",
    "fp2_rate_vs_pdf": "False positives of kind 2",
    "fn_rate_vs_pdf": "False negatives",
}
cols_to_show = list(new_names_dict.values())

# Keep the rows of the chosen PDF list and benchmarks, then relabel the rate columns
on_pdf_list = mean_df["pdf_list_file"] == pdf_list_file
on_benchmark = mean_df["benchmark_version"].isin(benchmark_list)
data_to_plot = mean_df[on_pdf_list & on_benchmark].rename(columns=new_names_dict)

# One horizontal bar per benchmark, with one coloured segment per outcome rate
fig = px.bar(
    data_to_plot,
    x=cols_to_show,
    y="benchmark_version",
    category_orders={"variable": cols_to_show},
    color_discrete_sequence=[px.colors.qualitative.Plotly[i] for i in [0, 5, 1, 6, 4]],
    title=f"Average metrics for {pdf_list_file}",
    width=800,
    height=300,
)

# Rates are fractions in [0, 1]; display them as percentages
fig.update(
    layout_xaxis_range=[0, 1],
    layout_xaxis_title="Rate",
    layout_xaxis_tickformat=',.0%',
)
fig.show()

### Plot metrics per pdf

Get the metrics dataframe

In [7]:
# Per-PDF metrics table exposed by the metrics calculator; preview its first rows
df_per_pdf = metrics_calc.df_per_pdf
df_per_pdf.head()

Unnamed: 0,benchmark_version,year,competence,pdf_name,no_value_indic_nb_in_sispea,accuracy_vs_sispea,precision_vs_sispea,recall_vs_sispea,tp_nb_vs_sispea,tn_nb_vs_sispea,...,fn_nb_vs_sispea,no_value_indic_nb_in_pdf,accuracy_vs_pdf,precision_vs_pdf,recall_vs_pdf,tp_nb_vs_pdf,tn_nb_vs_pdf,fp1_nb_vs_pdf,fp2_nb_vs_pdf,fn_nb_vs_pdf
0,benchmark_10,2021,assainissement collectif,RPQS_Abainville_AC_2021.pdf,9.0,0.894737,0.833333,1.0,10.0,7.0,...,0.0,9,0.894737,0.833333,1.0,10,7,2,0,0
1,benchmark_10,2021,assainissement collectif,RPQS_Allain_AC_2021.pdf,4.0,0.526316,0.727273,0.533333,8.0,2.0,...,6.0,10,0.894737,0.818182,1.0,9,8,2,0,0
2,benchmark_10,2021,assainissement collectif,RPQS_Alloue_AC_2021.pdf,8.0,0.947368,0.916667,1.0,11.0,7.0,...,0.0,8,0.947368,0.916667,1.0,11,7,1,0,0
3,benchmark_10,2021,assainissement collectif,RPQS_Aubignosc_AC_2021.pdf,1.0,0.105263,0.25,0.055556,1.0,1.0,...,14.0,16,0.842105,0.25,0.333333,1,15,1,2,0
4,benchmark_10,2021,assainissement collectif,RPQS_Autun_AC_2021.pdf,0.0,0.578947,0.611111,0.578947,11.0,0.0,...,1.0,4,0.842105,0.833333,1.0,15,1,3,0,0


Plot the accuracy per pdf

In [None]:
# Choose the benchmark versions to be studied
benchmark_list = ["benchmark_32", "benchmark_table_32"]

# Choose the benchmark version fixing PDF order (PDFs are sorted by ascending accuracy)
pdf_order = "benchmark_32"

# Select data in the metrics dataframe.
# Take an explicit copy: the 'pdf_name' column is reassigned below, and writing to a
# slice of df_per_pdf would raise pandas' SettingWithCopyWarning.
data_to_plot = df_per_pdf.loc[
    df_per_pdf["benchmark_version"].isin(benchmark_list),
    ['benchmark_version', 'pdf_name', 'accuracy_vs_sispea', 'accuracy_vs_pdf'],
].copy()

# Order PDF by accuracy (as measured for the reference benchmark `pdf_order`)
pdf_order_list = (df_per_pdf[df_per_pdf["benchmark_version"]==pdf_order]
                  .sort_values(by="accuracy_vs_pdf", ascending=True)["pdf_name"]
                  .to_list())
data_to_plot['pdf_name'] = pd.Categorical(data_to_plot['pdf_name'], categories=pdf_order_list, ordered=True)
data_to_plot = data_to_plot.sort_values("pdf_name")

# Simplify PDF names (drop the "_rpqsid_<digits>" and "_cp<digits>" parts)
data_to_plot['pdf_name'] = data_to_plot['pdf_name'].str.replace(r"_rpqsid_\d+|_cp\d+", "", regex=True)

# Choose color order (benchmarks sorted by the integer ending their name)
benchmark_order_list = data_to_plot["benchmark_version"].unique().tolist()
benchmark_order_list.sort(key=lambda x: int(x.split("_")[-1]))
# Only needed by the commented-out color_discrete_sequence option below
benchmark_nb_order_list = [int(x.split("_")[-1]) for x in benchmark_order_list]

# Plot
#for tag in ["vs_sispea", "vs_pdf"]:
for tag in ["vs_pdf"]:
    fig = px.bar(data_to_plot, 
                y="pdf_name", 
                x="accuracy_"+tag, 
                color="benchmark_version", 
                # category_orders expects lists: pass a real list, not a one-shot reversed() iterator
                category_orders={"benchmark_version": benchmark_order_list[::-1]},
                #color_discrete_sequence=[px.colors.qualitative.Plotly[x%10] for x in benchmark_nb_order_list],
                height=1200,  #600,
                width=800,
                barmode='group',
                title=f"Accuracy per pdf ({tag})")
    fig.layout.xaxis.tickformat = ',.0%'
    fig.update(layout_xaxis_range = [0,1], layout_xaxis_title="Accuracy")
    fig.show()

Plot, for each PDF, the number of:  
- true positives (TP)
- true negatives (TN)
- false positives of kind 1 (FP1)
- false positives of kind 2 (FP2)
- false negatives (FN)
    

In [70]:
# Benchmarks to plot (one figure each) and the benchmark whose accuracies fix the PDF order
benchmark_list = ["benchmark_32", "benchmark_table_32"]
pdf_order = "benchmark_32"

# PDF names of the reference benchmark, sorted by ascending accuracy
reference_rows = df_per_pdf[df_per_pdf["benchmark_version"] == pdf_order]
pdf_order_list = reference_rows.sort_values(by="accuracy_vs_pdf", ascending=True)["pdf_name"].to_list()

# Legend labels for the five outcome counts
new_names_dict = {
    "tp_nb_vs_pdf": "True positives",
    "tn_nb_vs_pdf": "True negatives",
    "fp1_nb_vs_pdf": "False positives of kind 1",
    "fp2_nb_vs_pdf": "False positives of kind 2",
    "fn_nb_vs_pdf": "False negatives",
}
cols_to_show = list(new_names_dict.values())

for benchmark_version in benchmark_list:
    # Rows of this benchmark only, copied so the column assignments below are safe
    data_to_plot = df_per_pdf[df_per_pdf["benchmark_version"] == benchmark_version].copy()

    # Sort the rows following the reference PDF order
    data_to_plot['pdf_name'] = pd.Categorical(
        data_to_plot['pdf_name'], categories=pdf_order_list, ordered=True
    )
    data_to_plot = data_to_plot.sort_values("pdf_name")

    # Shorten the PDF names, then relabel the count columns for the legend
    data_to_plot['pdf_name'] = data_to_plot['pdf_name'].str.replace(
        r"_rpqsid_\d+|_cp\d+", "", regex=True
    )
    data_to_plot = data_to_plot.rename(columns=new_names_dict)

    # One horizontal bar per PDF, with one coloured segment per outcome count
    fig = px.bar(
        data_to_plot,
        x=cols_to_show,
        y="pdf_name",
        category_orders={"variable": cols_to_show},
        color_discrete_sequence=[px.colors.qualitative.Plotly[i] for i in [0, 5, 1, 6, 4]],
        title=f"Metrics per pdf for {benchmark_version}",
        width=800,
        height=900,
    )
    fig.show()



### Plot metrics per indicator

Get the metrics dataframe

In [32]:
# Per-indicator metrics table exposed by the metrics calculator; preview its first rows
df_per_ind = metrics_calc.df_per_indic
df_per_ind.head()

Unnamed: 0,benchmark_version,pdf_list_file,indicator,no_value_indic_nb_in_sispea,accuracy_vs_sispea,precision_vs_sispea,recall_vs_sispea,tp_nb_vs_sispea,tn_nb_vs_sispea,fp1_nb_vs_sispea,...,fn_nb_vs_sispea,no_value_indic_nb_in_pdf,accuracy_vs_pdf,precision_vs_pdf,recall_vs_pdf,tp_nb_vs_pdf,tn_nb_vs_pdf,fp1_nb_vs_pdf,fp2_nb_vs_pdf,fn_nb_vs_pdf
0,benchmark_4,rpqs_eval_list_1.csv,D201.0,0.0,0.533333,0.571429,0.533333,8.0,0.0,0.0,...,1.0,0.0,0.6,0.642857,0.6,9.0,0.0,0.0,5.0,1.0
1,benchmark_4,rpqs_eval_list_1.csv,D202.0,0.0,0.533333,0.571429,0.533333,8.0,0.0,0.0,...,1.0,1.0,0.533333,0.571429,0.571429,8.0,0.0,1.0,5.0,1.0
2,benchmark_4,rpqs_eval_list_1.csv,D203.0,2.0,0.333333,0.363636,0.307692,4.0,1.0,1.0,...,3.0,5.0,0.466667,0.363636,0.4,4.0,3.0,2.0,5.0,1.0
3,benchmark_4,rpqs_eval_list_1.csv,D204.0,0.0,0.066667,0.090909,0.066667,1.0,0.0,0.0,...,4.0,1.0,0.066667,0.090909,0.071429,1.0,0.0,1.0,9.0,4.0
4,benchmark_4,rpqs_eval_list_1.csv,P201.1,3.0,0.2,0.230769,0.25,3.0,0.0,3.0,...,2.0,5.0,0.4,0.384615,0.5,5.0,1.0,4.0,4.0,1.0


Plot the accuracy per indicator

In [69]:
# Choose the PDF set used in the calculation of metrics per indicator
pdf_list_file = 'rpqs_eval_list_1+2.csv'
# Choose which benchmark versions the metrics should be calculated for
benchmark_list = ["benchmark_32", "benchmark_table_32"]
# Choose the benchmark version fixing indicator order (indicators are sorted by ascending accuracy)
indic_order = "benchmark_32"

# Select data to plot.
# Take an explicit copy: the 'indicator' column is reassigned below, and writing to a
# slice of df_per_ind would raise pandas' SettingWithCopyWarning.
data_to_plot = df_per_ind[
    (df_per_ind["pdf_list_file"] == pdf_list_file)
    & (df_per_ind["benchmark_version"].isin(benchmark_list))
].copy()

# Order indicator by accuracy (as measured for the reference benchmark `indic_order`)
indic_order_list = (data_to_plot[data_to_plot["benchmark_version"]==indic_order]
                  .sort_values(by="accuracy_vs_pdf", ascending=True)["indicator"]
                  .to_list())
data_to_plot['indicator'] = pd.Categorical(data_to_plot['indicator'], categories=indic_order_list, ordered=True)
data_to_plot = data_to_plot.sort_values("indicator")

# Define color order (benchmarks sorted by the integer ending their name)
benchmark_order_list = data_to_plot["benchmark_version"].unique().tolist()
benchmark_order_list.sort(key=lambda x: int(x.split("_")[-1]))
# Only needed by the commented-out color_discrete_sequence option below
benchmark_nb_order_list = [int(x.split("_")[-1]) for x in benchmark_order_list]

# Plot
#for tag in ["vs_sispea", "vs_pdf"]:
for tag in ["vs_pdf"]:
    fig = px.bar(data_to_plot, 
                y="indicator", 
                x="accuracy_"+tag, 
                color="benchmark_version", 
                category_orders={"benchmark_version": benchmark_order_list},
                #color_discrete_sequence=[px.colors.qualitative.Plotly[x%10] for x in benchmark_nb_order_list],
                height=600, #600,
                width=630, #800,
                barmode='group',
                title=f"Accuracy per indicator for {pdf_list_file} ({tag})")
    
    fig.layout.xaxis.tickformat = ',.0%'
    fig.update(layout_xaxis_range = [0,1], layout_xaxis_title="Accuracy")
    fig.show()

Plot the number of true positives (TP), true negatives (TN), false positives of kind 1 (FP1) and 2 (FP2) and false negatives (FN)

In [77]:
# Choose the PDF set used in the calculation of metrics per indicator
pdf_list_file = 'rpqs_eval_list_1+2.csv'
# Choose which benchmark versions the metrics should be calculated for
benchmark_list = ["benchmark_32", "benchmark_table_32"]
# Choose the benchmark version fixing indicator order (indicators are sorted by ascending accuracy)
indic_order = "benchmark_32"

# Fix indicator order by accuracy
indic_order_list = (df_per_ind
                    .query("benchmark_version==@indic_order")
                    .query("pdf_list_file==@pdf_list_file") 
                    .sort_values(by="accuracy_vs_pdf", ascending=True) 
                    ["indicator"].to_list()
                    )

# Loop-invariant plot settings: legend labels, plotted columns and colours
new_names_dict = {
    "tp_nb_vs_pdf": "True positives", 
    "tn_nb_vs_pdf": "True negatives", 
    "fp1_nb_vs_pdf": "False positives of kind 1", 
    "fp2_nb_vs_pdf" : "False positives of kind 2", 
    "fn_nb_vs_pdf": "False negatives"
}
cols_to_show = list(new_names_dict.values())
outcome_colors = [px.colors.qualitative.Plotly[i] for i in [0, 5, 1, 6, 4]]

# PDF list name without its ".csv" extension, for the figure titles.
# Computed outside the f-string: reusing double quotes inside a double-quoted
# f-string is a SyntaxError before Python 3.12.
pdf_list_label = pdf_list_file.removesuffix(".csv")

for benchmark_version in benchmark_list:
    # Select data to plot
    data_to_plot = df_per_ind[df_per_ind["pdf_list_file"]==pdf_list_file]
    data_to_plot = data_to_plot[data_to_plot["benchmark_version"]==benchmark_version]

    # Order indicator by accuracy (copy first so the column assignment is safe)
    data_to_plot = data_to_plot.copy()
    data_to_plot['indicator'] = pd.Categorical(data_to_plot['indicator'], categories=indic_order_list, ordered=True)
    data_to_plot = data_to_plot.sort_values("indicator")

    # Rename columns
    data_to_plot = data_to_plot.rename(columns=new_names_dict)

    # Plot
    fig = px.bar(data_to_plot, 
                    y="indicator", 
                    x=cols_to_show,  
                    category_orders={"variable": cols_to_show},
                    #color_discrete_sequence=[px.colors.qualitative.G10[i] for i in [0, 5, 1, 2, 4]],
                    color_discrete_sequence=outcome_colors,
                    height=500,
                    width=630,
                    title=f"Metrics per indicator for {benchmark_version} and {pdf_list_label}"
                    )

    fig.show()