# Convert the GPT Q&A corpus

In [2]:
import os
import pandas as pd
from pprint import pprint

In [3]:
# Build the corpus-results frame from the xlsx export

basedir = "../../data/"
filename = "gpt_corpus-20kq&A.xlsx"
sheet = "results"
# Read spreadsheet columns A..I of the selected sheet, then discard rows
# in which every cell is empty.
df = pd.read_excel(
    f"{basedir}{filename}",
    sheet_name=sheet,
    usecols="A:I",
).dropna(how="all")
df.head()

Unnamed: 0,question,answer,prompt,token_answer,tokens_prompt,time_taken,urls_contexts,score_retrieving,chunks_index
0,Quelles sont les conditions requises pour béné...,Le supplément de loyer de solidarité (SLS) est...,Voici plusieurs références sur un même thème :...,321,2894,10.92,https://www.service-public.fr/particuliers/vos...,65.44\n63.40\n61.69,6860\n3844\n7721
1,Comment est calculé le surloyer pour les locat...,Le calcul du surloyer pour les locataires rési...,Voici plusieurs références sur un même thème :...,127,3394,6.61,https://www.service-public.fr/particuliers/vos...,68.72\n64.41\n64.04,5694\n12599\n7729
2,Quels sont les critères de revenus maximums po...,Pour être éligible au prêt locatif intermédiai...,Voici plusieurs références sur un même thème :...,343,3284,11.62,https://www.service-public.fr/particuliers/vos...,72.03\n66.38\n63.23,0\n3053\n3602
3,Quels documents sont nécessaires pour la régul...,Pour régulariser le supplément de loyer en cas...,Voici plusieurs références sur un même thème :...,452,3484,12.71,https://www.service-public.fr/professionnels-e...,73.26\n73.25\n72.84,13404\n9039\n2
4,Quelle est la différence entre le supplément d...,La différence entre le supplément de loyer de ...,Voici plusieurs références sur un même thème :...,542,3215,14.87,https://www.service-public.fr/particuliers/vos...,93.16\n91.38\n90.70,9070\n6860\n6


In [4]:
# Persist the cleaned frame as CSV (without the index column) so later cells can reload it.
df.to_csv('corpus-results.csv', index=False)

In [5]:
# pprint(df.iloc[0].to_dict())

# Inference utils

In [5]:
# Inference utils

import pandas as pd
import time
import os


def extract_substring(text, start_tag, end_tag, include_tag=False):
    """Return the part of ``text`` delimited by ``start_tag`` and ``end_tag``.

    The first occurrence of ``start_tag`` is located, then the first
    occurrence of ``end_tag`` that comes *after* the start tag.

    Args:
        text: String to search in.
        start_tag: Opening delimiter.
        end_tag: Closing delimiter.
        include_tag: When True the returned substring contains both
            delimiters; when False only the text between them.

    Returns:
        The extracted substring, or ``""`` if either delimiter is not found.
    """
    start_index = text.find(start_tag)
    if start_index == -1:
        return ""

    # Look for the end tag only past the start tag itself; otherwise an end
    # tag that also occurs inside the start tag (e.g. identical delimiters)
    # would be matched immediately and yield an empty result.
    content_start = start_index + len(start_tag)
    end_index = text.find(end_tag, content_start)
    if end_index == -1:
        return ""

    if include_tag:
        return text[start_index:end_index + len(end_tag)]
    return text[content_start:end_index]
    


# Save answers

In [6]:
import hashlib
import pandas as pd

# Reload the corpus saved earlier and list its columns as a sanity check.
res = pd.read_csv('corpus-results.csv')
res.columns.tolist()  

['question',
 'answer',
 'prompt',
 'token_answer',
 'tokens_prompt',
 'time_taken',
 'urls_contexts',
 'score_retrieving',
 'chunks_index']

In [7]:
# Build the evaluation corpus from the answer .txt files (one file per question index).
eval_corpus = []
with_prompt = False
version = "v4"
ext = ""

for i, x in res.iterrows():
    question = x["question"]
    answer_xgen = x["answer"]
    # The reference URLs are stored whitespace-separated in a single cell.
    references = x["urls_contexts"].split()
    # Short question id: hex form of a 4-byte BLAKE2b digest of the question text.
    # Named `qid` (not `hash`) so the builtin hash() is not shadowed in the kernel.
    qid = hashlib.blake2b(question.encode(), digest_size=4).hexdigest()

    with open(f"answers/llama_ref_{version}{ext}/{i}.txt") as f:
        a_ref = f.read()

    with open(f"answers/llama_ref_{version}{ext}_sample/{i}.txt") as f:
        a_ref_sample = f.read()

    # Pull the three reference lines ("référence 0/1/2" up to and including
    # the end of line) out of the original prompt.
    prompt = x["prompt"]
    ref1 = extract_substring(prompt, "référence 0", "\n", include_tag=True)
    ref2 = extract_substring(prompt, "référence 1", "\n", include_tag=True)
    ref3 = extract_substring(prompt, "référence 2", "\n", include_tag=True)

    item = {
        "qid": qid,
        "question": question,
        "answer_xgen": answer_xgen,
        "references": references,
        "references_raw": [ref1, ref2, ref3],
        "answer_llama_ref": a_ref,
        "answer_llama_ref_sample": a_ref_sample,
    }
    if with_prompt:
        p = "Voici une liste de références :\n>>REFERENCES<<\n" + ref1 + ref2 + ref3 + "\n" + ">>QUESTION<<\n" + question
        # Store the prompt as a plain string; a trailing comma on this
        # assignment would wrap it in a 1-tuple and serialize it as a JSON list.
        item['prompt'] = p

    eval_corpus.append(item)
    # Only the rows with index labels 0..49 are kept.
    if i == 49:
        break

fn = f"corpus-eval-with-prompt-{version}.json" if with_prompt else f"corpus-eval-{version}.json"
pd.DataFrame(eval_corpus).to_json(fn, orient="records", indent=2, force_ascii=False)

In [63]:
#print(res.iloc[0]["prompt"])

In [23]:
# Mean number of whitespace-separated words per prompt.
res["prompt"].apply(lambda x: len(x.split())).mean()

1486.2720375024164

# Quantitative Evaluation

In [47]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop

# TF-IDF feature extraction
# --

# Get documents from chunks: the "data" field of every record in the JSON file.
with open("xmlfiles_as_chunks.json") as f:
    documents = [x["data"] for x in json.load(f)]

# Initialize the TF-IDF vectorizer.
# stop_words is passed as a list: recent scikit-learn releases validate this
# parameter and reject other collection types (spaCy exposes STOP_WORDS as a set).
# NOTE(review): token_pattern only matches ASCII letters and underscores, so
# words containing accented characters never become features — confirm this
# is intended for French text.
vectorizer = TfidfVectorizer(stop_words=list(fr_stop), min_df=0.05, max_df=0.9,
                             token_pattern=r'\b[A-Za-z_][A-Za-z_]+\b')

# Compute the TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(documents)

# Get the feature names (words).
# get_feature_names() no longer exists in current scikit-learn; prefer its
# replacement and fall back to the old name on installations that predate it.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

# Function to extract the top n most salient words from a document
def extract_top_words(document, n=3):
    """Return up to ``n`` vocabulary words with the highest TF-IDF score in ``document``.

    Relies on the module-level ``vectorizer`` (already fitted) and
    ``feature_names`` defined in this cell.

    Args:
        document: Raw text to score.
        n: Maximum number of words to return.

    Returns:
        A list of words ordered by decreasing TF-IDF score. Words whose score
        is zero are left out, so the list may hold fewer than ``n`` items
        (and is empty when the document shares no term with the vocabulary).
    """
    # Dense 1-D array of TF-IDF scores for this single document.
    scores = vectorizer.transform([document]).toarray()[0]

    # Indices of the n best-scoring features, highest first.
    ranked = scores.argsort()[::-1][:n]

    # A zero score means the word does not occur in the document at all;
    # reporting it as "salient" would be arbitrary, so drop it.
    return [feature_names[idx] for idx in ranked if scores[idx] > 0]

In [55]:
# Draw Evaluation table
# --

corpus = pd.read_json("corpus-eval-v4.json")
res = {}
for i, x in corpus.iterrows():
    cols = [
        ("Xgen", x["answer_xgen"]),
        ("Llama sample+rep-penalty", x["answer_llama_base_ref"]),
        ("Llama rep-penaly", x["answer_llama_ref"]),
        ("Llama sample", x["answer_llama_ref_sample"]),
        ("Llama beams", x["answer_llama_ref_beams"]),
    ]
    for title, answer in cols:
        words = answer.split()
        data = res.setdefault(title, {})

        # Answer length (number of whitespace-separated words)
        data.setdefault("size", []).append(len(words))

        # No results: at most one "." in the answer and the word "pas" present
        data.setdefault("no result", []).append(
            len(answer.split(".")) in [1, 2] and "pas" in words
        )

        # Reference mention: the word "références" appears in the answer
        data.setdefault("ref mention", []).append("références" in words)

        # Ratio of best words
        #...

# Summarise every metric list per model: counts for the boolean metrics,
# "mean ± std" for the others.
summary_data = {}
for outer_key, inner_dict in res.items():
    for inner_key, inner_list in inner_dict.items():
        if inner_key in ['no result', "ref mention"]:
            # Count occurrences
            cell = str(sum(inner_list))
        else:
            cell = f"{np.mean(inner_list):.0f} ± {np.std(inner_list):.0f}"
        summary_data.setdefault(inner_key, []).append(cell)


# Create a pandas DataFrame from the summary_data dictionary (one row per model)
df = pd.DataFrame(summary_data, index=res.keys())

# Styler helper: colour the largest and the smallest value of a column
def highlight_max_min(s):
    """Return one CSS string per entry of ``s``, marking its max and min.

    Each entry is a string that is either a plain number or of the form
    "mean ± std"; only the part before " ± " is compared. Entries equal to
    the maximum get a light-blue background, entries equal to the minimum
    (and not the maximum) a light-red one, all others an empty style.
    """
    means = s.str.split(' ± ').str[0].astype(float)
    highest = means.max()
    lowest = means.min()

    styles = []
    for value in means:
        if value == highest:
            styles.append('background-color: #dbf1f9')
        elif value == lowest:
            styles.append('background-color: #eadddd')
        else:
            styles.append('')
    return styles

# Run the highlighter through the pandas Styler and display the styled table.
styled_df = df.style.apply(highlight_max_min)
styled_df

Unnamed: 0,size,no result,ref mention
Xgen,195 ± 87,1,5
Llama sample+rep-penalty,126 ± 59,1,11
Llama rep-penaly,91 ± 55,1,4
Llama sample,116 ± 68,2,9
Llama beams,121 ± 69,4,9


# Qualitative Evaluation

In [48]:
import IPython.display as display
import pandas as pd

# Load the evaluation corpus whose answers are rendered side by side below.
corpus = pd.read_json("corpus-eval-v4.json")

def draw(view):
    """Render the HTML string ``view`` as rich output in the notebook."""
    rendered = display.HTML(view)
    display.display(rendered)

def draw_cols(cols, with_refs=False):
    """Render one question with its answers laid out side by side as HTML.

    Besides its arguments, this function reads the notebook globals ``qid``,
    ``i`` and ``question`` (and ``x`` when ``with_refs`` is True), so it must
    be called from a loop that sets them.

    Args:
        cols: Sequence of ``(title, answer)`` pairs, one per column. The
            top-word summary is computed from the answer of the second
            column (``cols[1]``), so at least two columns are expected.
        with_refs: When True, append the raw references of the current row.
    """
    import html  # local import so the cell does not depend on another cell's imports

    col_len = 100 // len(cols)

    # Titles and answers are plain text: escape them so characters such as
    # "<" or "&" are displayed literally instead of being parsed as markup.
    columns_html = ''.join(
        f'<div style="width: {col_len}%; padding:10px;"> '
        f'<div style="text-align:center;color:blue;">{html.escape(str(title))}</div> '
        f'<pre class="qq">{html.escape(str(answer))}</pre></div>'
        for title, answer in cols
    )

    refs_html = ''
    if with_refs:
        separator = "=" * 100
        refs = "<br>".join(html.escape(str(ref)) for ref in x["references_raw"])
        refs_html = f'''<br><br>{separator}<br><br>
                 <pre>REFERENCES:<br><br>{refs}
                 </pre>'''

    # "flex-start" aligns the columns at the top ("top" is not a valid
    # align-items value); "&nbsp;" needs its terminating semicolon.
    draw(f'''
        <style>
            .qq {{
                word-break: keep-all;
                white-space: pre-wrap;
            }}
        </style>

        <div style="border:1px solid grey;padding:5px;border-radius:8px;">
            <div>ID: {qid} &nbsp; Q: {i}</div>
            <h5 style="font-weight: bold;">{html.escape(str(question))}</h5>
            <div style="display:flex; align-items:flex-start;">'''
         + columns_html
         + '</div>'
         + str(extract_top_words(cols[1][1], 5))
         + refs_html
         + '</div>')
    draw("<br>")
   

# (column title, corpus column) pairs, in display order.
column_spec = [
    #("Xgen", "answer_xgen"),
    ("Llama sample+rep-penalty", "answer_llama_base_ref"),
    ("Lama rep-penaly", "answer_llama_ref"),
    ("Llama sample", "answer_llama_ref_sample"),
    ("Llama beams", "answer_llama_ref_beams"),
]

# i, x, qid and question are read as globals by draw_cols, so they keep these names.
for i, x in corpus.iterrows():
    qid = x["qid"]
    question = x["question"]

    cols = [(title, x[key]) for title, key in column_spec]

    draw_cols(cols, with_refs=False)