# Notebook to test the module qamodel

In [2]:
import pandas as pd
import os
import sys
from time import time 

sys.path.append("../../")    # Add the path to the root directory (where we can find the folder .git)

%load_ext autoreload
%autoreload 2 

from narval.utils import get_data_dir
from narval.pdfreader import PDFReader
from narval.pagefinder import PageFinder
from narval.qamodel import T5QuestionAnswering, Llama3QuestionAnswering



  machar = _get_machar(dtype)
  from .autonotebook import tqdm as notebook_tqdm


### Extract text

In [3]:
# Locate the input report under the project's data directory.
data_dir = get_data_dir()
file_path, file_name = "/data/input/pdfs/", "RPQS_SIDEALF_AC_2021.pdf"
FILE_PATH = "".join([data_dir, file_path, file_name])

# Read the PDF; `pages` is indexed by page number in the cells below.
pdf_reader = PDFReader(FILE_PATH)
pages = pdf_reader.textpages

### Find relevant pages for each question

In [5]:
# Build the page finder from the question/keyword table for the chosen
# competence, then look up which pages of the report match each question.
data_dir = get_data_dir()
question_keyword_path = f"{data_dir}/data/input/question_keyword_malou.csv"

competence = "assainissement collectif"
pagefinder = PageFinder(question_keyword_path, competence)
pages_df = pagefinder.extract_relevant_pages(pages)

pages_df.head()

Unnamed: 0,indicator,question,keyword_regex,relevant_pages
0,D203.0,Quelle est la valeur de l'indicateur D203.0,\bD203.0s?\b,"[1, 10]"
1,D203.0,Quelle est la quantité de boues évacuées (D203.0),\b bouess?\b,"[1, 2, 10, 14, 15, 18, 19]"
2,D204.0,Quelle est la valeur de l'indicateur D204.0,\bD204.0s?\b,"[1, 13]"
3,D204.0,Quel est le prix du service au m3 pour l'assin...,\bprixs?\b,"[0, 13, 14, 22]"
4,D204.0,Quel est le prix du service au m3 pour l'assin...,\bprixs?\b,"[0, 13, 14, 22]"


### Predict an answer with the T5 model

Choose a question

In [6]:
# Pick one row of `pages_df` (one question) to experiment with.
question_idx = 22
# Name the unused column instead of binding `_`: in IPython, assigning to `_`
# stops the shell from updating its "last output" variable.
indic, question, keyword_regex, relevant_pages_list = pages_df.loc[question_idx]

In [7]:
# Show the selected question and the pages flagged as relevant for it.
for label, value in (("question", question), ("relevant pages", relevant_pages_list)):
    print(f"{label} = ", value)

question =   Quel est le nombre de points noirs réseau (P252.2)
relevant pages =  [1, 19]


Define prompt parameters

In [15]:
# Values substituted into the prompt template, and the name of the template to use.
year = "2021"
prompt_params = dict(year=year)
prompt_version = "T5_prompt_v1"

Predict an answer

In [9]:
# Instantiate the project's T5 question-answering wrapper with the "google/flan-t5-base" checkpoint.
t5_model = T5QuestionAnswering(model_name="google/flan-t5-base")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Device =  cpu


In [16]:
# Ask the T5 model the selected question once per relevant page and collect
# the answers in `answer_list` (same order as `relevant_pages_list`).
separator = "#" * 20  # hoisted: nesting "#" inside a double-quoted f-string needs Python >= 3.12
answer_list = []
for page_num in relevant_pages_list:
    context = pages[page_num]
    # Build a fresh parameter dict for this page instead of mutating the shared
    # `prompt_params`, so that re-running the cell does not depend on what a
    # previous run left in it.
    page_prompt_params = {**prompt_params, "context": context, "question": question}
    prompt = t5_model.format_prompt(
        page_prompt_params,
        prompt_version
    )
    # Show the prompt before predicting (it used to be printed twice per page).
    print(f"Prompt for {page_num}:\n{separator}\n{prompt}\n")
    answer = t5_model.predict(prompt)
    answer_list.append(answer)

    print(f"Answer from page {page_num}:\n{separator}\n{answer}\n")

Prompt for 1:
####################

Contexte: Table des mat ières
1. Caractérisation technique du service ................................ ................................ ................................  1 
1.1. Présentation du territoire desservi  ................................ ................................ ............................  1 
1.2. Mode de gestion du service  ................................ ................................ ................................ ..... 1 
1.3. Estimation de la population desservie (D201. 0) ................................ ................................ ...... 2 
1.4. Nombre d’abonnés  ................................ ................................ ................................ ...................  2 
1.5. V olumes facturés  ................................ ................................ ................................ ......................  3 
1.6. Détail des imports et exports d’effluents  ................................ ...

In [17]:
# Display the answers collected by the loop above (one entry per relevant page).
answer_list

['3.9.', '0 par 10 km']

$\rightarrow$ the answer is there. How to identify it? Needs to be in units of number/100km

Try by concatenating the 2 pages

In [130]:
# Concatenate all relevant pages into a single context and ask the question once.
bigcontext = " ".join(pages[page_num] for page_num in relevant_pages_list)
# Same calling convention as the per-page loop: context and question travel
# inside the parameter dict, followed by the prompt version.
prompt = t5_model.format_prompt(
    {**prompt_params, "context": bigcontext, "question": question},
    prompt_version,
)
answer = t5_model.predict(prompt)
print(answer)


Number of input tokens =  1722
0 par 10 km


$\rightarrow$ OK, it works. That's unexpected because the max number of tokens (512) is exceeded for bigcontext+question.  
Check with

In [131]:
# Inspect the two length-related settings stored in the model configuration.
t5_config = t5_model.model.config
print(f"Max length from model config: {t5_config.max_length}")
print(f"Max position embeddings: {t5_config.n_positions}")

Max length from model config: 20
Max position embeddings: 512


Try by giving the full text (just a test) $\rightarrow$ out of memory error

In [132]:
# Stress test: use every page of the report as context.
# NOTE(review): the recorded run of this cell was interrupted by hand; expect it
# to be very slow or to exhaust memory.
fullcontext = " ".join(pages[page_num] for page_num in range(len(pages)))
# Same calling convention as the per-page loop: context and question travel
# inside the parameter dict, followed by the prompt version.
prompt = t5_model.format_prompt(
    {**prompt_params, "context": fullcontext, "question": question},
    prompt_version,
)
answer = t5_model.predict(prompt)
print(answer)

Number of input tokens =  13543


KeyboardInterrupt: 

Try with another question by concatenating relevant pages


In [133]:
question_idx = 0 # choose a question

# The regex column is unused here; it is named (rather than bound to `_`) so
# that IPython keeps updating its "last output" variable.
indic, question, keyword_regex, relevant_pages_list = pages_df.loc[question_idx]

# Concatenate the relevant pages and query the model once, with a short answer budget.
bigcontext = " ".join(pages[page_num] for page_num in relevant_pages_list)
# Same calling convention as the per-page loop: context and question travel
# inside the parameter dict, followed by the prompt version.
prompt = t5_model.format_prompt(
    {**prompt_params, "context": bigcontext, "question": question},
    prompt_version,
)
answer = t5_model.predict(prompt, model_max_length=10)

print("question = ", question)
print("answer = ", answer)

Number of input tokens =  1613
question =   Quelle est la valeur de l'indicateur D203.0
answer =  142.760 t 216.


In [134]:
question_idx = 2 # choose a question

# The regex column is unused here; it is named (rather than bound to `_`) so
# that IPython keeps updating its "last output" variable.
indic, question, keyword_regex, relevant_pages_list = pages_df.loc[question_idx]

# Concatenate the relevant pages and query the model once, with a short answer budget.
bigcontext = " ".join(pages[page_num] for page_num in relevant_pages_list)
# Same calling convention as the per-page loop: context and question travel
# inside the parameter dict, followed by the prompt version.
prompt = t5_model.format_prompt(
    {**prompt_params, "context": bigcontext, "question": question},
    prompt_version,
)
answer = t5_model.predict(prompt, model_max_length=10)

print("question = ", question)
print("answer = ", answer)

Number of input tokens =  2022
question =   Quelle est la valeur de l'indicateur D204.0
answer =  t ype


$\rightarrow$ wrong!

#### Test with T5-xl

In [135]:
# Rebind `t5_model` to the larger "google/flan-t5-xl" checkpoint; the cells below use this instance.
t5_model = T5QuestionAnswering(model_name="google/flan-t5-xl")

Downloading shards: 100%|██████████| 2/2 [01:25<00:00, 42.97s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.20it/s]
Some parameters are on the meta device because they were offloaded to the disk and cpu.


Device =  cpu


In [136]:
question_idx = 2 # choose a question

# The regex column is unused here; it is named (rather than bound to `_`) so
# that IPython keeps updating its "last output" variable.
indic, question, keyword_regex, relevant_pages_list = pages_df.loc[question_idx]
print("Explored pages are ", relevant_pages_list)
print("question = ", question)

# Concatenate the relevant pages and query the model once, with a short answer budget.
bigcontext = " ".join(pages[page_num] for page_num in relevant_pages_list)
# Same calling convention as the per-page loop: context and question travel
# inside the parameter dict, followed by the prompt version.
prompt = t5_model.format_prompt(
    {**prompt_params, "context": bigcontext, "question": question},
    prompt_version,
)
answer = t5_model.predict(prompt, model_max_length=10)

print("answer = ", answer)



Token indices sequence length is longer than the specified maximum sequence length for this model (2022 > 512). Running this sequence through the model will result in indexing errors


Explored pages are  [1, 13]
question =   Quelle est la valeur de l'indicateur D204.0
Number of input tokens =  2022


KeyboardInterrupt: 

#### Investigate tokenization

In [13]:
# Tokenize a short "context + question" string with a deliberately tiny
# max_length to observe how the tokenizer truncates a single-sequence input.
#input_text = "Je vais au parc et c'est cool."

test_context = "Je vais au parc pour faire du vélo et il va faire beau."
test_question = "Tu vas où? Je ne sais pas si je viens avec toi."
input_text = "context: " + test_context + " question: " + test_question

tokenizer_options = dict(
    max_length=5,             # Max length for the entire input (context+question)
    truncation="only_first",  # alternatives tried: True, "only_second"
    padding=False,
    return_tensors="pt",
)
token_ids = t5_model.tokenizer(input_text, **tokenizer_options)

# A single sequence was encoded, so only the first row of ids is inspected.
input_ids = token_ids["input_ids"][0]
tokens = t5_model.tokenizer.convert_ids_to_tokens(input_ids)

print(tokens)
print("Number of tokens = ", len(input_ids))

['▁context', ':', '▁Je', '▁vais', '</s>']
Number of tokens =  5


$\rightarrow$ truncation="only_second" does not truncate anything since no pair of sequences is given as input.  
However truncation="only_first" or truncation=True works as expected.  
So in the example above, there was no truncation and the model worked (ie found the correct answer with a big context).  
Weird ... what is the maximum context size for this model?

In [44]:
# Count the tokens of the concatenated context plus the year-qualified question,
# without any truncation.
# `full_question` is built here (same format as in the Llama section further
# down) so that this cell no longer depends on a cell that appears later.
full_question = f"{question} en {year} ?"
input_text = bigcontext + full_question

token_ids = t5_model.tokenizer(
            input_text,
            truncation=False,
            padding=False,
            return_tensors="pt",
        )

# A single sequence was encoded, so only the first row of ids is inspected.
input_ids = token_ids['input_ids'][0]
tokens = t5_model.tokenizer.convert_ids_to_tokens(input_ids)
print(tokens)

print("Number of tokens = ", len(input_ids))


['▁Table', '▁des', '▁mat', '▁', 'ières', '▁1.', '▁Car', 'act', 'éri', 's', 'ation', '▁technique', '▁du', '▁service', '▁', '................', '................', '▁', '................', '................', '▁', '................', '................', '▁1', '▁', '1.1', '.', '▁Pré', 'sent', 'ation', '▁du', '▁territoire', '▁des', 'serv', 'i', '▁', '................', '................', '▁', '................', '................', '▁', '....', '................', '........', '▁1', '▁', '1.2', '.', '▁Mode', '▁de', '▁gestion', '▁du', '▁service', '▁', '................', '................', '▁', '................', '................', '▁', '................', '................', '▁', '.....', '▁1', '▁', '1.3', '.', '▁Estimat', 'i', 'on', '▁de', '▁la', '▁population', '▁des', 'serv', 'i', 'e', '▁(', 'D', '201', '.', '▁', '0', ')', '▁', '................', '................', '▁', '................', '................', '▁', '......', '▁2', '▁', '1.4', '.', '▁N', 'ombre', '▁', 'd', '’', 'a', '

### Basic tests with the Llama3.1 model

Choose a question

In [4]:
# Read the fields by column name: a 3-value positional unpack breaks on the
# 4-column `pages_df` built at the top of the notebook (it has an extra
# `indicator` column), and binding `_` would shadow IPython's last output.
# NOTE(review): row 0 was the "points noirs" question when this cell was last
# run; with the table displayed at the top of the notebook that question sits
# at another index -- confirm the index.
selected_row = pages_df.loc[0]
question = selected_row["question"]
relevant_pages_list = selected_row["relevant_pages"]

In [5]:
# Show the selected question and the pages flagged as relevant for it.
for label, value in (("question", question), ("relevant pages", relevant_pages_list)):
    print(f"{label} = ", value)

question =   Quel est le nombre de points noirs réseau (P252.2)
relevant pages =  [1, 19]


Complete the question with the year

In [6]:
# Append the report year to the selected question.
year = "2021"
full_question = "{} en {} ?".format(question, year)
print("full question = ", full_question)

full question =   Quel est le nombre de points noirs réseau (P252.2) en 2021 ?


Check the version of the transformers package

In [19]:
# Print the installed transformers version to compare it with the minimum noted below.
import transformers
print(transformers.__version__)       # Must be >=4.43.0

4.45.0


Load the model

In [9]:
from huggingface_hub import login

# Read the Hugging Face access token from the HF_TOKEN environment variable
# (raises KeyError if it is not set) so the token never appears in the notebook.
hf_token = os.environ["HF_TOKEN"]
login(token = hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/onyxia/.cache/huggingface/token
Login successful


In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [23]:

## Load Llama model and tokenizer
#model_name = "meta-llama/Meta-Llama-3-8B"  
#tokenizer = AutoTokenizer.from_pretrained(model_name)
#model = AutoModelForCausalLM.from_pretrained(model_name)

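## Create a pipeline for question answering
## (kept for reference only: as the warning in the output below shows, the
## "question-answering" task does not support LlamaForCausalLM)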
#llama_pipeline = pipeline("question-answering", \
#    model=model, tokenizer=tokenizer, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto")



Loading checkpoint shards: 100%|██████████| 4/4 [00:48<00:00, 12.12s/it]
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
The model 'LlamaForCausalLM' is not supported for question-answering. Supported models are ['AlbertForQuestionAnswering', 'BartForQuestionAnswering', 'BertForQuestionAnswering', 'BigBirdForQuestionAnswering', 'BigBirdPegasusForQuestionAnswering', 'BloomForQuestionAnswering', 'CamembertForQuestionAnswering', 'CanineForQuestionAnswering', 'ConvBertForQuestionAnswering', 'Data2VecTextForQuestionAnswering', 'DebertaForQuestionAnswering', 'DebertaV2ForQuestionAnswering', 'DistilBertForQuestionAnswering', 'ElectraForQuestionAnswering', 'ErnieForQuestionAnswering', 'ErnieMForQuestionAnswering', 'FalconForQuestionAnswering', 'FlaubertForQuestionAnsweringSimple', 'FNetForQuestionAnswering', 'FunnelForQuestionAnswering', 'GPT2ForQuestionAnswering', 'GPTNeoForQuestionAnswering', 'G

$\rightarrow$ Downloading the model "meta-llama/Meta-Llama-3-8B" from HuggingFace on the SSPCloud takes roughly 4 minutes ...

In [20]:

# Build a text-generation pipeline around the Llama 3 8B Instruct checkpoint,
# loading the weights in bfloat16 and letting `device_map="auto"` place them.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

llama_pipeline_options = dict(
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)
llama_pipeline = transformers.pipeline("text-generation", **llama_pipeline_options)


Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.94it/s]
Some parameters are on the meta device because they were offloaded to the cpu.


Test the default example

In [36]:

# Smoke test: a two-message chat (system persona + user question).
system_message = {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"}
user_message = {"role": "user", "content": "Who are you?"}
messages = [system_message, user_message]

# Generation stops on either the tokenizer's EOS token or the "<|eot_id|>" token.
llama_tokenizer = llama_pipeline.tokenizer
terminators = [
    llama_tokenizer.eos_token_id,
    llama_tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

generation_options = dict(
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
outputs = llama_pipeline(messages, **generation_options)

# The last entry of the returned conversation is the newly generated message.
print(outputs[0]["generated_text"][-1])

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


{'role': 'assistant', 'content': "Arrrr, me hearty! Me name be Captain Chat, the scurviest pirate chatbot to ever sail the seven seas... er, chat with ye landlubbers! Me and me trusty parrot, Polly, be here to swab the decks of yer conversations and keep ye entertained with me pirate-tastic responses! So hoist the colors, me hearty, and let's set sail fer a swashbucklin' good time!"}


Test with our question and context (obtained by concatenating all pages that are relevant for that question)

In [21]:
# Join the text of every relevant page into one space-separated context string.
bigcontext = " ".join(pages[page_num] for page_num in relevant_pages_list)

In [22]:
# Ask Llama the year-qualified question, passing the concatenated relevant
# pages as the system message.
bigcontext = " ".join(pages[page_num] for page_num in relevant_pages_list)

system_message = {"role": "system", "content": bigcontext}
user_message = {"role": "user", "content": full_question}
messages = [system_message, user_message]

# Generation stops on either the tokenizer's EOS token or the "<|eot_id|>" token.
llama_tokenizer = llama_pipeline.tokenizer
terminators = [
    llama_tokenizer.eos_token_id,
    llama_tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

generation_options = dict(max_new_tokens=50, eos_token_id=terminators)
outputs = llama_pipeline(messages, **generation_options)

print(f"Full question = {full_question}")
# The last entry of the returned conversation is the newly generated message.
print(outputs[0]["generated_text"][-1])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Full question =  Quel est le nombre de points noirs réseau (P252.2) en 2021 ?
{'role': 'assistant', 'content': "Selon le document, le nombre de points noirs du réseau de collecte (P252.2) pour l'exercice 2021 est de 0 par 10 km de réseau."}


In [23]:
# Display the raw pipeline result (the whole conversation, including the long system context).
outputs

[{'generated_text': [{'role': 'system',
    'content': "Table des mat ières\n1. Caractérisation technique du service ................................ ................................ ................................  1 \n1.1. Présentation du territoire desservi  ................................ ................................ ............................  1 \n1.2. Mode de gestion du service  ................................ ................................ ................................ ..... 1 \n1.3. Estimation de la population desservie (D201. 0) ................................ ................................ ...... 2 \n1.4. Nombre d’abonnés  ................................ ................................ ................................ ...................  2 \n1.5. V olumes facturés  ................................ ................................ ................................ ......................  3 \n1.6. Détail des imports et exports d’effluents  ...................

$\rightarrow$ gives the correct answer but very slow compared to T5 (45s vs 5s)

Test with our question and the full document

In [31]:
# Stress test: give Llama the whole report as the system message.
# NOTE(review): the recorded run of this cell ended with a CUDA out-of-memory error.
fullcontext = " ".join(pages[page_num] for page_num in range(len(pages)))

system_message = {"role": "system", "content": fullcontext}
user_message = {"role": "user", "content": full_question}
messages = [system_message, user_message]

# Generation stops on either the tokenizer's EOS token or the "<|eot_id|>" token.
llama_tokenizer = llama_pipeline.tokenizer
terminators = [
    llama_tokenizer.eos_token_id,
    llama_tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

generation_options = dict(max_new_tokens=50, eos_token_id=terminators)
outputs = llama_pipeline(messages, **generation_options)

print(f"Full question = {full_question}")
# The last entry of the returned conversation is the newly generated message.
print(outputs[0]["generated_text"][-1])

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


OutOfMemoryError: CUDA out of memory. Tried to allocate 11.41 GiB. GPU 0 has a total capacity of 14.58 GiB of which 431.56 MiB is free. Process 883028 has 14.15 GiB memory in use. Of the allocated memory 13.11 GiB is allocated by PyTorch, and 941.08 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

$\rightarrow$ "out of memory" if the whole text is given as input

Repeat the test with a quantized model

In [17]:
# Required for quantization
!pip install bitsandbytes



In [19]:
!pip install -U bitsandbytes



In [22]:
# Check whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [23]:
# Print the installed PyTorch version string.
print(torch.__version__)

2.4.1+cu121


In [21]:
# Same checkpoint and task as the earlier Llama pipeline, this time requesting
# 4-bit loading through the model keyword arguments.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

quantized_model_kwargs = {
    "torch_dtype": torch.bfloat16,
    "quantization_config": {"load_in_4bit": True},
    "low_cpu_mem_usage": True,
}
quantized_llama_pipeline = pipeline(
    "text-generation",
    model=model_id,
    model_kwargs=quantized_model_kwargs,
    device_map="auto",
)

ImportError: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

### Tests with Llama3 and various values of `max_new_tokens`

In [8]:
# Instantiate the project's Llama 3 question-answering wrapper with its default arguments.
llama3_pipe = Llama3QuestionAnswering()

Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.44s/it]
Some parameters are on the meta device because they were offloaded to the cpu.


Device =  cuda:0


In [9]:
question_idx = 0 # choose a question
year = "2021"    # year of the report

# Read the fields by column name: a 3-value positional unpack breaks on the
# 4-column `pages_df` built at the top of the notebook (it has an extra
# `indicator` column), and binding `_` would shadow IPython's last output.
# NOTE(review): the index refers to the table as it was when this cell was
# last run -- confirm it still points at the intended question.
selected_row = pages_df.loc[question_idx]
question = selected_row["question"]
relevant_pages_list = selected_row["relevant_pages"]
full_question = f"{question} en {year} ?"

# Concatenate the relevant pages into a single context string.
bigcontext = " ".join(pages[page_num] for page_num in relevant_pages_list)

print("full question = ", full_question)

full question =   Quel est le nombre de points noirs réseau (P252.2) en 2021 ?


In [10]:
# Time one prediction per generation budget to see how `max_new_tokens`
# affects both the latency and the returned answer.
token_budgets = (10, 30, 50, 75, 100)
for max_new_tokens in token_budgets:
    start = time()
    answer = llama3_pipe.predict(bigcontext, full_question, max_new_tokens=max_new_tokens)
    elapsed = time() - start
    print("Computation time (s) = ", elapsed)
    print("max_new_tokens = ", max_new_tokens)
    print("Answer = ", answer)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Computation time (s) =  20.782516479492188
max_new_tokens =  10
Answer =  Selon le texte, le nombre de points no


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Computation time (s) =  39.20896649360657
max_new_tokens =  30
Answer =  Selon le texte, le nombre de points noirs du réseau de collecte (P252.2) pour l'exercice 2021


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Computation time (s) =  51.20070552825928
max_new_tokens =  50
Answer =  Selon le texte, le nombre de points noirs du réseau de collecte (P252.2) pour l'exercice 2021 est de 0 par 10 km de réseau.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Computation time (s) =  51.437586307525635
max_new_tokens =  75
Answer =  Selon le tableau, le nombre de points noirs du réseau de collecte (P252.2) pour l'exercice 2021 est de 0 par 10 km de réseau.
Computation time (s) =  47.57435965538025
max_new_tokens =  100
Answer =  Selon le tableau, le nombre de points noirs du réseau de collecte (P252.2) en 2021 est de 0 par 10 km de réseau.


Let's try to ask Llama3 to be less verbose

In [13]:
question_idx = 0 # choose a question
year = "2021"    # year of the report

# Read the fields by column name: a 3-value positional unpack breaks on the
# 4-column `pages_df` built at the top of the notebook (it has an extra
# `indicator` column), and binding `_` would shadow IPython's last output.
# NOTE(review): the index refers to the table as it was when this cell was
# last run -- confirm it still points at the intended question.
selected_row = pages_df.loc[question_idx]
question = selected_row["question"]
relevant_pages_list = selected_row["relevant_pages"]
full_question = f"{question} en {year} ?"
# Extra instruction asking the model to return only the value and its unit.
comment = " Extrais uniquement la réponse avec unité."
full_question = full_question + comment

# Concatenate the relevant pages into a single context string.
bigcontext = " ".join(pages[page_num] for page_num in relevant_pages_list)

print("full question = ", full_question)

full question =   Quel est le nombre de points noirs réseau (P252.2) en 2021 ? Extrais uniquement la réponse avec unité.


In [14]:
# Time one prediction per generation budget to see how `max_new_tokens`
# affects both the latency and the returned answer.
token_budgets = (10, 30, 50, 75, 100)
for max_new_tokens in token_budgets:
    start = time()
    answer = llama3_pipe.predict(bigcontext, full_question, max_new_tokens=max_new_tokens)
    elapsed = time() - start
    print("Computation time (s) = ", elapsed)
    print("max_new_tokens = ", max_new_tokens)
    print("Answer = ", answer)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Computation time (s) =  19.626792907714844
max_new_tokens =  10
Answer =  -pas de points noirs connus


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Computation time (s) =  21.237425804138184
max_new_tokens =  30
Answer =  -0 par 10 km de réseau.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Computation time (s) =  20.65757155418396
max_new_tokens =  50
Answer =  0 par 10 km de réseau.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Computation time (s) =  21.079474687576294
max_new_tokens =  75
Answer =  0 par 10 km de réseau.
Computation time (s) =  20.99551558494568
max_new_tokens =  100
Answer =  0 par 10 km de réseau.


Repeat for another question

In [15]:
# Display the question table to look up the row index of another question.
pages_df

Unnamed: 0,question,keyword_regex,relevant_pages
0,Quel est le nombre de points noirs réseau (P2...,noirs,"[1, 19]"
1,Quel est le taux d'impayés (P257.0),impayés,"[1, 22]"
2,Quel est le taux d'impayés pour l'assainissem...,impayés,"[1, 22]"
3,Quel est le taux de conformité des boues évac...,boues,"[1, 2, 10, 14, 15, 18, 19]"
4,Quel est le taux de conformité des performanc...,performance,"[1, 16, 17, 20, 23]"
5,Quel est le total de points de connaissance d...,connaissance,"[16, 21]"
6,Quelle est la valeur de l'indicateur D203.0,D203.0,"[1, 10]"
7,Quelle est la valeur de l'indicateur D204.0,D204.0,"[1, 13]"
8,Quelle est la valeur de l'indicateur P202.2,P202.2,[]
9,Quelle est la valeur de l'indicateur P253.2,P253.2,"[1, 19]"


In [16]:
question_idx = 28 # choose a question
year = "2021"    # year of the report

# Read the fields by column name: a 3-value positional unpack breaks on the
# 4-column `pages_df` built at the top of the notebook (it has an extra
# `indicator` column), and binding `_` would shadow IPython's last output.
# NOTE(review): the index refers to the table as it was when this cell was
# last run (the P201.1 indicator question) -- confirm it still matches.
selected_row = pages_df.loc[question_idx]
question = selected_row["question"]
relevant_pages_list = selected_row["relevant_pages"]
full_question = f"{question} en {year} ?"
# Extra instruction asking the model to return only the value and its unit.
comment = " Extrais uniquement la réponse avec unité."
full_question = full_question + comment

# Concatenate the relevant pages into a single context string.
bigcontext = " ".join(pages[page_num] for page_num in relevant_pages_list)

print("Explored pages are ", relevant_pages_list)
print("full question = ", full_question)

Explored pages are  [1, 16]
full question =  Quelle est la valeur de l'indicateur P201.1 en 2021 ? Extrais uniquement la réponse avec unité.


In [17]:
# Time one prediction per generation budget to see how `max_new_tokens`
# affects both the latency and the returned answer.
token_budgets = (10, 30, 50, 75, 100)
for max_new_tokens in token_budgets:
    start = time()
    answer = llama3_pipe.predict(bigcontext, full_question, max_new_tokens=max_new_tokens)
    elapsed = time() - start
    print("Computation time (s) = ", elapsed)
    print("max_new_tokens = ", max_new_tokens)
    print("Answer = ", answer)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Computation time (s) =  18.513556957244873
max_new_tokens =  10
Answer =  84,09 %


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Computation time (s) =  19.248411893844604
max_new_tokens =  30
Answer =  84,09 %


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Computation time (s) =  24.08448052406311
max_new_tokens =  50
Answer =  84,09 %


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Computation time (s) =  23.047784328460693
max_new_tokens =  75
Answer =  84,09 %
Computation time (s) =  19.748436450958252
max_new_tokens =  100
Answer =  84,09 %


$\rightarrow$ OK the answer (84.09% for P201.1) is correct (in roughly 20s)

Try to extract the same answer with another question for the same indicator and having a much larger context

In [18]:
question_idx = 12 # choose a question
year = "2021"    # year of the report

# Read the fields by column name: a 3-value positional unpack breaks on the
# 4-column `pages_df` built at the top of the notebook (it has an extra
# `indicator` column), and binding `_` would shadow IPython's last output.
# NOTE(review): the index refers to the table as it was when this cell was
# last run (a P201.1 question with ten relevant pages) -- confirm it still matches.
selected_row = pages_df.loc[question_idx]
question = selected_row["question"]
relevant_pages_list = selected_row["relevant_pages"]
full_question = f"{question} en {year} ?"
# Extra instruction asking the model to return only the value and its unit.
comment = " Extrais uniquement la réponse avec unité."
full_question = full_question + comment

# Concatenate the relevant pages into a single context string.
bigcontext = " ".join(pages[page_num] for page_num in relevant_pages_list)

print("Explored pages are ", relevant_pages_list)
print("full question = ", full_question)

Explored pages are  [1, 3, 4, 5, 7, 8, 9, 13, 16, 22]
full question =  Quel est le pourcentage d'abonnés desservis par le réseau(P201.1) en 2021 ? Extrais uniquement la réponse avec unité.


In [19]:
# Time one prediction per generation budget to see how `max_new_tokens`
# affects both the latency and the returned answer.
# NOTE(review): with this ten-page context the recorded run ended with a CUDA
# out-of-memory error on the first iteration.
token_budgets = (10, 30, 50, 75, 100)
for max_new_tokens in token_budgets:
    start = time()
    answer = llama3_pipe.predict(bigcontext, full_question, max_new_tokens=max_new_tokens)
    elapsed = time() - start
    print("Computation time (s) = ", elapsed)
    print("max_new_tokens = ", max_new_tokens)
    print("Answer = ", answer)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.97 GiB. GPU 0 has a total capacity of 14.58 GiB of which 1.26 GiB is free. Process 2294608 has 13.31 GiB memory in use. Of the allocated memory 12.46 GiB is allocated by PyTorch, and 749.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)