<a href="https://colab.research.google.com/github/finardi/tutos/blob/master/eval_ENEM_2022_with_HF_and_chatGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q datasets transformers rank_bm25 openai accelerate bitsandbytes

In [2]:
# ---------------------------------------------------------- #
# This code is from https://github.com/piresramon/gpt-4-enem #
# ---------------------------------------------------------- #

import json
import torch
import random 
import collections

import pandas as pd
pd.set_option('display.max_rows', 200)
pd.set_option('max_colwidth', 400)

from tqdm.auto import tqdm

import openai

import rank_bm25

from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics import accuracy_score
from datasets import Dataset

if torch.cuda.is_available(): 
    device = 'cuda'  
else: 
    device ='cpu'

MANUAL_SEED = 2711
rnd = random.Random()
rnd.seed(MANUAL_SEED)
def deterministic(rep=True, manual_seed=MANUAL_SEED):
    if rep:
        torch.manual_seed(manual_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(manual_seed)
            torch.cuda.manual_seed_all(manual_seed)
        torch.backends.cudnn.enabled = False 
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
        print(f'Experimento deterministico, seed: {manual_seed}')
        if device == 'cuda':
            print(f'Existe {torch.cuda.device_count()} GPU\
            {torch.cuda.get_device_name(0)} disponível.')
    else:
        print('Experimento randomico')
deterministic()    

Experimento deterministico, seed: 2711
Existe 1 GPU            Tesla T4 disponível.


# Get Enem json and create enem_data 

In [3]:
def ignore_question(doc):
    filters = {
        'IU': False,
        # 'MR': False,  # uncomment to filter out MR
        # 'CE': False,  # uncomment to filter out CE
        'ML': False,
    }
    for k,v in filters.items():
        if doc[k] != v:
            return True
    return False

def _process_doc_cot(doc):
    def format_example(doc, choices):
        """
            Passagem: <passage>
            Pergunta: <question>
            Choices:
            A. <choice1>
            B. <choice2>
            C. <choice3>
            D. <choice4>
            Answer:
        """
        prompt = "Cabeçalho: " + doc["context"] + "\n"
        prompt += "Enunciado: " + doc["question"] + "\nAlternativas:\n"
        for choice, option in zip(choices, doc["options"]):
            prompt += f"{choice.upper()}. {option}\n"
        
        prompt += "Explicação: " + doc.get("explanation", "")
        return prompt.strip()
    choices = ['a', 'b', 'c', 'd', 'e']
    return {
        "query": format_example(doc, choices),
        "choices": doc["options"],
        "gold": choices.index(doc["label"]),
        "id": doc["id"],
        "exam": doc["exam"],
    }    

dataset = collections.defaultdict(list)
        
data_path = "/content/drive/MyDrive/LLMs/ENEM/ENEMdataset/2022.json"
with open(data_path) as f:
    documents = json.load(f)

documents = list(filter(lambda doc: not ignore_question(doc), documents))
dataset['test'] = list(map(_process_doc_cot, documents))

enem_data = {ix:doc  for ix, doc in enumerate(dataset['test'])}

print(f"tamanho dataset: {len(enem_data)}")

enem_data[0]    

tamanho dataset: 118


{'query': 'Cabeçalho: A conquista da medalha de prata por Rayssa Leal, no skate street nos Jogos Olímpicos, é exemplo da representatividade feminina no esporte, avalia a âncora do jornal da rede de televisão da CNN. A apresentadora, que também anda de skate, celebrou a vitória da brasileira, que entrou para a história como a atleta mais nova a subir num pódio defendendo o Brasil. “Essa representatividade do esporte nos Jogos faz pensarmos que não temos que ficar nos encaixando em nenhum lugar. Posso gostar de passar notícia e, mesmo assim, gostar de skate, subir montanha, mergulhar, andar de bike, fazer yoga. Temos que parar de ficar enquadrando as pessoas dentro de regras. A gente vive num padrão no qual a menina ganha boneca, mas por que também não fazer um esporte de aventura? Por que o homem pode se machucar, cair de joelhos, e a menina tem que estar sempre lindinha dentro de um padrão? Acabamos limitando os talentos das pessoas”, afirmou a jornalista, sobre a prática do skate por 

# Create Prompts
- #### Dynamic Prompt (fewshot sampled)
- #### Dynamic Similar Prompt (bm25 rank fewshot)

In [4]:
def dynamic_similar_prompt(doc_id, data, topk=3):
    key_predict = [k for k, v in enem_data.items() if v['id'].startswith(doc_id)][0]
    query_list = [v['query'] for k,v in data.items()]
    tokenized_corpus = [doc.split(" ") for doc in query_list]
    bm25 = rank_bm25.BM25Plus(tokenized_corpus)

    query = data[key_predict]['query']
    tokenized_query = query.split(" ")
    
    doc_scores = bm25.get_scores(tokenized_query)
    
    # [1:] remove query with itself ---> topk+1
    top_similar_idxs = list(doc_scores.argsort()[-(topk+1):][::-1])[1:]
    if key_predict in top_similar_idxs: # k in topk_sim ---> must be false
        print('Error BM25 prompt similar retrieval')
    
    return top_similar_idxs


def dynamyc_fewshot_examples(num_fewshot, doc_id, sim_prompt=True, data=enem_data):
    if sim_prompt:
        topk_sim = dynamic_similar_prompt(doc_id, data, topk=num_fewshot)
        fewshot_ex = [enem_data[k] for k in topk_sim]

    else:
        # filter the sample in current predict from data
        all_possible_fewshot_keys = [k for k, v in data.items() if not v['id'].startswith(doc_id)]
        fewshot_keys = rnd.sample(all_possible_fewshot_keys, num_fewshot)
        fewshot_ex = [enem_data[k] for k in fewshot_keys]

    return fewshot_ex


def fewshot_context(doc, num_fewshot, similar_prompt=True, enem_data=enem_data):
    if num_fewshot == 0 and fewshotex is None:
        labeled_examples = ""
    else:
        fewshotex = dynamyc_fewshot_examples(
            num_fewshot=num_fewshot, doc_id=doc['id'], sim_prompt=similar_prompt, data=enem_data
        )
        
        labeled_examples = ''
        for i, doc_ex in enumerate(fewshotex):
            labeled_examples += f'Questão {i+1}:\n'
            labeled_examples += doc_ex['query'] + " " + ['A.', 'B.', 'C.', 'D.', 'E.'][doc_ex['gold']].upper()
            labeled_examples += '\n##\n'
        labeled_examples += f'Questão {len(fewshotex) + 1}:\n'

    example = doc['query']
    
    return labeled_examples + example

# -------------------------------------------------------------------------------------- #
# number of fewshots, must be > 0

# create inputs ---> list with fewshot samples + question to be answer 
two_shot, three_shot = [], []
for k,doc in enem_data.items():
    two_shot.append(fewshot_context(doc=doc,   num_fewshot=2, similar_prompt=True, enem_data=enem_data))
    three_shot.append(fewshot_context(doc=doc, num_fewshot=3, similar_prompt=True, enem_data=enem_data))

# Eval ENEM-2022 with ChatGPT 

In [6]:
EVAL_CHATGPT = False

if EVAL_CHATGPT:
    # OPENAI_API_KEY = "secret"
    # openai.api_key = OPENAI_API_KEY

    deterministic()    

    trues, preds = [], []
    loop = tqdm(two_shot, leave=True)

    for ix, batch in enumerate(loop):

        # to make sure the same example to predict has the correct label 
        # find the point where the question to be evaluate starts
        point = two_shot[ix].find('Questão 3:\n')
        
        # "11+point" ---> is the start of the query-text, so this is a naive approach
        if enem_data[ix]['query'] == two_shot[ix][11+point:]:
            chatGPT_response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": two_shot[ix]}])
            
            preds.append(chatGPT_response['choices'][0]['message']['content'])
            trues.append(enem_data[ix]['gold'])
        else:
            # if the query-text is different from inputs to be evaluated 
            # we print for further investigation
            print(point, ix)

Experimento deterministico, seed: 2711
Existe 1 GPU            Tesla T4 disponível.


  0%|          | 0/118 [00:00<?, ?it/s]

In [7]:
def get_results(trues, preds, chatgpt=False):
    dataframe = pd.DataFrame({'true': trues, 'pred':preds})

    #get only latter
    if chatgpt:
        dataframe['pred'] = dataframe.pred.apply(lambda x: x[:1])
    
    else:
        dataframe['pred'] = dataframe.pred.apply(lambda x: x[-1:])
    
    map_label = {0:"A", 1:"B", 2:"C", 3:"D", 4:"E"}
    dataframe['true'] = dataframe.true.apply(lambda x: map_label[x])
    
    acc = accuracy_score(dataframe['true'], dataframe['pred'])
    
    return dataframe, acc


if EVAL_CHATGPT:
    chatGPT_results, acc = get_results(trues, preds, chatgpt=True)

    print(f'ACC of 2-shot chatGPT on ENEM-2022: {acc:.3}')
    # ACC of ChatGPT: 0.771

    chatGPT_results

ACC of 2-shot chatGPT on ENEM-2022: 0.703


# Open Source Models

- #### with HuggingFace we will evaluate GPT2/LLama & Alpaca/Bloom models.

In [8]:
model_gpt = 'gpt2'
tokenizer_gpt = AutoTokenizer.from_pretrained(model_gpt)

model_bloom = 'bigscience/bloomz-7b1-mt'
tokenizer_bloom = AutoTokenizer.from_pretrained(model_bloom)

def get_input_lenghts(dataset, tokenizer):
    num_words = [len(x.split()) for x in dataset]
    num_words = torch.tensor(num_words, dtype=torch.float)
    tokens_lengths = [len(tokenizer.encode(seq)) for seq in dataset]
    tokens_lengths = torch.tensor(tokens_lengths, dtype=torch.float)
    
    return int(torch.ceil(num_words.mean()).item()), tokens_lengths

# -------------------------------------------------------------------- #
num_words, gpt2_lengths = get_input_lenghts(three_shot, tokenizer_gpt)
min_gpt2 = torch.ceil(gpt2_lengths.min()).item()
max_gpt2 = torch.ceil(gpt2_lengths.max()).item()
mean_gpt2 = torch.ceil(gpt2_lengths.mean()).item()
# -------------------------------------------------------------------- #
num_words, bloom_lengths = get_input_lenghts(three_shot, tokenizer_bloom)
min_bloom = torch.ceil(bloom_lengths.min()).item()
max_bloom = torch.ceil(bloom_lengths.max()).item()
mean_bloom = torch.ceil(bloom_lengths.mean()).item()
# -------------------------------------------------------------------- #
print('\n\n')
print('--'*33)
print(f' Prompt 3-shot com média de {num_words} palavras em  ENEM2022 dataset')
print('--'*33)
print(f'GPT2 tem média de {int(mean_gpt2)} tokens em 3-shot')
print(f'\nBloom tem média de {int(mean_bloom)} tokens em 3-shot')
print(f'\n---> Bloom gasta {(int(mean_bloom)/int(mean_gpt2))*100:.3}% de tokens comparado com o GPT2 <---')

Token indices sequence length is longer than the specified maximum sequence length for this model (2863 > 1024). Running this sequence through the model will result in indexing errors





------------------------------------------------------------------
 Prompt 3-shot com média de 862 palavras em  ENEM2022 dataset
------------------------------------------------------------------
GPT2 tem média de 2061 tokens em 3-shot

Bloom tem média de 1245 tokens em 3-shot

---> Bloom gasta 60.4% de tokens comparado com o GPT2 <---


# Eval BloomZ 7b mt

In [9]:
# load bloom in 8bits with bitsandbytes
BLOOM_model = AutoModelForCausalLM.from_pretrained(model_bloom, device_map="auto", load_in_8bit=True)




Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [10]:
EVAL_OPEN_SOURCE = True

if EVAL_OPEN_SOURCE:
    deterministic()    

    trues, preds = [], []
    loop = tqdm(two_shot, leave=True)

    for ix, batch in enumerate(loop):

        # to make sure the same example to predict has the correct label 
        # find the point where the question to be evaluate starts
        point = two_shot[ix].find('Questão 3:\n')
        
        # "11+point" ---> is the start of the query-text, so this is a naive approach
        if enem_data[ix]['query'] == two_shot[ix][11+point:]:
            inputs = tokenizer_bloom.encode(two_shot[ix], return_tensors="pt", max_length=2048).to(device)
            outputs = BLOOM_model.generate(inputs)
            
            preds.append(tokenizer_bloom.decode(outputs[0]))
            trues.append(enem_data[ix]['gold'])
        else:
            # if the query-text is different from inputs to be evaluated 
            # we print for further investigation
            print(point, ix)

Experimento deterministico, seed: 2711
Existe 1 GPU            Tesla T4 disponível.


  0%|          | 0/118 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Input length of input_ids is 1149, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 1141, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 1023, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 1143, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of

In [11]:
dataframe = pd.DataFrame({'true': trues, 'pred':preds})
dataframe.pred.apply(lambda x: x[-1:])

0      D
1      C
2      B
3      D
4      D
5      C
6      A
7      D
8      A
9      D
10     D
11     D
12     D
13     D
14     C
15     A
16     D
17     D
18     A
19     C
20     D
21     D
22     D
23     D
24     D
25     A
26     C
27     D
28     A
29     B
30     D
31     D
32     D
33     A
34     D
35     D
36     C
37     A
38     D
39     C
40     D
41     D
42     D
43     D
44     E
45     D
46     A
47     D
48     D
49     D
50     A
51     D
52     D
53     D
54     C
55     D
56     C
57     A
58     D
59     C
60     D
61     D
62     B
63     B
64     D
65     D
66     A
67     D
68     E
69     A
70     D
71     D
72     C
73     B
74     D
75     D
76     A
77     D
78     B
79     D
80     D
81     D
82     D
83     D
84     D
85     D
86     D
87     D
88     D
89     D
90     D
91     D
92     A
93     D
94     D
95     D
96     D
97     D
98     D
99     D
100    D
101    D
102    D
103    D
104    D
105    D
106    D
107    D
108    D
109    D
110    D
1

In [12]:
bloom_results, acc = get_results(trues, preds)

print(f'ACC of 2-shot Bloomz-7b1-mt on ENEM-2022: {acc:.3}')
# ACC of Bloomz-7b1-mt: 

bloom_results

ACC of 2-shot Bloomz-7b1-mt on ENEM-2022: 0.339


Unnamed: 0,true,pred
0,C,D
1,C,C
2,C,B
3,D,D
4,C,D
5,C,C
6,E,A
7,B,D
8,A,A
9,C,D
