<a href="https://colab.research.google.com/github/finardi/tutos/blob/master/ENEM_2022_CabritaJ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the quantization, modelling, dataset and BM25 retrieval dependencies.
# NOTE(review): every package is installed without a version pin, and
# transformers/peft/accelerate come straight from their git repositories, so a
# later re-run may pick up different code — consider pinning versions.
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U datasets
!pip install -q -U rank_bm25

In [None]:
import collections
import json
import os
import random

import numpy as np
import pandas as pd
import rank_bm25
import torch

from datasets import load_dataset
from peft import PeftModel
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm
from transformers import GenerationConfig, AutoTokenizer, AutoModelForCausalLM

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

MANUAL_SEED = 2711
# Dedicated generator used for random few-shot sampling; it is independent of
# the global `random` state that `deterministic` reseeds.
rnd = random.Random()
rnd.seed(MANUAL_SEED)


def deterministic(rep=True, manual_seed=MANUAL_SEED):
    """Seed every random number generator used by the notebook.

    Args:
        rep: When True, seed Python's ``random``, NumPy and PyTorch (CPU and
            CUDA) with ``manual_seed`` and set the cuDNN flags below. When
            False, nothing is seeded and only a notice is printed.
        manual_seed: Seed value applied to all generators.
    """
    if rep:
        # Seed the global Python and NumPy generators as well as torch, so any
        # library code drawing from them is reproducible too.
        random.seed(manual_seed)
        np.random.seed(manual_seed)
        torch.manual_seed(manual_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(manual_seed)
            torch.cuda.manual_seed_all(manual_seed)
        torch.backends.cudnn.enabled = False
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
        print(f'Experimento deterministico, seed: {manual_seed}')
        # Ask torch directly instead of relying on a module-level `device`
        # variable, and keep the message on one line so no source indentation
        # ends up inside the printed text.
        if torch.cuda.is_available():
            print(f'Existe {torch.cuda.device_count()} GPU {torch.cuda.get_device_name(0)} disponível.')
    else:
        print('Experimento randomico')


deterministic()

In [None]:
# Load the GPT-J tokenizer and the CabritaJ checkpoint in 8-bit precision.
# The Hugging Face access token is read from the HF_TOKEN environment variable
# so that no credential is stored in (or committed with) the notebook.
hf_token = os.environ.get("HF_TOKEN")  # None when the variable is not set

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
model = AutoModelForCausalLM.from_pretrained(
    "22h/cabritaJ_step_78k",
    use_auth_token=hf_token,
    load_in_8bit=True,
    device_map="auto",
    low_cpu_mem_usage=True,
    # GPT-J has no pad token; reuse EOS so generation does not warn about it.
    pad_token_id=tokenizer.eos_token_id,
)
model.eval()
print("Model-Loaded!")

# ENEM

In [None]:
def ignore_question(doc):
    """Tell whether a question must be left out of the evaluation set.

    A question is ignored as soon as one of the flags listed in
    ``required_flags`` holds a value different from the required one.

    Args:
        doc: Question record; it must contain every key of ``required_flags``.

    Returns:
        True when the question should be skipped, False otherwise.
    """
    required_flags = {
        'IU': False,
        # 'MR': False,  # uncomment to filter out MR
        # 'CE': False,  # uncomment to filter out CE
        'ML': False,
    }
    return any(doc[flag] != wanted for flag, wanted in required_flags.items())

def _process_doc_cot(doc):
    def format_example(doc, choices):
        prompt = "Enunciado: " + doc["context"] + "\n"
        prompt += doc["question"] + "\nAlternativas:\n"
        for choice, option in zip(choices, doc["options"]):
            prompt += f"{choice.upper()}. {option}\n"
        
        prompt += "##Resposta: " + doc.get("explanation", "")
        return prompt.strip()
    choices = ['a', 'b', 'c', 'd', 'e']
    return {
        "query": format_example(doc, choices),
        "choices": doc["options"],
        "gold": choices.index(doc["label"]),
        "id": doc["id"],
        "exam": doc["exam"],
    }    

# Load the ENEM 2022 questions, drop the filtered ones and index the remaining
# processed questions by their position.
dataset = collections.defaultdict(list)

data_path = "/content/drive/MyDrive/LLMs/ENEM/ENEMdataset/2022.json"
# Decode explicitly as UTF-8: the questions are Portuguese text and the
# platform default encoding is not guaranteed to be UTF-8.
with open(data_path, encoding="utf-8") as f:
    documents = json.load(f)

documents = [doc for doc in documents if not ignore_question(doc)]
dataset['test'] = [_process_doc_cot(doc) for doc in documents]

# Keys are consecutive integers starting at 0, in file order.
enem_data = dict(enumerate(dataset['test']))

print(f"tamanho dataset: {len(enem_data)}")

def dynamic_similar_prompt(doc_id, data, topk=3):
    """Return the keys of the ``topk`` questions most similar to ``doc_id`` under BM25.

    The target question itself is always excluded from the result, so it can
    never be used as its own few-shot example.

    Args:
        doc_id: Id of the question being predicted. An exact match on ``id`` is
            preferred; if there is none, the first entry whose id starts with
            ``doc_id`` is used.
        data: Mapping from key to processed question (needs ``id`` and ``query``).
        topk: Maximum number of similar questions to return.

    Returns:
        List of keys of ``data``, ordered from most to least similar.

    Raises:
        IndexError: If no entry of ``data`` matches ``doc_id``.
    """
    keys = list(data.keys())

    # Exact match first so that e.g. "q1" is not confused with "q10".
    matches = [k for k in keys if data[k]['id'] == doc_id]
    if not matches:
        matches = [k for k in keys if data[k]['id'].startswith(doc_id)]
    if not matches:
        raise IndexError(f"No document with id {doc_id!r} found in data")
    key_predict = matches[0]

    # Whitespace tokenization of every prompt; positions follow `keys`.
    tokenized_corpus = [data[k]['query'].split(" ") for k in keys]
    bm25 = rank_bm25.BM25Plus(tokenized_corpus)

    tokenized_query = data[key_predict]['query'].split(" ")
    doc_scores = bm25.get_scores(tokenized_query)

    # Rank every document by descending score and drop the target explicitly,
    # instead of assuming it is always ranked first.
    self_position = keys.index(key_predict)
    ranked_positions = doc_scores.argsort()[::-1]
    similar_keys = [keys[pos] for pos in ranked_positions if pos != self_position]

    return similar_keys[:max(topk, 0)]


def dynamyc_fewshot_examples(num_fewshot, doc_id, sim_prompt=True, data=enem_data):
    """Select the few-shot examples to show before the question ``doc_id``.

    Args:
        num_fewshot: Number of examples to select.
        doc_id: Id of the question being predicted.
        sim_prompt: When True, pick the examples returned by
            ``dynamic_similar_prompt``; otherwise sample them at random with
            the module-level ``rnd`` generator.
        data: Mapping from key to processed question. Examples are taken from
            this mapping.

    Returns:
        List of processed questions taken from ``data``.

    Raises:
        ValueError: From ``rnd.sample`` when random sampling is requested and
            ``num_fewshot`` exceeds the number of candidate questions.
    """
    if sim_prompt:
        selected_keys = dynamic_similar_prompt(doc_id, data, topk=num_fewshot)
    else:
        # Never offer the question being predicted as its own example.
        candidate_keys = [k for k, v in data.items() if not v['id'].startswith(doc_id)]
        selected_keys = rnd.sample(candidate_keys, num_fewshot)

    # Look the examples up in `data` (not the module-level dataset) so that the
    # `data` argument is honoured consistently.
    return [data[k] for k in selected_keys]


def fewshot_context(doc, num_fewshot, similar_prompt=True, enem_data=enem_data):
    """Build the full prompt for ``doc``: solved examples followed by the question.

    Args:
        doc: Processed question to be answered (needs ``id`` and ``query``).
        num_fewshot: Number of solved examples to prepend. With 0 the prompt is
            just the question itself.
        similar_prompt: Forwarded to ``dynamyc_fewshot_examples`` as
            ``sim_prompt``.
        enem_data: Mapping of processed questions the examples are drawn from.

    Returns:
        The prompt string.
    """
    # Zero-shot: no instruction header and no examples. (The previous check
    # also read `fewshotex` before it was assigned, which raised
    # UnboundLocalError for num_fewshot == 0.)
    if num_fewshot == 0:
        labeled_examples = ""
    else:
        fewshotex = dynamyc_fewshot_examples(
            num_fewshot=num_fewshot, doc_id=doc['id'], sim_prompt=similar_prompt, data=enem_data
        )

        answer_letters = ['A.', 'B.', 'C.', 'D.', 'E.']
        labeled_examples = "Dado o exemplo, forneça a resposta após ##Resposta:\nNão faça nenhum comentário adicional" + "\n"
        for i, doc_ex in enumerate(fewshotex):
            labeled_examples += f'Questão {i+1}:\n'
            # Each example is its prompt followed by the gold answer letter.
            labeled_examples += doc_ex['query'] + " " + answer_letters[doc_ex['gold']]
            labeled_examples += '\n##\n'
        # The question to be answered is numbered right after the examples.
        labeled_examples += f'Questão {len(fewshotex) + 1}:\n'

    example = doc['query']

    return labeled_examples + example

# -------------------------------------------------------------------------------------- #
# Build the evaluation prompts: each entry is the few-shot examples followed by
# the question to be answered. Only 1-shot and 2-shot prompts are built here,
# with random ("*_shot") and BM25-similar ("*_shot_sim") examples.
one_shot, one_shot_sim, two_shot, two_shot_sim = [], [], [], []

# (target list, number of examples, use BM25-similar examples?)
# The order matters: the random variants draw from `rnd`, so they must be
# generated in this sequence for every question to reproduce the same samples.
prompt_configs = (
    (one_shot, 1, False),
    (two_shot, 2, False),
    (one_shot_sim, 1, True),
    (two_shot_sim, 2, True),
)

for doc in enem_data.values():
    for bucket, shots, use_similar in prompt_configs:
        bucket.append(
            fewshot_context(doc=doc, num_fewshot=shots, similar_prompt=use_similar, enem_data=enem_data)
        )

# Preview the first 1-shot prompt built from BM25-similar examples.
one_shot_sim[0]

tamanho dataset: 118


'Dado o exemplo, forneça a resposta após ##Resposta:\nNão faça nenhum comentário adicional\nQuestão 1:\nEnunciado: Criado há cerca de 20 anos na Califórnia, o mountainboard é um esporte de aventura que utiliza uma espécie de skate off-road para realizar manobras similares às das modalidades de snowboard, surf e do próprio skate. A atividade chegou ao Brasil em 1997 e hoje possui centenas de praticantes, um circuito nacional respeitável e mais de uma dezena de pistas espalhadas pelo país. Segundo consta na história oficial, o mountainboard foi criado por praticantes de snowboard que sentiam falta de praticar o esporte nos períodos sem neve. Para isso, eles desenvolveram um equipamento bem simples: uma prancha semelhante ao modelo utilizado na neve (menor e um pouco menos flexível), com dois eixos bem resistentes, alças para encaixar os pés e quatro pneus com câmaras de ar para regular a velocidade que pode ser alcançada em diferentes condições. Com essa configuração, o esporte se mostro

In [None]:
# Reseed before generating so the sampled outputs are reproducible.
deterministic()

map_label = {0: "A", 1: "B", 2: "C", 3: "D", 4: "E"}
trues, preds = [], []
ENEM_TO_ITERATE = one_shot_sim

# Loop-invariant, so it is built once instead of on every iteration.
generation_config = GenerationConfig(
    temperature=0.1,
    max_new_tokens=8,
    do_sample=True,
)

for ix, prompt in enumerate(ENEM_TO_ITERATE):
    inputs = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=2048).to(device)
    outputs = model.generate(
        inputs,
        generation_config=generation_config,
    )

    # `preds` keeps the full decoded text (prompt + continuation); the answer
    # is whatever follows the last "##Resposta:" marker.
    preds.append(tokenizer.decode(outputs[0]))
    trues.append(map_label[enem_data[ix]['gold']])

    # [:1] yields "" instead of raising IndexError when nothing follows the
    # marker, so one empty generation cannot abort the whole run.
    predicted_letter = preds[ix].split("##Resposta:")[-1].strip()[:1]
    print(f'{ix}\n\tPRED: {predicted_letter} \n\tTRUE: {trues[ix]}')

Experimento deterministico, seed: 2711
Existe 1 GPU            NVIDIA A100-SXM4-40GB disponível.
0
	PRED: B 
	TRUE: C
1
	PRED: E 
	TRUE: C
2
	PRED: E 
	TRUE: C
3
	PRED: D 
	TRUE: D
4
	PRED: E 
	TRUE: C
5
	PRED: C 
	TRUE: C
6
	PRED: E 
	TRUE: E
7
	PRED: D 
	TRUE: B
8
	PRED: B 
	TRUE: A
9
	PRED: A 
	TRUE: C
10
	PRED: E 
	TRUE: B
11
	PRED: A 
	TRUE: A
12
	PRED: D 
	TRUE: E
13
	PRED: E 
	TRUE: E
14
	PRED: E 
	TRUE: C
15
	PRED: A 
	TRUE: A
16
	PRED: E 
	TRUE: B
17
	PRED: E 
	TRUE: A
18
	PRED: E 
	TRUE: E
19
	PRED: D 
	TRUE: E
20
	PRED: B 
	TRUE: B
21
	PRED: E 
	TRUE: E
22
	PRED: A 
	TRUE: E
23
	PRED: E 
	TRUE: D
24
	PRED: E 
	TRUE: C
25
	PRED: E 
	TRUE: A
26
	PRED: E 
	TRUE: D
27
	PRED: B 
	TRUE: B
28
	PRED: A 
	TRUE: D
29
	PRED: D 
	TRUE: B
30
	PRED: C 
	TRUE: D
31
	PRED: E 
	TRUE: B
32
	PRED: E 
	TRUE: A
33
	PRED: E 
	TRUE: C
34
	PRED: A 
	TRUE: B
35
	PRED: A 
	TRUE: E
36
	PRED: C 
	TRUE: C
37
	PRED: A 
	TRUE: A
38
	PRED: C 
	TRUE: B
39
	PRED: E 
	TRUE: C
40
	PRED: C 
	TRUE: D
41
	PRED: B

In [None]:
# Keep only the first character that follows the last "##Resposta:" marker in
# each decoded output. Slicing with [:1] gives "" rather than raising
# IndexError when the model produced nothing after the marker.
clean_preds = [e.split("##Resposta:")[-1].strip()[:1] for e in preds]
clean_preds

In [None]:
# Compare gold and predicted letters side by side and report the accuracy.
# (The original referenced the undefined name `clean_`; the predictions live in
# `clean_preds`.)
dataframe = pd.DataFrame({'true': trues, 'pred': clean_preds})
accuracy = (dataframe['true'] == dataframe['pred']).mean()
print(f"{accuracy:.3}")
dataframe

0.271


Unnamed: 0,true,pred
0,C,B
1,C,E
2,C,E
3,D,D
4,C,E
...,...,...
113,B,A
114,C,C
115,A,C
116,B,D
