In [1]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextStreamer
from huggingface_hub import login
import torch
import json
import pandas as pd

def teique_prompt(x):
    """Build the questionnaire prompt for a single statement.

    Args:
        x: The statement text to be rated; it is interpolated verbatim under
            the ``### Statement:`` header.

    Returns:
        The full prompt string: instructions, the seven answer options, the
        statement, and a trailing ``### Response:`` header followed by a
        newline, after which the model is expected to write its answer.
    """
    # NOTE: option 4 previously read "Disagee"; the spelling is corrected here.
    # Results generated with the old prompt used the misspelled label.
    return f"""### Instructions:
Please respond to each statement below by selecting the option that best reflects your degree of agreement or disagreement with that statement. Do not think too long about the exact meaning of the statements. Work quickly and try to answer as accurately as possible. There are no right or wrong answers. There are seven possible responses to each statement, ranging from ‘Completely Disagree’ (number 1) to ‘Completely Agree’ (number 7).
### Options:
1. Completely Disagree
2. Strongly Disagree 
3. Weakly Disagree 
4. Neither Agree nor Disagree 
5. Weakly Agree 
6. Strongly Agree 
7. Completely Agree

### Statement:
{x}

Select one response from the given options.
### Response:
"""

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# SECURITY: the Hugging Face token must not live in the notebook. It is read
# from the HF_TOKEN environment variable; when that is unset, `login` receives
# None and falls back to prompting for the token interactively.
# The token that used to be hardcoded here should be revoked and regenerated.
access_token = os.environ.get('HF_TOKEN')
login(token=access_token)

# Restrict this process to GPU index 1. This has to be set before CUDA is
# first used for it to take effect.
os.environ["CUDA_VISIBLE_DEVICES"]= "1"

# Alternative checkpoints that can be swapped in:
# 'gemma2_27b': 'google/gemma-2-27b-it',
# model_id = 'meta-llama/Meta-Llama-3.1-70B-Instruct'
# model_id =  'mistralai/Mixtral-8x7B-Instruct-v0.1'
model_id = 'google/gemma-2-27b-it'

# 4-bit bitsandbytes quantization; the commented options are left as
# documented alternatives.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # bnb_4bit_use_double_quant=True,
    # bnb_4bit_quant_type="nf4",
    # bnb_4bit_compute_dtype=torch.bfloat16,
    # llm_int8_enable_fp32_cpu_offload=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Loading call used for the non-Gemma checkpoints:
# model = AutoModelForCausalLM.from_pretrained(model_id, 
#                                              offload_folder='/home/gauneg/emotion_experiments/offload',
#                                              quantization_config=bnb_config, 
#                                              device_map='auto')

gemma_model =  AutoModelForCausalLM.from_pretrained(model_id, 
                                                ## for gemma only------------------
                                                load_in_8bit=False,
                                                torch_dtype=torch.bfloat16,
                                                attn_implementation="eager",
                                                ## -----------------------------------
                                                offload_folder='/home/gauneg/emotion_experiments/offload',
                                                quantization_config=bnb_config,
                                                device_map='auto')

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/gauneg/.cache/huggingface/token
Login successful


Loading checkpoint shards: 100%|██████████| 12/12 [00:32<00:00,  2.73s/it]


In [3]:
# Load the questionnaire items; the generation loop reads the 'TQN' and
# 'Question' columns from this frame.
tei_df = pd.read_csv(filepath_or_buffer='../teique-sf.csv')

In [4]:
# Query the model once per questionnaire item and collect one record per item.
accumulated_res = []
for idx, que in tei_df[['TQN', 'Question']].values:
    prompt_inst_que = teique_prompt(que)
    # Put the inputs on the model's own device instead of hardcoding 'cuda'.
    toks = tokenizer(prompt_inst_que, return_tensors="pt").to(gemma_model.device)
    prompt_len = toks['input_ids'].shape[1]
    # Greedy decoding (do_sample=False) so repeated runs give the same answers.
    model_gen = gemma_model.generate(toks['input_ids'],
                                     attention_mask=toks['attention_mask'],
                                     max_new_tokens=128,
                                     pad_token_id=tokenizer.eos_token_id,
                                     do_sample=False)
    # Full text (prompt + continuation) is kept for inspection.
    pred_str = tokenizer.decode(model_gen[0], skip_special_tokens=True)
    # Decode only the newly generated tokens. Splitting the full text on the
    # "### Response:" marker picks the wrong segment whenever the model
    # repeats that marker in its own output.
    answer_str = tokenizer.decode(model_gen[0][prompt_len:], skip_special_tokens=True)
    # The answer is the first line of the continuation; leading whitespace is
    # dropped so a blank first line does not produce an empty answer.
    final_pred = answer_str.lstrip().split('\n')[0]
    accumulated_res.append({'id': idx, 
                            'prompt': prompt_inst_que, 
                            'question': que,
                            'complete_gen': pred_str,
                            'proc_gen': final_pred})

In [5]:
# Number of collected records (the loop above appends one per questionnaire row).
len(accumulated_res)

30

In [6]:
# Persist the raw generations as <model name>.json, where the model name is
# the last path component of `model_id`.
res_folder = './tei_answers/json_res'
# Create the output directory on first run so the dump does not fail on a
# fresh checkout.
os.makedirs(res_folder, exist_ok=True)
model_name = model_id.split('/')[-1]
# Plain write mode is enough; the file is never read back through this handle.
with open(os.path.join(res_folder, f'{model_name}.json'), 'w') as f:
    json.dump(accumulated_res, f)

In [2]:
open_llms_preds = "./tei_answers/json_res/"

# Mapping built below:
#   model_name -> [[id, question, proc_res, num], ...]
# where `num` is the text before the first '.' of the processed answer
# (e.g. "7" for "7. Completely Agree").
res_collect = {}
# Sorted for a deterministic processing order; os.listdir order is arbitrary.
for file in sorted(os.listdir(open_llms_preds)):
    k_name, ext = os.path.splitext(file)
    # Skip anything that is not a JSON result file (sub-directories, stray
    # files), which would otherwise fail to open or parse.
    if ext != '.json':
        continue
    res_collect[k_name] = []

    with open(os.path.join(open_llms_preds, file), 'r') as f:
        results = json.load(f)

    for res in results:
        # Surrounding whitespace is stripped so " 7" and "7" are treated alike.
        marked_num = res['proc_gen'].split('.')[0].strip()
        res_collect[k_name].append([res['id'], 
                                    res['question'].strip(), 
                                    res['proc_gen'], 
                                    marked_num])

In [3]:
# Write one CSV per model from the collected rows.
res_path = './tei_answers/csv_res/'
# Create the output directory on first run so to_csv does not fail on a
# fresh checkout.
os.makedirs(res_path, exist_ok=True)
for k, v in res_collect.items():
    fpath = os.path.join(res_path, f'{k}.csv')
    df_temp = pd.DataFrame(v, columns=[
        'indx', 'ques', 'proc_res', 'num_res'
    ])
    df_temp.to_csv(fpath, index=False)

In [23]:
# Load the stored GPT responses (presumably produced by a separate run --
# the generating code is not part of this notebook).
with open('./res_teique_gpt.json') as gpt_file:
    gpt_res = json.loads(gpt_file.read())

In [29]:
# Each GPT entry unpacks into four fields; the third one is not needed here.
# The last column is the text before the first '.' of the response.
rows_gpt = [
    [item_id, statement, answer, answer.split('.')[0]]
    for item_id, statement, _prompt, answer in gpt_res
]
    

In [32]:
# Same column layout as the per-model CSVs written for the open models.
gpt_columns = ['indx', 'ques', 'proc_res', 'num_res']
csv_result = pd.DataFrame(data=rows_gpt, columns=gpt_columns)
csv_result.to_csv('./tei_answers/csv_res/gpt_4o_mini.csv', index=False)

In [2]:
# Read back the per-model answer tables (GPT, Llama, Mixtral, in that order).
dir_path = './tei_answers/csv_res/'
df_gpt, df_meta_llama, df_mixtral = (
    pd.read_csv(os.path.join(dir_path, csv_name))
    for csv_name in (
        'gpt_4o_mini.csv',
        'Meta-Llama-3.1-70B-Instruct.csv',
        'Mixtral-8x7B-Instruct-v0.1.csv',
    )
)

In [16]:
# Item id -> question text lookup, taken from the Mixtral table.
ids_ques = df_mixtral.loc[:, ['indx', 'ques']]

In [10]:
# Keep only the item id and the numeric answer from each model's table.
answer_cols = ['indx', 'num_res']
res_gpt_fin, res_mix_fin, res_llama_fin = (
    frame[answer_cols] for frame in (df_gpt, df_mixtral, df_meta_llama)
)


In [13]:
# Join GPT and Mixtral answers on the item id; the clashing 'num_res' columns
# get a per-model suffix.
join_res_1 = pd.merge(
    res_gpt_fin,
    res_mix_fin,
    on='indx',
    suffixes=('_gpt_4o_mini', '_mixtral'),
)

In [21]:
# Add the Llama answers. The left frame's answer columns are already suffixed,
# so 'num_res' does not clash and the suffixes are not applied; the Llama
# column arrives as plain 'num_res' and is renamed in the last step.
join_res_1_1 = pd.merge(join_res_1, res_llama_fin, on='indx', suffixes=('', '_llama'))
# Attach the question text for each item id.
join_res_2_1 = pd.merge(join_res_1_1, ids_ques, on='indx')
fin_df = join_res_2_1.rename(columns={'num_res': 'num_res_llama'})

In [24]:
# Write the side-by-side comparison of all models' answers, in a fixed column order.
export_cols = ['indx', 'ques', 'num_res_gpt_4o_mini', 'num_res_mixtral', 'num_res_llama']
all_preds = fin_df[export_cols]
all_preds.to_csv('./tei_answers/csv_res/all_tei_preds.csv', index=False)