In [None]:
import os
import pandas as pd
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time
import json

In [None]:
# Root of the mounted storage; every AIDA project directory lives beneath it.
nas_dir = "/home/jaejoong/cocoanlab02"

# Derive the results / config / prompts directories from the same project
# sub-path so the three locations cannot drift apart.
result_dir, config_dir, prompt_dir = (
    os.path.join(nas_dir, f"projects/AIDA/{leaf}")
    for leaf in ("results", "config", "prompts")
)

In [None]:
# Load the dialogue table from the results directory; the inference loop
# later reads its "Dialogue" column.
all_df_path = os.path.join(result_dir, "AIDA_all_df.csv")
all_df = pd.read_csv(all_df_path)

In [None]:
# Authenticate with the Hugging Face Hub using the token stored in the
# config directory (kept out of the notebook so it is never committed).
with open(os.path.join(config_dir, "HuggingFace_token.txt"), 'r') as file:
    # Text files usually end with a newline; strip it so the token is passed
    # to the Hub exactly as issued.
    HF_token = file.read().strip()

if not HF_token:
    # Fail here with a clear message instead of a less obvious error from login().
    raise ValueError("HuggingFace_token.txt is empty; expected a Hugging Face access token.")

login(token=HF_token)

model_id = "meta-llama/Llama-3.1-70B-Instruct"

# Load the tokenizer and the causal LM for `model_id`. Weights are loaded in
# bfloat16 and device placement is delegated to the library via
# device_map="auto".
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
# Load the questionnaire definitions. The file is JSON mapping each
# questionnaire name to a dict with a "Prompt" template and a "Question" list.
with open(os.path.join(prompt_dir, "AIDA_Prompts.txt"), 'r') as file:
    prompt = json.load(file)

# Print a readable preview of every questionnaire. The sections are built
# step by step rather than in one nested f-string: reusing the same quote
# character or a backslash inside an f-string replacement field is a
# SyntaxError before Python 3.12.
sections = []
for qtype, qs in prompt.items():
    question_lines = "\n".join("- " + q for q in qs["Question"])
    sections.append(
        f"## {qtype}\n\n**Prompt**\n{qs['Prompt']}\n\n**Question**\n{question_lines}"
    )
print("\n\n-----\n\n".join(sections))

In [None]:
# Ask the model every question of every questionnaire for every dialogue.
# Result: `all_resp` is a list with one dict per dialogue, mapping
# questionnaire name -> list of decoded single-token answers (strings), in
# question order.
all_resp = []
n_dialogues = len(all_df)

# The stop tokens are identical for every call, so resolve them once.
stop_token_ids = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]


def _show_progress(dialogue_no, qtype, question_no, n_questions, elapsed_sec):
    """Overwrite the current output line with the progress of the run."""
    elapsed = time.strftime("%H:%M:%S", time.gmtime(elapsed_sec))
    print(
        f"\rDialogue {dialogue_no} / {n_dialogues}, {qtype} {question_no} / {n_questions}, "
        f"elapsed time: {elapsed}       ",
        end="",
        flush=True,
    )


time_start = time.time()
first_qtype, first_qs = next(iter(prompt.items()))
_show_progress(0, first_qtype, 0, len(first_qs["Question"]), 0)

# Count dialogues by position rather than by index label, so the loop does not
# depend on `all_df` having a default 0..n-1 index.
for dialogue_no, (_, row) in enumerate(all_df.iterrows(), start=1):

    dialogue_resp = {}
    all_resp.append(dialogue_resp)

    for qtype, qs in prompt.items():

        dialogue_resp[qtype] = []
        # Use a separate name for the filled-in template: rebinding `prompt`
        # here would replace the questionnaire dict with a string and break
        # `prompt.items()` on the next dialogue (and on any re-run of the cell).
        dialogue_prompt = qs["Prompt"].replace("[INSERT_INTERVIEW]", row["Dialogue"])
        n_questions = len(qs["Question"])

        for qidx, q in enumerate(qs["Question"]):

            msg_list = [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": dialogue_prompt.replace("[INSERT_QUESTION]", q)},
            ]

            input_ids = tokenizer.apply_chat_template(
                msg_list,
                add_generation_prompt=True,
                return_tensors="pt"
            ).to(model.device)

            # Re-seed before every call so each answer is reproducible
            # independently of how many generations ran before it.
            torch.manual_seed(42)
            with torch.no_grad():
                outputs = model.generate(
                    input_ids,
                    max_new_tokens=1,  # answers are read from a single generated token
                    eos_token_id=stop_token_ids,
                    pad_token_id=tokenizer.eos_token_id,
                    # NOTE(review): sampling with temperature=1e-4 and top_p=0
                    # looks intended to be effectively greedy; confirm before
                    # changing.
                    do_sample=True,
                    temperature=1e-4,
                    top_p=0,
                )

            # Keep only the newly generated token(s), dropping the echoed input.
            response = outputs[0][input_ids.shape[-1]:]
            decoded_response = tokenizer.decode(response, skip_special_tokens=True)
            dialogue_resp[qtype].append(decoded_response)

            _show_progress(dialogue_no, qtype, qidx + 1, n_questions, time.time() - time_start)

In [None]:
# One row per dialogue: concatenate the answers of every questionnaire (in the
# dict's iteration order) and convert each to float.
all_resp_df = pd.DataFrame(
    [
        [float(answer) for answers in per_dialogue.values() for answer in answers]
        for per_dialogue in all_resp
    ]
)

# Number of questions per questionnaire, taken from the first dialogue.
numq = {qtype: len(answers) for qtype, answers in all_resp[0].items()}

# Column labels: Q1.. covers PHQ8 + Ling + CDS, NQ1.. covers Neu, DQ1.. covers
# DepSev, each numbered from 1.
# NOTE(review): this assumes the questionnaire dict is ordered
# PHQ8, Ling, CDS, Neu, DepSev — confirm against the prompt file.
label_groups = [
    ("Q", numq["PHQ8"] + numq["Ling"] + numq["CDS"]),
    ("NQ", numq["Neu"]),
    ("DQ", numq["DepSev"]),
]
all_resp_df.columns = [
    f"{prefix}{number}"
    for prefix, count in label_groups
    for number in range(1, count + 1)
]
display(all_resp_df)

In [None]:
# Write the response table to the results directory as CSV, without the index column.
all_resp_path = os.path.join(result_dir, "AIDA_all_resp_df.csv")
all_resp_df.to_csv(all_resp_path, index=False)