In [None]:
from tqdm import tqdm
import pandas as pd, numpy as np
import os, time, pickle

# NOTE(review): hard-coded absolute working directory; every relative path used
# later in this notebook ('models/...', 'personas/...') is resolved against it.
os.chdir('/home/jovyan/work/')

In [None]:
import warnings
# Silence FutureWarning globally for this kernel session.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Silence pandas PerformanceWarning as well -- presumably raised by the
# column-by-column inserts in the annotation loop (TODO confirm).
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [None]:
# Output directory (relative to the working directory) for the per-group
# '<group>_annotations.pkl' files written by the annotation loop.
outdatadir = os.path.join('personas','mistraldata_llm_4')

In [None]:
from llama_cpp import Llama
import outlines
import gc

In [None]:
def get_prediction(prompt, generator, verbose = False):
    """Run ``generator`` on a single prompt and return whatever it produces.

    Args:
        prompt: Prompt passed unchanged to ``generator``.
        generator: Callable taking the prompt and returning the model's answer.
        verbose: When equal to ``True``, the prompt and the answer are printed.

    Returns:
        The value returned by ``generator(prompt)``.
    """
    prediction = generator(prompt)
    # Keep the explicit comparison: only values equal to True enable the echo.
    if not (verbose == True):
        return prediction
    print(prompt)
    print(prediction)
    return prediction

def load_llm(model_, choices):
    """Load a llama.cpp model and wrap it in a choice-constrained generator.

    The llama.cpp model is built by ``load_model`` so that the loading options
    (model path, GPU offload, logits, verbosity) are defined in one place
    instead of being duplicated here.

    Args:
        model_: Sequence whose element at index 1 is the model file name inside
            the ``models/`` directory (index 0 is not used for loading).
        choices: Answers passed to ``outlines.generate.choice``.

    Returns:
        The generator returned by ``outlines.generate.choice``.
    """
    llm = load_model(model_)
    model = outlines.models.LlamaCpp(llm)
    return outlines.generate.choice(model, choices)

def load_model(model_):
    """Instantiate the llama.cpp model described by ``model_``.

    Args:
        model_: Sequence whose element at index 1 is the model file name inside
            the ``models/`` directory.

    Returns:
        The constructed ``Llama`` instance.
    """
    weights_path = 'models/' + model_[1]
    llama_options = {
        'model_path': weights_path,
        'n_gpu_layers': -1,
        'logits_all': False,
        'verbose': False,
        # A `temp = 0` option was left disabled here and stays disabled.
    }
    return Llama(**llama_options)

def annotate_data(prompts, generator):
    """Annotate every prompt with ``generator`` and collect the answers.

    Args:
        prompts: Iterable of prompts, processed in order.
        generator: Callable forwarded to ``get_prediction`` for each prompt.

    Returns:
        List with one answer per prompt, in the same order as ``prompts``.
    """
    return [get_prediction(single_prompt, generator) for single_prompt in prompts]

In [None]:
# Model descriptor: (display name, model file name inside models/).
mistral_name = 'Mistral_7B'
mistral_dir = 'ggml-mistral-model-7B-f16.gguf'
mistral_ = mistral_name, mistral_dir

# Answer sets for choice-constrained generation.
choices_binary = 'TRUE FALSE'.split()
choices_scale = [str(level) for level in range(1, 6)]

In [None]:
# Load the Mistral model and build the generator restricted to the 1-5 scale answers.
generator_scale = load_llm(mistral_, choices_scale)

In [None]:
def prompt_mistral_persona_binary(persona, post):
    """Build the TRUE/FALSE toxicity prompt answered from a persona's perspective.

    Args:
        persona: Persona description inserted into the task instructions.
        post: Text of the post to annotate.

    Returns:
        Prompt string with <TASK>, <POST> and <RESPONSE> sections.
    """
    task = (
        "<TASK> Your task is to annotate the post shown below for toxicity, "
        f"taking on the perspective of the following persona: {persona}. "
        "Respond with 'TRUE' if this post seems toxic/hateful/disrespectful to you "
        "and 'FALSE' if this post does not seem toxic/hateful/disrespectful to you. "
        "Respond with 'TRUE' or 'FALSE' only."
    )
    return f"{task}\n<POST> {post}\n<RESPONSE> "

def prompt_mistral_nopersona_binary(post):
    """Build the TRUE/FALSE toxicity prompt without any persona.

    Args:
        post: Text of the post to annotate.

    Returns:
        Prompt string with <TASK>, <POST> and <RESPONSE> sections.
    """
    task = (
        "<TASK> Your task is to annotate the post shown below for toxicity. "
        "Respond with 'TRUE' if this post seems toxic/hateful/disrespectful to you "
        "and 'FALSE' if this post does not seem toxic/hateful/disrespectful to you."
    )
    return f"{task}\n<POST> {post}\n<RESPONSE> "


def prompt_mistral_persona_scale(persona, post):
    """Build the 1-5 scale toxicity prompt answered from a persona's perspective.

    Args:
        persona: Persona description inserted into the task instructions.
        post: Text of the post to annotate.

    Returns:
        Prompt string with <TASK>, <POST> and <RESPONSE> sections.
    """
    task = (
        "<TASK> Your task is to annotate the post shown below, "
        f"taking on the perspective of the following persona: {persona}. "
        "Answer the following question on a scale from 1 to 5, "
        "where 1 stands for 'Not at all' and 5 stands for 'Very much so'. "
        "How toxic/hateful/disrespectful or offensive does this post seem to you?"
    )
    return f"{task}\n<POST> {post}\n<RESPONSE> "

def prompt_mistral_nopersona_scale(post):
    """Build the 1-5 scale toxicity prompt without any persona.

    Args:
        post: Text of the post to annotate.

    Returns:
        Prompt string with <TASK>, <POST> and <RESPONSE> sections.
    """
    task = (
        "<TASK> Your task is to annotate the post shown below. "
        "Answer the following question on a scale from 1 to 5, "
        "where 1 stands for 'Not at all' and 5 stands for 'Very much so'. "
        "How toxic/hateful/disrespectful or offensive does this post seem to you?"
    )
    return f"{task}\n<POST> {post}\n<RESPONSE> "

In [None]:
# Study data; the annotation loop reads its 'text' column and adds one
# 'persona_<id>' column per annotated persona.
pd_data = pd.read_pickle('personas/data_ext/data_study_4.pkl')

# Mapping of group name -> persona table; the annotation loop reads the
# 'personaId' and 'persona' entries of each value.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
with open('personas/data_ext/dict_annotators.pkl', 'rb') as f:
    dict_annotators = pickle.load(f)

In [None]:
# Sanity check: print each annotator group's key together with the length of its value.
for group_name, group in dict_annotators.items():
    print(group_name, len(group))

In [None]:
# Reword the 'neutral_black' personas: "an african-american" -> "a black" first
# (so the article is fixed too), then any remaining "african-american" -> "black".
dict_annotators['neutral_black']['persona'] = [
    persona_text.replace('an african-american', 'a black').replace('african-american', 'black')
    for persona_text in dict_annotators['neutral_black']['persona']
]

In [None]:
# Run configuration: personas are processed in n_batches slices of batch_size each.
n_batches = 5
batch_size = 50
start_time = time.time()

prompt_func = prompt_mistral_persona_scale

# Create the output directory before any inference runs; otherwise the first
# to_pickle() would fail only after a whole batch of LLM calls had been spent.
os.makedirs(outdatadir, exist_ok=True)
# The progress log lives next to the annotation files (same location as the
# previously hard-coded 'personas/mistraldata_llm_4/monitor.txt').
monitor_path = os.path.join(outdatadir, 'monitor.txt')
n_groups = len(dict_annotators)

for i in range(n_batches): # 5 batches of 50 personas each
    k_count = 0
    batch_slice = slice(i * batch_size, (i + 1) * batch_size)
    for k,v in dict_annotators.items(): # one pass per annotator group
        annotations_path = os.path.join(outdatadir, f'{k}_annotations.pkl')
        # Pair persona ids with persona texts for the current slice.
        personas = zip(list(v['personaId'])[batch_slice], list(v['persona'])[batch_slice])
        if i == 0:
            # First batch: start from a fresh copy of the study data. Any file
            # already present for this group is overwritten at the save below.
            pd_ = pd_data.copy()
        else:
            # Later batches: continue from what the previous batch saved for this group.
            pd_ = pd.read_pickle(annotations_path)
        texts = list(pd_['text'])
        for persona in personas:
            persona_id, persona_text = persona
            list_prompts = [prompt_func(persona_text, text) for text in texts]
            # One new column per persona, holding the generator's answer for every text.
            pd_[f'persona_{persona_id}'] = annotate_data(list_prompts, generator_scale)
        # Checkpoint after every (batch, group) pair.
        pd_.to_pickle(annotations_path)
        k_count += 1
        # Progress bookkeeping; counts batch_size personas per slice even if the
        # slice actually held fewer.
        n_done = i * batch_size * n_groups + k_count * batch_size
        n_todo = n_batches * batch_size * n_groups - n_done
        time_done = time.time() - start_time
        time_todo = (time_done / n_done) * n_todo
        with open(monitor_path, 'a') as f: # after each batch of 50 personas
            f.write(f'done: {n_done}, to-do: {n_todo}, time elapsed: {np.round(time_done/(60*60),4)}h, eta: {np.round(time_todo/(60*60),4)}h\n')