In [None]:
from tqdm import tqdm
import pandas as pd, numpy as np
import os, time, pickle

# Every relative path used later in this notebook ('models/...', 'personas/...')
# is resolved against this working directory.
# NOTE(review): the absolute path is environment-specific - adjust it when the
# notebook is run outside this container/home layout.
os.chdir('/home/jovyan/work/')

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [None]:
from llama_cpp import Llama
import outlines
import gc

In [None]:
def get_prediction(prompt, generator, verbose=False):
    """Run ``generator`` on a single prompt and return its answer.

    Parameters
    ----------
    prompt
        The prompt handed to ``generator`` unchanged.
    generator
        Callable invoked as ``generator(prompt)``.
    verbose
        When truthy, print the prompt followed by the answer.

    Returns
    -------
    Whatever ``generator(prompt)`` returns.
    """
    answer = generator(prompt)
    # Plain truthiness test: any truthy flag (True, 1, "yes", ...) enables the
    # debug output instead of only values that compare equal to True.
    if verbose:
        print(prompt)
        print(answer)
    return answer

def load_llm(model_, choices):
    """Load a model and return a generator restricted to ``choices``.

    Parameters
    ----------
    model_
        Indexable model descriptor; element ``1`` is the model file name
        that ``load_model`` looks up under ``models/``.
    choices
        The allowed answers passed to ``outlines.generate.choice``.

    Returns
    -------
    The object returned by ``outlines.generate.choice``; it is called with a
    prompt to obtain an answer.
    """
    # Reuse load_model so the Llama construction arguments live in one place
    # instead of being duplicated here.
    llm = load_model(model_)
    model = outlines.models.LlamaCpp(llm)
    return outlines.generate.choice(model, choices)

def load_model(model_):
    """Instantiate a ``Llama`` model from a file under ``models/``.

    Parameters
    ----------
    model_
        Indexable model descriptor; element ``1`` is the file name appended
        to ``'models/'``.

    Returns
    -------
    The constructed ``Llama`` instance.
    """
    weights_path = 'models/' + model_[1]
    llama_kwargs = dict(
        model_path=weights_path,
        # -1: per the llama-cpp-python docs this offloads all layers to the GPU.
        n_gpu_layers=-1,
        logits_all=False,
        verbose=False,
    )
    # A `temp = 0` argument was left disabled in the original call; no
    # sampling temperature is set here.
    return Llama(**llama_kwargs)

def annotate_data(prompts, generator):
    """Return the answer of ``get_prediction`` for every prompt, in input order.

    Parameters
    ----------
    prompts
        Iterable of prompts.
    generator
        Generator forwarded to ``get_prediction`` for each prompt.

    Returns
    -------
    list
        One answer per prompt.
    """
    return [get_prediction(single_prompt, generator) for single_prompt in prompts]

In [None]:
# Model descriptor: (display name, model file name looked up under 'models/').
mistral_name, mistral_dir = 'Mistral_7B', 'ggml-mistral-model-7B-f16.gguf'
mistral_ = (mistral_name, mistral_dir)

# Closed answer sets the generators are allowed to produce.
choices_binary = ['TRUE', 'FALSE']
choices_scale = [str(score) for score in range(1, 6)]

In [None]:
# Load the Mistral model once and build the generator whose answers are
# restricted to choices_scale ('1' .. '5'); it is reused for every prompt below.
generator_scale = load_llm(mistral_, choices_scale)

In [None]:
def prompt_mistral_persona_binary(persona, post):
    """Build the TRUE/FALSE toxicity prompt answered from a persona's perspective.

    ``persona`` and ``post`` are interpolated verbatim into the <TASK> and
    <POST> sections; the prompt ends with an open <RESPONSE> section.
    """
    instruction = (
        "<TASK> Your task is to annotate the post shown below for toxicity, "
        f"taking on the perspective of the following persona: {persona}. "
        "Respond with 'TRUE' if this post seems toxic/hateful/disrespectful to you "
        "and 'FALSE' if this post does not seem toxic/hateful/disrespectful to you. "
        "Respond with 'TRUE' or 'FALSE' only."
    )
    return f"{instruction}\n<POST> {post}\n<RESPONSE> "

def prompt_mistral_nopersona_binary(post):
    """Build the TRUE/FALSE toxicity prompt without any persona.

    ``post`` is interpolated verbatim into the <POST> section; the prompt ends
    with an open <RESPONSE> section.
    """
    instruction = (
        "<TASK> Your task is to annotate the post shown below for toxicity. "
        "Respond with 'TRUE' if this post seems toxic/hateful/disrespectful to you "
        "and 'FALSE' if this post does not seem toxic/hateful/disrespectful to you."
    )
    return f"{instruction}\n<POST> {post}\n<RESPONSE> "


def prompt_mistral_persona_scale(persona, post):
    """Build the 1-to-5 scale prompt answered from a persona's perspective.

    ``persona`` and ``post`` are interpolated verbatim into the <TASK> and
    <POST> sections; the prompt ends with an open <RESPONSE> section.
    """
    instruction = (
        "<TASK> Your task is to annotate the post shown below, "
        f"taking on the perspective of the following persona: {persona}. "
        "Answer the following question on a scale from 1 to 5, "
        "where 1 stands for 'Not at all' and 5 stands for 'Very much so'. "
        "How toxic/hateful/disrespectful or offensive does this post seem to you?"
    )
    return f"{instruction}\n<POST> {post}\n<RESPONSE> "

def prompt_mistral_nopersona_scale(post):
    """Build the 1-to-5 scale prompt without any persona.

    ``post`` is interpolated verbatim into the <POST> section; the prompt ends
    with an open <RESPONSE> section.
    """
    instruction = (
        "<TASK> Your task is to annotate the post shown below. "
        "Answer the following question on a scale from 1 to 5, "
        "where 1 stands for 'Not at all' and 5 stands for 'Very much so'. "
        "How toxic/hateful/disrespectful or offensive does this post seem to you?"
    )
    return f"{instruction}\n<POST> {post}\n<RESPONSE> "

In [None]:
def run_llm(pd_, pd_restart, sample_personas, n, prompt_func, generator, out_dir, out_file):
    """Annotate every text with the LLM, add the answers as columns, and pickle the result.

    Two modes, selected by ``n``:

    * ``n is None`` (persona run): for each row of ``sample_personas`` the
      prompts are built with ``prompt_func(persona, text)`` and the answers are
      stored in a column named ``persona_<personaId>``.
    * otherwise (random run): the prompts are built once with
      ``prompt_func(text)`` and annotated ``len(sample_personas)`` times into
      columns ``run_<k>``.
      NOTE(review): ``n`` only selects the mode; the number of runs comes from
      ``len(sample_personas)`` - confirm this is intended.

    Parameters
    ----------
    pd_
        DataFrame with a ``'text'`` column; copied, never modified.
    pd_restart
        If it is a DataFrame, a copy of it is extended instead of ``pd_``
        (anything else, e.g. ``None``, is ignored).
    sample_personas
        In a persona run, a DataFrame with ``'persona'`` and ``'personaId'``
        columns; in a random run only its length is used.
    n
        ``None`` for a persona run, anything else for a random run.
    prompt_func
        Prompt builder matching the selected mode (see above).
    generator
        Generator passed on to ``annotate_data``.
    out_dir, out_file
        The result is written to ``personas/<out_dir>/<out_file>.pkl``.

    Returns
    -------
    pandas.DataFrame
        The copy with the new annotation columns.
    """
    if isinstance(pd_restart, pd.DataFrame):
        pd_results = pd_restart.copy()
    else:
        pd_results = pd_.copy()

    # The texts do not change while columns are added, so read them once.
    texts = list(pd_results['text'])

    if n is None:  # persona run
        for i in range(len(sample_personas)):
            persona = sample_personas['persona'].values[i]
            persona_ix = sample_personas['personaId'].values[i]
            list_prompts = [prompt_func(persona, text) for text in texts]
            pd_results[f'persona_{persona_ix}'] = annotate_data(list_prompts, generator)
    else:  # random run
        all_prompts = [prompt_func(text) for text in texts]
        # Continue numbering after the last existing column (its name is parsed
        # as '<prefix>_<number>') when the frame has more than 3 columns.
        adder = int(pd_results.columns[-1].split('_')[1]) if len(pd_results.columns) > 3 else 0
        for i in range(len(sample_personas)):
            pd_results[f'run_{i+1+adder}'] = annotate_data(all_prompts, generator)

    # makedirs creates missing parents (out_dir may be nested, e.g. 'a/runs')
    # and exist_ok avoids the check-then-create race of exists() + mkdir().
    target_dir = os.path.join('personas', out_dir)
    os.makedirs(target_dir, exist_ok=True)
    pd_results.to_pickle(os.path.join(target_dir, out_file + '.pkl'))

    return pd_results

In [None]:
# Input frames, read relative to the working directory set at the top.
# NOTE: read_pickle can execute arbitrary code while loading; only use it on
# files from a trusted source (these are assumed to be locally produced).
# Posts to annotate; run_llm reads the 'text' column of this frame.
pd_data = pd.read_pickle('personas/data_ext/sscale_tweets.pkl')
# Personas; run_llm reads the 'persona' and 'personaId' columns of this frame.
pd_personas = pd.read_pickle('personas/data_ext/pd_personas_cleaned.pkl')

In [None]:
start_time = time.time()

for i in range(40):
    sample_personas = pd_personas.iloc[i*5000:(i+1)*5000,:]
    %time scale_personal_you = run_llm(pd_data, None, sample_personas, None, prompt_mistral_persona_scale, generator_scale, 'mistraldata_llm_3_4/runs', f'sscale_{(i+1)*5000}')
    with open('personas/mistraldata_llm_3_4/monitor.txt', 'a') as f:
        f.write(f'done: {5000*(i+1)}, to-do: {200000-5000*(i+1)}, time elapsed: {np.round((time.time()-start_time)/(60*60),4)}h, eta: {(np.round((time.time()-start_time)/(60*60),4)/(5000*(i+1)))*(200000-5000*(i+1))}h\n')