# Initialisation
---

In [None]:
!pip install -q vllm
!pip install -q outlines

In [None]:
# Set the VLLM_WORKER_MULTIPROC_METHOD environment variable to "spawn" for this kernel.
# NOTE(review): presumably this makes vLLM start its own worker processes with the
# "spawn" method instead of "fork" — confirm against the vLLM documentation.
%env VLLM_WORKER_MULTIPROC_METHOD=spawn

In [4]:
import os
import numpy as np
import utils.s3helpers as s3

from tqdm.auto import tqdm
from contextlib import contextmanager

import torch
from datasets import load_dataset
import multiprocessing
from typing import List, Tuple
from time import time
from datetime import datetime
from vllm import LLM, SamplingParams
from outlines import models, generate

In [None]:
from huggingface_hub import login

# Do not paste the access token into the notebook: it would be saved (and possibly
# committed) together with the file. Read it from the HF_TOKEN environment variable
# instead; when the variable is unset or empty, `token=None` lets `login` ask for the
# token interactively.
hf_token = os.environ.get("HF_TOKEN") or None
login(token=hf_token)
print("\n")

---
# Answer generation

In [6]:
# Load the prompt table that was built earlier and stored on S3:
# 12.400.000 prompts in total (200k personas x 62 political statements).
df = s3.read_s3_parquet("Persona/data/interim/pct_persona_prompts_righta.pqt")

In [None]:
# Quick sanity check: the number of rows first, then the available columns.
print(len(df), df.columns, sep="\n")

In [None]:
# Show the prompt text stored in the row at position 61 (i.e. the 62nd row).
print(df['prompt'].iloc[61])

In [None]:
# Keep every prompt in a plain Python list and report how many there are.
prompts = df['prompt'].to_list()
print(len(prompts))

In [None]:
# Choose the model to query and the S3 prefix the resulting dataframes are saved under.
# Model names listed in the original note:
#   mistralai/Mistral-7B-Instruct-v0.3, meta-llama/Llama-3.1-8B-Instruct,
#   HuggingFaceH4/zephyr-7b-beta, Qwen/Qwen2.5-7B-Instruct

model = "HuggingFaceH4/zephyr-7b-beta"  # full model name, later handed to vLLM
model_id = "zephyr-7b-beta"  # short name used inside the output path
output_prefix = f"Persona/output/persona_distribution/{model_id}/right_authoritarian_personas"

print(f"Using: {model}")

In [None]:
# Number of CUDA devices PyTorch reports; the prompts are later split into this many parts.
NUM_GPUS = torch.cuda.device_count()
print("Number of GPUs:", NUM_GPUS)

In [12]:
# Code to split the batch of prompts, instantiate one LLM per GPU and run the inference in parallel
def run_inference_one_gpu(gpu_id, prompt_list, model_name):
    """Answer every prompt in ``prompt_list`` with a model restricted to one GPU.

    Meant to run inside its own worker process: it exposes only GPU ``gpu_id`` to
    the process through ``CUDA_VISIBLE_DEVICES``, loads ``model_name`` with vLLM
    and uses outlines' ``generate.choice`` so that each answer is one of the four
    stance labels.

    Args:
        gpu_id: Index of the GPU written to ``CUDA_VISIBLE_DEVICES``.
        prompt_list: Iterable of prompt strings to answer.
        model_name: Model name or path handed to ``vllm.LLM``.

    Returns:
        A list of generated labels ("Disagree", "Agree", "Strongly disagree" or
        "Strongly agree"), expected to hold one entry per prompt. An empty
        ``prompt_list`` yields an empty list without loading the model.
    """
    # A plain list is enough for the generator; converting to a NumPy array would
    # build a fixed-width unicode array that pads every prompt to the longest one.
    prompt_batch = list(prompt_list)
    if not prompt_batch:
        # Nothing to answer (e.g. more GPUs than prompts): skip the costly model load.
        return []

    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    llm = LLM(
        model=model_name,
        tokenizer_mode="auto",  # auto - mistral
        trust_remote_code=True,
        enable_chunked_prefill=True,
        # enable_prefix_caching=True,
    )

    stance_model = models.VLLM(llm)
    generator_choice = generate.choice(
        stance_model, ["Disagree", "Agree", "Strongly disagree", "Strongly agree"]
    )
    output = generator_choice(prompt_batch)

    # NOTE(review): outlines appears to unwrap a one-prompt batch into a bare string;
    # normalise it so that callers can always extend a list with the result.
    if isinstance(output, str):
        output = [output]

    return output


def split_list(l, n):
    """Cut ``l`` into ``n`` consecutive slices of (almost) equal length.

    Slice ``i`` covers ``l[i * len(l) // n : (i + 1) * len(l) // n]``, so the
    slices joined together give back ``l``; some of them are empty when ``n`` is
    larger than ``len(l)``. The result is an empty list when ``n`` is 0.
    """
    size = len(l)
    chunks = []
    for part in range(n):
        lower = part * size // n
        upper = (part + 1) * size // n
        chunks.append(l[lower:upper])
    return chunks

def run_inference_multi_gpu(model_name, prompts):
    """Answer ``prompts`` in parallel, one worker process per GPU.

    The prompts are cut into ``NUM_GPUS`` consecutive parts with ``split_list``;
    every non-empty part is handed to ``run_inference_one_gpu`` in its own pool
    process and the per-part answers are concatenated in part order.

    Args:
        model_name: Model name or path forwarded to every worker.
        prompts: Sequence of prompt strings.

    Returns:
        A list with one answer per prompt; an empty list for empty ``prompts``.

    Raises:
        ValueError: If no GPU is available (``NUM_GPUS`` is smaller than 1).
        RuntimeError: If the workers return a different number of answers than
            there are prompts.
    """
    if len(prompts) == 0:
        return []
    if NUM_GPUS < 1:
        # Without this check the pool below fails with a far less helpful message.
        raise ValueError(f"At least one GPU is required, but NUM_GPUS is {NUM_GPUS}")

    split_prompts = split_list(prompts, NUM_GPUS)
    # Parts are empty when there are fewer prompts than GPUs; starting a worker (and
    # loading a model) for them would be wasted work, and skipping them keeps the order.
    inputs = [
        (gpu_id, part, model_name)
        for gpu_id, part in enumerate(split_prompts)
        if len(part) > 0
    ]

    with multiprocessing.Pool(processes=len(inputs)) as pool:
        results = pool.starmap(run_inference_one_gpu, inputs)

    outputs = []
    for result in results:
        outputs.extend(result)

    # Fail here rather than later with misaligned answers in the dataframe.
    if len(outputs) != len(prompts):
        raise RuntimeError(
            f"Received {len(outputs)} answers for {len(prompts)} prompts"
        )

    return outputs

In [None]:
# code that start the inference process (I decided to split every 10000 personas so that I can check the output every now and then without waiting for the whole process to finish to discover errors, to try out stuff you can also set it lower (10))
%%capture

if __name__ == '__main__':
    model_name = model
    batch_size = 10000 # peronas per batch
    number_personas = len(prompts)//62
    number_of_batches = number_personas // batch_size

    for i in tqdm(range(13, number_of_batches)):
        start_idx = i * (batch_size * 62)
        end_idx = start_idx + (batch_size * 62)
        sub_df = df[start_idx:end_idx].copy()
        sub_prompts = sub_df['prompt'].tolist()

        raw_responses = run_inference_multi_gpu(model_name, sub_prompts)
        sub_df['response'] = raw_responses
        print(f"Saving batch number {i}")
        s3.write_s3_parquet(sub_df, f"{output_prefix}/sub_dfs_stanced/df_b{i}_p{batch_size}.pqt")
        print(f"Data saved to: {output_prefix}/sub_dfs_stanced/df_b{i}_p{batch_size}.pqt")

Single persona prompt

In [None]:
if __name__ == '__main__':
    # Answer all prompts in a single call, attach the answers to the dataframe as the
    # 'response' column and store the result on S3.
    model_name = model
    save_key = f"{output_prefix}/compass_autr.pqt"

    raw_responses = run_inference_multi_gpu(model_name, prompts)
    df['response'] = raw_responses

    s3.write_s3_parquet(df, save_key)
    print(f"Data saved to: {save_key}")