## Settings

In [6]:
import pandas as pd
import csv
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import os
import torch
import re

In [7]:
# Expose only GPU index 1 to this process. This has to be set before CUDA is
# first initialised in the kernel to have any effect.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

## Model

In [8]:
# Hugging Face checkpoint to evaluate; swap in the commented line for the XL variant.
model_name = "google/flan-t5-large"
# model_name = "google/flan-t5-xl"

# Per-model folder name: the part of the checkpoint id after the "org/" prefix.
directory = model_name.split("/")[1]

# Both objects are loaded from the same checkpoint id.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


## Data Station

In [18]:
# Dictionary mapping dataset names to file paths and additional parameters

dataset_map = {
    # SX: sexism data, full train set
    "SX": dict(
        filepath="../../../data/sexism_dataset_3_classes_without_duplicates_train.csv",
        sep="\t",
        usecols=["label", "toxicity", "sentence"],
    ),
    # SXM: sexism data, most toxic examples
    # (original note said 50; the file name says 53 sentences - TODO confirm)
    "SXM": dict(
        filepath="../../../data/output_most_toxic_2024_06_11_53_sentences.csv",
        sep=",",
        usecols=["label", "toxicity", "sentence"],
    ),
    # SXL: sexism data, least toxic examples
    # (original note said 50; the file name says 53 sentences - TODO confirm)
    "SXL": dict(
        filepath="../../../data/output_least_toxic_2024_06_11_53_sentences.csv",
        sep=",",
        usecols=["label", "toxicity", "sentence"],
    ),
    # CN: CONAN (Islamophobic) data; usecols=None keeps every column in the file
    "CN": dict(
        filepath="../../../data/conan_train.csv",
        sep="\t",
        usecols=None,
    ),
}

def load_dataset(dataset_name, head=None):
    """Read one of the datasets registered in ``dataset_map`` into a DataFrame.

    Args:
        dataset_name: Key of ``dataset_map`` (e.g. "SX", "SXM", "SXL", "CN").
        head: Optional number of leading rows to keep. ``None`` (default)
            returns the whole file.

    Returns:
        pandas.DataFrame. For every dataset except "CN" the columns "label"
        and "sentence" are renamed to "gold" and "text"; the "CN" file is
        returned with its original column names.

    Raises:
        KeyError: If ``dataset_name`` is not registered in ``dataset_map``.
    """
    if dataset_name not in dataset_map:
        raise KeyError(
            f"Unknown dataset {dataset_name!r}; expected one of {sorted(dataset_map)}"
        )
    config = dataset_map[dataset_name]
    data = pd.read_csv(config["filepath"], sep=config["sep"], usecols=config["usecols"])
    if dataset_name != "CN":
        # Harmonise column names so downstream code can always read data["text"].
        # NOTE(review): the CN file is presumably already using "text" - confirm.
        data = data.rename(columns={"label": "gold", "sentence": "text"})
    if head is not None:
        # Previously `head` was accepted but ignored; honour it when provided.
        data = data.head(head)
    return data

# Dataset code (a key of dataset_map) used by the cells below.
dataset_name = "SXL"
data = load_dataset(dataset_name=dataset_name)
# data

## Prompts Generation

In [23]:
def read_prompt_template(prompt_template_filepath) -> dict:
    """Parse a prompt-template file into a ``{name: template}`` dict.

    Expected layout: a line ``name:`` (word characters only) followed by the
    template text on the next line(s). An entry ends at the first blank line
    or at the end of the file, so a template cannot itself contain a blank
    line.

    Args:
        prompt_template_filepath: Path of the text file to read (UTF-8).

    Returns:
        Dict mapping each template name to its text, stripped of surrounding
        whitespace.
    """
    with open(prompt_template_filepath, 'r', encoding="utf-8") as file:
        content = file.read()
    # `\Z` lets the final entry match even when the file does not end with a
    # blank line; previously that entry was silently dropped.
    pattern = re.compile(r'(\w+):\n(.*?)(?:\n\n|\Z)', re.DOTALL)
    return {key: value.strip() for key, value in pattern.findall(content)}

# Template name -> template text, read from the project-level prompt file.
prompt_templates = read_prompt_template(
    prompt_template_filepath="../../../main_prompts.txt"
)

def charge_example(prompt_template,texts): # charge sentences to prompt
  """Build one prompt per text by filling the template's ``{sent}`` field.

  Args:
    prompt_template: ``str.format`` template containing a ``{sent}`` placeholder.
    texts: Iterable of values to substitute, one prompt each.

  Returns:
    List of filled prompts, in the same order as ``texts``.
  """
  filled_prompts = []
  for sentence in texts:
    filled_prompts.append(prompt_template.format(sent=sentence))
  return filled_prompts

def turn_into_batch(prompts, batch_size):
  """Yield consecutive slices of ``prompts`` holding at most ``batch_size`` items.

  A progress line is printed just before each batch is handed to the caller,
  so the message refers to the batch that is about to be processed.

  Args:
    prompts: Sliceable sequence (supports ``len`` and ``[start:stop]``).
    batch_size: Positive number of items per batch; the last batch may be shorter.

  Yields:
    ``prompts[i:i + batch_size]`` for i = 0, batch_size, 2 * batch_size, ...

  Raises:
    ValueError: If ``batch_size`` is smaller than 1. Because this is a
      generator, the error surfaces when iteration starts.
  """
  if batch_size < 1:
    # A zero step used to fail inside range() with an unrelated message, and a
    # negative one silently produced no batches at all.
    raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
  for batch_idx, start in enumerate(range(0, len(prompts), batch_size), start=1):
    print(f"Processing {batch_idx}th batch")
    yield prompts[start:start + batch_size]



In [None]:
# prompt_templates

## Inference

In [24]:
# Keyword arguments forwarded to model.generate() by inference().
generation_args = {
    # Upper bound on the length of each generated sequence.
    "max_length":10000,
    # "max_new_tokens": 256,
    # Greedy decoding, stated explicitly. The sampling knobs that used to be
    # set here (temperature=0.01, top_p=0.9, top_k=1) only apply when
    # do_sample=True, so they had no effect on the output and merely caused
    # warnings; they are removed to avoid suggesting otherwise.
    "do_sample": False,
    # "repetition_penalty": 1.05,
    # "eos_token_id": [tokenizer.eos_token_id, 3200],
    # "output_logits": True,
    # "return_dict_in_generate": True,
}

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
    
def inference(batch_of_prompts) -> list :
  """Generate one decoded output string per prompt in the batch.

  Uses the module-level ``tokenizer``, ``model``, ``device`` and
  ``generation_args``.

  Args:
    batch_of_prompts: Prompts to tokenize together (padded to a common length).

  Returns:
    List of decoded generations with special tokens removed, one per
    generated sequence.
  """
  encoded = tokenizer(batch_of_prompts, return_tensors="pt", padding=True)
  encoded = encoded.to(device)
  generated = model.generate(**encoded, **generation_args)
  texts = []
  for sequence in generated:
    texts.append(tokenizer.decode(sequence, skip_special_tokens=True))
  return texts

def store_result(out, prompting_type, directory, data, dataset_name):
  """Attach predictions to the dataset, write them to CSV and return the frame.

  The file is written to
  ``<directory>/TEST<dataset_name>_<len(out)>_<prompting_type>.csv`` and its
  path is printed.

  Args:
    out: Predictions, one per row of ``data`` and in the same order.
    prompting_type: Name of the prompt template, used in the file name.
    directory: Output folder; created if it does not exist yet.
    data: DataFrame the predictions belong to.
    dataset_name: Dataset code, used in the file name.

  Returns:
    DataFrame with the columns of ``data`` plus a "pred" column.
  """
  res_filename = directory + f"/TEST{dataset_name}_{len(out)}_{prompting_type}.csv"
  if directory:
    # to_csv does not create missing folders.
    os.makedirs(directory, exist_ok=True)
  preds = pd.DataFrame(out, columns=["pred"])
  # concat(axis=1) aligns on index labels; reset the index so predictions are
  # paired with rows by position even when `data` was filtered or sliced.
  res = pd.concat([data.reset_index(drop=True), preds], axis=1)
  res.to_csv(res_filename,index=False)
  print(res_filename)
  return res


## Run inference over all types of prompts

In [26]:
# NOTE: the trailing `break` stops after the first template, so only one prompt
# type is actually run; its raw predictions are printed and nothing is saved.
for prompting_type, prompt_template in prompt_templates.items():
  prompts = charge_example(prompt_template, data["text"])
  out = []
  for batch in turn_into_batch(prompts, batch_size=500):
    out.extend(inference(batch))
  # res = store_result(out, prompting_type, directory, data, dataset_name)
  print(out)
  break




Processing 1th batch
['NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'YES', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO', 'NO']


In [18]:
# Switch to the prompt file stored in the per-model folder and show its path.
model_name = "google/flan-t5-large"
directory = model_name.split("/")[1]
prompt_templates = read_prompt_template(f"{directory}/prompts.txt")
print(f"{directory}/prompts.txt")


flan-t5-large/prompts.txt


In [15]:
def main():
    """Run one prompt type over one dataset and save the predictions.

    The checkpoint is NOT chosen here: ``inference`` always uses the
    module-level ``model`` and ``tokenizer``, so this function reads the
    module-level ``model_name`` to locate the per-model folder. That keeps the
    prompt file and the result file labelled with the notebook's configured
    checkpoint. (The previous version loaded a second tokenizer/model into
    local variables that were never used nor moved to the device.)

    Returns:
        DataFrame returned by ``store_result`` (dataset columns plus "pred").
    """
    # Choose dataset and prompting type
    dataset_name = "CN"
    prompting_type = "P1EN_binary_v1"

    data = load_dataset(dataset_name)
    directory = model_name.split("/")[1]

    prompt_templates = read_prompt_template(directory + "/prompts.txt")
    prompts = charge_example(prompt_templates[prompting_type], data["text"])

    # Aim for roughly ten batches, but never let the batch size drop to 0
    # (which happened with fewer than 10 prompts and broke the batching).
    batch_size = max(1, len(prompts) // 10)

    out = []
    for batch in turn_into_batch(prompts, batch_size=batch_size):
        out.extend(inference(batch))
    return store_result(out, prompting_type, directory, data, dataset_name)

if __name__=="__main__":
    main()





Processing 1th batch
Processing 2th batch
Processing 3th batch
Processing 4th batch
Processing 5th batch
Processing 6th batch
Processing 7th batch
Processing 8th batch
Processing 9th batch
Processing 10th batch
Processing 11th batch
flan-t5-large/TESTCN_874_P1EN_binary_v1.csv
