In [None]:

import pandas as pd
import random
import time

from LLM.gpt import send_requestGPT
from LLM.gemini import send_requestGemini
from LLM.llama import send_requestLlama

Check how many unique texts there are in the dataset

In [None]:
def unique_values_MASTER(csv_path="./data/preprocessed/MASTER.csv"):
    """Print how many distinct values the 'text' column of a dataset CSV holds.

    Missing values are not counted, because `Series.nunique` drops NaN by default.

    Args:
        csv_path: CSV file to inspect; it must contain a 'text' column.
            Defaults to the preprocessed MASTER file.

    Returns:
        None. The count is reported via print.
    """
    df = pd.read_csv(csv_path)
    unique_text_count = df['text'].nunique()
    print(f"Number of unique values in 'text' column: {unique_text_count}")

In [None]:
# Report, for every model column, how many rows of the results file are still NaN.
df = pd.read_csv("./data/result/MASTER.csv")

# Each column is paired with the tab padding that lines up the printed counts.
for column, padding in [
    ("gpt-4o-mini", "\t\t "),
    ("gpt-4-0125-preview", "\t "),
    ("gpt-3.5-turbo-0125", "\t "),
    ("llama3.2-3b", "\t\t "),
    ("llama3.1-8b", "\t\t "),
    ("llama3-8b", "\t\t "),
    ("gemini-1.5-flash", "\t "),
]:
    missing = df[column].isnull().sum()
    print(f"Remaining Texts '{column}' column: {padding}{missing} out of {len(df)}")


In [None]:
def populate_row(model, csv_path="./data/result/MASTER.csv"):
    """Label one randomly chosen unanswered text with `model` and save the result.

    The results CSV is loaded, a random row whose `model` column is NaN is picked,
    its text is sent to the matching backend and the answer is stored in that cell.
    The backend is selected by substring of the model name: "gemini" -> Gemini,
    "llama" -> Llama, anything else -> GPT.

    Args:
        model: Model name; it must also be a column of the CSV.
        csv_path: Results CSV that is read and, when a cell was filled,
            overwritten in place. Defaults to the shared MASTER results file.

    Returns:
        None. Progress is reported via print.
    """
    system_prompt = "You are an assistant trained to identify if text contains sexism. Answer ONLY with '1' for Yes or '0' for No. ONLY CHECK FOR SEXISM AND NOT OTHER FORMS OF HATE SPEECH."

    df = pd.read_csv(csv_path)
    nan_rows = df[df[model].isna()]

    if nan_rows.empty:
        # Nothing was modified, so there is no reason to rewrite the file.
        print(f"No NaN values found in {model} column.")
        return

    # Hand random.choice a plain list: the function may truth-test its argument
    # (CPython 3.11 did), and bool() of a multi-element pandas Index raises.
    random_index = random.choice(nan_rows.index.tolist())
    text_value = df.at[random_index, 'text']
    sexism = df.at[random_index, 'SEXISM']

    if "gemini" in model:
        value = send_requestGemini(model, system_prompt, text_value)
    elif "llama" in model:
        value = send_requestLlama(model, system_prompt, text_value)
    else:
        value = send_requestGPT(model, system_prompt, text_value)

    df.at[random_index, model] = value
    # The reported position is the index label plus one.
    print(f"Populated NaN at index {random_index + 1} with value {value} vs. {sexism} for text: {text_value}") 

    df.to_csv(csv_path, index=False)


In [None]:
# GPT: make 1000 populate_row calls, printing the attempt number before each one.
# Available GPT columns: "gpt-4o-mini", "gpt-4-0125-preview", "gpt-3.5-turbo-0125"
for attempt in range(1000):
    print(f"{attempt}: ")
    populate_row("gpt-4o-mini")

In [None]:
# Llama: make 1000 populate_row calls, each labelling at most one unanswered text.
for attempt in range(1000):
    populate_row("llama3.2-3b")

In [None]:
# Gemini: a failed request is reported and skipped so it does not abort the run.
for attempt in range(1000):
    try:
        populate_row("gemini-1.5-flash")
    except Exception as err:
        print(f"Error: {err}")
    # Pause after every attempt, successful or not, to avoid rate limiting.
    time.sleep(4)

# PARALLELISM

In [None]:
from concurrent.futures import ThreadPoolExecutor
def populate_row_for_models(row, models=None):
    """Fill in every missing model answer of a single dataset row.

    For each model whose cell in `row` is NaN, the row's text is sent to the
    matching backend ("gemini" in the name -> Gemini, "llama" -> Llama,
    otherwise GPT) and the answer is stored in that cell. Cells that already
    hold a value are left alone.

    Args:
        row: pandas Series with 'ID', 'text' and one entry per model. It is
            modified in place.
        models: Model/column names to process. Defaults to the three GPT and
            three Llama columns.

    Returns:
        The same `row` object, with the previously missing answers filled in.

    Raises:
        Whatever the Llama/GPT request functions raise. A failing Gemini
        request does not raise; it is recorded as -1 in the cell.
    """
    system_prompt = "You are an assistant trained to identify if text contains sexism. Answer ONLY with '1' for Yes or '0' for No. ONLY CHECK FOR SEXISM AND NOT OTHER FORMS OF HATE SPEECH."

    if models is None:
        models = ['gpt-4o-mini', 'gpt-4-0125-preview', 'gpt-3.5-turbo-0125',
                  'llama3.2-3b', 'llama3.1-8b', 'llama3-8b']

    text_value = row['text']

    for model in models:
        if not pd.isna(row[model]):
            continue  # already answered

        if "gemini" in model:
            try:
                row[model] = send_requestGemini(model, system_prompt, text_value)
            except Exception:
                # Best effort: mark the failure with -1. Only Exception is
                # caught so that Ctrl-C / SystemExit still stop the run.
                row[model] = -1
        elif "llama" in model:
            row[model] = send_requestLlama(model, system_prompt, text_value)
        else:
            row[model] = send_requestGPT(model, system_prompt, text_value)

    print(f"Row {row['ID']} processed")

    return row

def populate_rows_in_parallel(max_rows, csv_path="./data/result/MASTER.csv"):
    """Fill missing model answers for up to `max_rows` rows using a thread pool.

    Rows with a NaN in any of the GPT/Llama model columns are selected (first
    `max_rows` in file order), each is handed to `populate_row_for_models` on a
    worker thread, and the returned answers are copied back into the frame
    before it is saved.

    If a worker raises, the exception surfaces while the results are collected
    and the file is not rewritten.

    Args:
        max_rows: Maximum number of incomplete rows to process in this call.
        csv_path: Results CSV that is read and, when rows were processed,
            overwritten in place. Defaults to the shared MASTER results file.

    Returns:
        None.
    """
    model_columns = ['gpt-4o-mini', 'gpt-4-0125-preview', 'gpt-3.5-turbo-0125',
                     'llama3.2-3b', 'llama3.1-8b', 'llama3-8b']

    df = pd.read_csv(csv_path)

    # Rows that still lack an answer from at least one model.
    pending = df[df[model_columns].isna().any(axis=1)].head(max_rows)
    if pending.empty:
        # Nothing to process, so leave the file untouched.
        return

    with ThreadPoolExecutor() as executor:
        updated_rows = list(
            executor.map(populate_row_for_models, [row for _, row in pending.iterrows()])
        )

    # Write back by index label and only into the model columns. Matching on
    # 'ID' and overwriting every column would clobber other rows that share the
    # same ID (including their text).
    for updated_row in updated_rows:
        for column in model_columns:
            df.at[updated_row.name, column] = updated_row[column]

    df.to_csv(csv_path, index=False)

In [None]:
# Process at most ten incomplete rows in this run.
populate_rows_in_parallel(max_rows=10)