In [1]:

import pandas as pd
import random
import time

from LLM.gpt import send_requestGPT
from LLM.gemini import send_requestGemini
from LLM.llama import send_requestLlama

  from .autonotebook import tqdm as notebook_tqdm


Check how many unique texts there are in the dataset

In [None]:
def unique_values_MASTER(path="./data/preprocessed/MASTER.csv"):
    """Print and return the number of distinct values in the 'text' column.

    Args:
        path: CSV file to inspect. Defaults to the preprocessed MASTER file,
            so existing zero-argument calls behave as before.

    Returns:
        The number of distinct non-null values in the 'text' column
        (``Series.nunique`` ignores NaN by default).
    """
    df = pd.read_csv(path)
    unique_text_count = df['text'].nunique()
    print(f"Number of unique values in 'text' column: {unique_text_count}")
    return unique_text_count

In [7]:
# Count remaining NaN values (cells without a model answer yet) in MASTER
MODEL_COLUMNS = [
    'gpt-4o-mini',
    'gpt-4-0125-preview',
    'gpt-3.5-turbo-0125',
    'llama3.2-3b',
    'llama3.1-8b',
    'llama3-8b',
    'gemini-1.5-flash',
]
TAB_WIDTH = 8       # width of one tab stop in the rendered output
VALUE_TAB_STOP = 48  # tab stop that every count is aligned to

df = pd.read_csv("./data/result/MASTER.csv")

# Compute each per-column NaN count once; the same Series feeds the total below.
remaining = df[MODEL_COLUMNS].isnull().sum()

for column in MODEL_COLUMNS:
    label = f"Remaining Texts '{column}' column: "
    # Ceil-divide the distance to the shared tab stop: short names need two
    # tabs, long names one, which reproduces the hand-aligned layout.
    tabs = "\t" * max(1, -(-(VALUE_TAB_STOP - len(label)) // TAB_WIDTH))
    print(f"{label}{tabs} {remaining[column]} out of {len(df)}")

# print total remaining
print(f"Total remaining texts: {remaining.sum()} out of {len(df) * len(MODEL_COLUMNS)}")


Remaining Texts 'gpt-4o-mini' column: 		 0 out of 6080
Remaining Texts 'gpt-4-0125-preview' column: 	 0 out of 6080
Remaining Texts 'gpt-3.5-turbo-0125' column: 	 0 out of 6080
Remaining Texts 'llama3.2-3b' column: 		 0 out of 6080
Remaining Texts 'llama3.1-8b' column: 		 0 out of 6080
Remaining Texts 'llama3-8b' column: 		 0 out of 6080
Remaining Texts 'gemini-1.5-flash' column: 	 0 out of 6080
Total remaining texts: 0 out of 42560


In [3]:
def populate_row(model):
    """Fill the first missing prediction in the ``model`` column of MASTER.csv.

    Loads ``./data/result/MASTER.csv``, takes the first row whose ``model``
    cell is NaN, sends that row's 'text' to the backend selected by the model
    name ("gemini" -> Gemini, "llama" -> Llama, anything else -> GPT) and
    stores the answer in that cell.

    Gemini errors are handled explicitly:
      * an error whose message contains "429" leaves the cell as NaN so a
        later call retries it, and the CSV is not rewritten;
      * any other error is reported and recorded as -1.
    Errors from the Llama and GPT backends propagate to the caller.

    The CSV is rewritten only when a cell was actually updated.

    Args:
        model: Model name; must also be the name of a column in MASTER.csv.
    """
    csv_path = "./data/result/MASTER.csv"
    df = pd.read_csv(csv_path)
    system_prompt = "You are an assistant trained to identify if text contains sexism. Answer ONLY with '1' for Yes or '0' for No. ONLY CHECK FOR SEXISM AND NOT OTHER FORMS OF HATE SPEECH."

    nan_rows = df[df[model].isna()]
    if nan_rows.empty:
        print(f"No NaN values found in {model} column.")
        return  # nothing changed, so skip the pointless rewrite of the CSV

    sequential_index = nan_rows.index[0]
    text_value = df.at[sequential_index, 'text']
    sexism = df.at[sequential_index, 'SEXISM']

    if "gemini" in model:
        try:
            value = send_requestGemini(model, system_prompt, text_value)
        except Exception as e:
            if "429" in str(e):
                # Rate limited: keep the cell NaN so the next call retries it.
                print(f"Rate limited (429) at index {sequential_index + 1}; leaving NaN for retry.")
                return
            # Any other failure is recorded as -1, but no longer silently.
            print(f"Gemini request failed at index {sequential_index + 1}: {e}; recording -1.")
            value = -1
    elif "llama" in model:
        value = send_requestLlama(model, system_prompt, text_value)
    else:
        value = send_requestGPT(model, system_prompt, text_value)

    df.at[sequential_index, model] = value
    print(f"Populated NaN at index {sequential_index + 1} with value {value} vs. {sexism} for text: {text_value}") 
    df.to_csv(csv_path, index=False)


In [None]:
# model options: "gpt-4o-mini", "gpt-4-0125-preview", "gpt-3.5-turbo-0125"
# Run populate_row 1000 times for the chosen GPT model, echoing the pass number.
GPT_MODEL = "gpt-4o-mini"
GPT_PASSES = 1000
for i in range(GPT_PASSES):
    print(f"{i}: ")
    populate_row(GPT_MODEL)

In [None]:
# LLAMA
# Run populate_row 1000 times for the chosen Llama model.
LLAMA_MODEL = "llama3.2-3b"
LLAMA_PASSES = 1000
for i in range(LLAMA_PASSES):
    populate_row(LLAMA_MODEL)

In [4]:
# GEMINI
# Run populate_row 500 times for the Gemini model, pausing between calls.
GEMINI_MODEL = "gemini-1.5-flash"
GEMINI_PASSES = 500
GEMINI_PAUSE_SECONDS = 4  # to avoid rate limiting
for i in range(GEMINI_PASSES):
    populate_row(GEMINI_MODEL)
    time.sleep(GEMINI_PAUSE_SECONDS)

Populated NaN at index 5230 with value 0 vs. 0 for text: @user conveniently forget shithole countries  bad hombres on? man jekyl @url
Populated NaN at index 5231 with value 0 vs. 0 for text: gab retard went school shooting im going reminded next 30 years
Populated NaN at index 5234 with value 0 vs. 0 for text: @user @user shut fuck faggot
Populated NaN at index 5235 with value 0 vs. 0 for text: @user fuck nigga whoever stole shit year faggot ass niggas dont fuck 2019 nig @url
Populated NaN at index 5236 with value 0 vs. 0 for text: rondo mcgee retarded
Populated NaN at index 5237 with value 1 vs. 1 for text: @user @user shes feminazi linebacker like you?
Populated NaN at index 5238 with value 1 vs. 0 for text: twitter allows #redneck #president racist sexist tries stop #retard cant eat @url
Populated NaN at index 5239 with value 1 vs. 0 for text: kasey fucking dyke someone please get twt
Populated NaN at index 5240 with value 0 vs. 0 for text: @user @user @user bad - rise shithole coun

# PARALLELISM

In [3]:
from concurrent.futures import ThreadPoolExecutor
def populate_row_for_models(row, models=None):
    """Fill every missing model prediction in a single row.

    For each model whose cell in ``row`` is NaN, the row's 'text' is sent to
    the backend selected by the model name ("gemini" -> Gemini, "llama" ->
    Llama, anything else -> GPT) and the answer is stored in the row.

    Gemini errors follow the same policy as ``populate_row``: a message
    containing "429" leaves the cell as NaN for a later retry, any other
    error is reported and recorded as -1. Errors from the Llama and GPT
    backends propagate to the caller.

    Args:
        row: A pandas Series with at least 'ID', 'text' and one entry per
            model name. It is modified in place.
        models: Model/column names to fill. Defaults to the GPT-4, GPT-3.5
            and Llama columns this function has always processed.

    Returns:
        The same ``row`` object, with the filled-in values.
    """
    if models is None:
        models = ['gpt-4-0125-preview','gpt-3.5-turbo-0125','llama3.2-3b','llama3.1-8b','llama3-8b']

    system_prompt = "You are an assistant trained to identify if text contains sexism. Answer ONLY with '1' for Yes or '0' for No. ONLY CHECK FOR SEXISM AND NOT OTHER FORMS OF HATE SPEECH."

    # Get text and other details
    text_value = row['text']

    # Process each model's NaN separately
    for model in models:
        if not pd.isna(row[model]):
            continue  # already answered, nothing to request

        if "gemini" in model:
            try:
                row[model] = send_requestGemini(model, system_prompt, text_value)
            except Exception as e:
                # `except Exception` (not a bare except) so that Ctrl-C and
                # SystemExit still stop the run.
                if "429" in str(e):
                    # Rate limited: leave NaN so the row is picked up again.
                    print(f"Row {row['ID']}: {model} rate limited (429); leaving NaN for retry.")
                else:
                    print(f"Row {row['ID']}: {model} request failed: {e}; recording -1.")
                    row[model] = -1
        elif "llama" in model:
            row[model] = send_requestLlama(model, system_prompt, text_value)
        else:
            row[model] = send_requestGPT(model, system_prompt, text_value)

    print(f"Row {row['ID']} processed")

    return row

# Main function to populate rows with NaN in parallel
def populate_rows_in_parallel(max_rows):
    """Fill missing model predictions for up to ``max_rows`` rows concurrently.

    Reads ``./data/result/MASTER.csv``, selects the first ``max_rows`` rows
    that still have a NaN in one of the columns handled by
    ``populate_row_for_models``, processes them in a thread pool, copies the
    answers back into the frame (matched on 'ID') and saves the CSV.

    Only columns that ``populate_row_for_models`` fills by default are used
    for the selection. A row that is missing a value in some other column
    alone can never be completed here, so selecting it would make every
    batch pick the same rows again.

    If a worker raises, the rows that did finish are still saved and the
    first error is re-raised afterwards.

    Args:
        max_rows: Maximum number of rows to process in this batch.

    Raises:
        Exception: The first exception raised by any worker, re-raised after
            the successfully processed rows have been written.
    """
    csv_path = "./data/result/MASTER.csv"
    # Must mirror the default model list of populate_row_for_models.
    model_columns = ['gpt-4-0125-preview','gpt-3.5-turbo-0125','llama3.2-3b','llama3.1-8b','llama3-8b']

    df = pd.read_csv(csv_path)

    # Filter rows with NaN in any model column that can actually be filled
    rows_with_nan = df[df[model_columns].isna().any(axis=1)].head(max_rows)
    if rows_with_nan.empty:
        print("No rows with NaN values left to process.")
        return  # nothing to do, so do not rewrite the CSV

    # Process each row in parallel; leaving the `with` block waits for all workers
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(populate_row_for_models, row)
            for _, row in rows_with_nan.iterrows()
        ]

    # Keep every row that finished; remember the first failure for later.
    updated_rows = []
    first_error = None
    for future in futures:
        try:
            updated_rows.append(future.result())
        except Exception as exc:
            if first_error is None:
                first_error = exc

    # Update the original DataFrame with the modified rows (model columns only,
    # the other columns are never changed by the worker)
    for updated_row in updated_rows:
        df.loc[df['ID'] == updated_row['ID'], model_columns] = updated_row[model_columns].values

    # Save the updated DataFrame back to the CSV
    if updated_rows:
        df.to_csv(csv_path, index=False)

    if first_error is not None:
        raise first_error

In [6]:
# Run ten batches of 50 rows each, waiting a minute after every batch.
BATCH_SIZE = 50
BATCH_COUNT = 10
BATCH_PAUSE_SECONDS = 60
for i in range(BATCH_COUNT):
    populate_rows_in_parallel(BATCH_SIZE)
    print(f"Processed {BATCH_SIZE * (i + 1)} rows")
    time.sleep(BATCH_PAUSE_SECONDS)
    

Row 5800 processed
Row 5855 processed
Row 5854 processed
Row 5858 processed
Row 5856 processed
Row 5850 processed
Row 5853 processed
Row 5852 processed
Row 5857 processed
Row 5851 processed
Row 5859 processed
Row 5860 processed
Row 5862 processed
Row 5863 processed
Row 5861 processed
Row 5864 processed
Row 5869 processed
Row 5867 processed
Row 5868 processed
Row 5866 processed
Row 5871 processed
Row 5865 processed
Row 5870 processed
Row 5872 processed
Row 5876 processed
Row 5875 processed
Row 5873 processed
Row 5877 processed
Row 5880 processed
Row 5878 processed
Row 5879 processed
Row 5874 processed
Row 5881 processed
Row 5883 processed
Row 5882 processed
Row 5887 processed
Row 5884 processed
Row 5886 processed
Row 5888 processed
Row 5890 processed
Row 5893 processed
Row 5889 processed
Row 5894 processed
Row 5892 processed
Row 5885 processed
Row 5891 processed
Row 5895 processed
Row 5896 processed
Row 5897 processed
Row 5898 processed
Processed 50 rows
Row 5909 processed
Row 5910 proc