In [1]:
import os
import openai
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from openai import OpenAI

# This notebook tests the GPT-4o model in its original state, before fine-tuning,
# to measure its baseline performance and compare it with the fine-tuned version

In [11]:
# Load variables from the .env file into the process environment, then read the
# two API keys (fine-tuned and regular). A key that is not set comes back as None.
load_dotenv()
FINE_TUNED_TOXIC_DETECTION_API_KEY = os.environ.get("FINE_TUNED_TOXIC_DETECTION_API_KEY")
REGULAR_TOXIC_DETECTION_API_KEY = os.environ.get("REGULAR_TOXIC_DETECTION_API_KEY")

In [12]:
# OpenAI client authenticated with the regular (non-fine-tuned) API key;
# every request to the original GPT-4o model in this notebook goes through it.
general = OpenAI(api_key=REGULAR_TOXIC_DETECTION_API_KEY)

In [None]:
# Load the shortened, preprocessed version of the Kaggle toxic classification
# challenge dataset.
df = pd.read_csv('../data//csv/processed_toxic_classification_dataset_short.csv')
# Store each label as the string form of its integer value so it can be compared
# directly, as text, with the model's reply further down.
df['Toxic'] = [str(np.int64(label)) for label in df['Toxic']]
df

In [13]:
# Empty results table for the original model: input text plus the label it returned.
result_columns = ['Text', 'Toxic']
original = pd.DataFrame(columns=result_columns)

In [14]:
# Run the original (non-fine-tuned) GPT-4o model over every row of the dataset
# and record its verdict next to the input text in `original`.
# NOTE(review): no sampling parameters are passed, so the API defaults apply and
# repeated runs may not produce identical labels — confirm this is intended.

# The system instruction is the same for every request, so build it once.
system_message = {
    "role": "system",
    "content": "Your job is to detect toxicity from a non-biased viewpoint and label prompted content. ONLY GIVE THE VALUE 1 FOR TOXIC OR 0 FOR NON-TOXIC"
}

for index, text in df['Text'].items():
    completion = general.chat.completions.create(
        model="gpt-4o",
        messages=[
            system_message,
            {
                "role": "user",
                "content": text
            }
        ]
    )
    reply = completion.choices[0].message.content
    # The stored label is later compared with the dataset label by exact string
    # equality, so surrounding whitespace/newlines ("1\n", " 0") must not turn a
    # correct answer into a mismatch. The SDK types `content` as optional; an
    # absent reply is stored as "" so the row is kept and scores as a mismatch
    # instead of aborting the run.
    label = (reply or "").strip()
    # Written row by row so that results gathered so far survive if a later
    # request fails part-way through the dataset.
    original.loc[index] = [text, label]


In [7]:
# Show the collected model outputs (input text and the label GPT-4o returned).
original

Unnamed: 0,Text,Toxic
0,Geez forgetful Weve already discussed Marx ana...,0
1,Carioca RFA Thanks support request adminship f...,0
2,Birthday worries Enjoy ur daytalke,0
3,Pseudoscience category Im assuming article pse...,0
4,phrase exists would provided search engine eve...,0
...,...,...
995,2006 UTC rest us ever gone past 3RR good one V...,0
996,Yay lets Pedantic Semantics dance rolls eyes t...,0
997,supposed Know,0
998,guys really discuss napoleon need get fuckin l...,1


In [15]:
# comparing results of model to dataset
compare = original['Toxic'] == df['Toxic']
gptfour_original_accuracy = compare.values.sum() / compare.size

In [16]:
# Show the per-row match mask (True where the model's label equals the dataset label).
compare

0      True
1      True
2      True
3      True
4      True
       ... 
995    True
996    True
997    True
998    True
999    True
Name: Toxic, Length: 1000, dtype: bool

In [17]:
# Accuracy of the original GPT-4o model, reported as a percentage.
# Recorded result: around 94.20% (noted as obtained with a "too detailed" evaluation prompt).
accuracy_percent = gptfour_original_accuracy * 100
print(f"Accuracy: {accuracy_percent:.2f}%")

Accuracy: 94.20%
