In [1]:
import os
import openai
import tiktoken
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from openai import OpenAI
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    f1_score,
    precision_score
)
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

#### Necessary pip installation commands include:
- pip install os
- pip install openai
- pip install pandas
- pip install numpy
- pip install python-dotenv


#### <b>Note</b>: have a <b>.env</b> file already created for accessing API key

In [2]:
# retrieving dataset utilized for evaluation
df = pd.read_csv('../../data/csv/toxic_classification_evaluation_set.csv')
df['Toxic'] = df['Toxic'].apply(lambda x: str(np.int64(x)))
df

Unnamed: 0,Text,Toxic
0,believe section Polynomials Maximal LFSRs remo...,0
1,best source Dzierzon biography L Brożek Jan Dz...,0
2,First cover current run Batista cover second i...,0
3,Reviewer Hello account granted reviewer userri...,0
4,Either wikipedia letting anyone become adminis...,1
...,...,...
995,didnt say anything fight Niteshift need put ch...,1
996,P4k Wikipedia delete things people worked worl...,1
997,Shii act intimidation concept intimidation mea...,0
998,would also like say masturbate also gay lol,1


In [3]:
# retrieving API Key from OpenAI platform
load_dotenv()
# Initialize the tokenizer for GPT-3.5 Turbo and GPT-4
tokenizer = tiktoken.get_encoding("cl100k_base")
input_token_cost = ((2.50 / 1000000))
output_token_cost = ((10.00 / 1000000))
REGULAR_TOXIC_DETECTION_API_KEY = os.getenv("REGULAR_TOXIC_DETECTION_API_KEY")

In [4]:
# instantiating client with API key
general = OpenAI(
    api_key=REGULAR_TOXIC_DETECTION_API_KEY
)

In [5]:
# creating container for holding model completions for comparison
original = pd.DataFrame(columns=['Text', 'Toxic'])
tokens = pd.DataFrame(columns=['Prompt', 'Input', 'Output', 'Total'])

In [6]:
evaluation_prompt = ''
# Read the evaluation prompt from the text file with utf-8 encoding
with open("../../data/text/prompt_engineering/gpt-4-two-shot-prompt.txt", "r", encoding="utf-8") as file:
    evaluation_prompt += file.read()

In [7]:
evaluation_prompt

'Classify these comments as either toxic (1) non-toxic (0). Toxic comments contain offensive, harmful, or abusive language. Label 1 for toxic and 0 for non-toxic\n\nExamples for Reference:\n\nText: “I disagree with your point, but I respect your view.”\n\nLabel: 0\n\nText: “Go back to where you came from.”\n\nLabel: 1\n\nClassify the following comment:'

In [8]:
# processing original GPT-4o model across entire dataset with advanced prompt
for index, row in df.iterrows():
    completion = general.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system", 
                "content": evaluation_prompt
            },
            {
                "role": "user",
                "content": row['Text']
            }
        ]
    )
    original.loc[index] = [row['Text'], completion.choices[0].message.content]
    # tokenize the string
    tokens_object = {
        'prompt': len(tokenizer.encode(evaluation_prompt)),
        'input': len(tokenizer.encode(row['Text'])),
        'output': len(tokenizer.encode(completion.choices[0].message.content))
    }
    total_tokens = tokens_object['prompt'] + tokens_object['input'] + tokens_object['output']
    tokens.loc[index] = [tokens_object['prompt'], tokens_object['input'], tokens_object['output'], total_tokens]


In [9]:
tokens

Unnamed: 0,Prompt,Input,Output,Total
0,86,37,1,124
1,86,52,1,139
2,86,32,1,119
3,86,105,1,192
4,86,14,1,101
...,...,...,...,...
995,86,25,1,112
996,86,34,1,121
997,86,78,1,165
998,86,9,1,96


In [10]:
original

Unnamed: 0,Text,Toxic
0,believe section Polynomials Maximal LFSRs remo...,0
1,best source Dzierzon biography L Brożek Jan Dz...,0
2,First cover current run Batista cover second i...,0
3,Reviewer Hello account granted reviewer userri...,0
4,Either wikipedia letting anyone become adminis...,1
...,...,...
995,didnt say anything fight Niteshift need put ch...,1
996,P4k Wikipedia delete things people worked worl...,1
997,Shii act intimidation concept intimidation mea...,0
998,would also like say masturbate also gay lol,1


In [11]:
# comparing results of model to dataset
compare = original['Toxic'] == df['Toxic']
gptfour_original_accuracy = compare.values.sum() / compare.size

In [12]:
compare

0      True
1      True
2      True
3      True
4      True
       ... 
995    True
996    True
997    True
998    True
999    True
Name: Toxic, Length: 1000, dtype: bool

In [13]:
# GPT-4o model accuracy with advanced prompted engineering (role prompting, two-shot examples) 90.20%
print(f"Accuracy: {gptfour_original_accuracy * 100:.2f}%")

Accuracy: 90.20%


In [14]:
# total token input cost: $0.30145750000000004
((tokens['Input'].sum() + tokens['Prompt'].sum()) * input_token_cost)

0.30145750000000004

In [15]:
# total token output cost: $0.010150000000000001
(tokens['Output'].sum() * output_token_cost)

0.010150000000000001

In [16]:
# complete total token cost: $0.31160750000000004
((tokens['Input'].sum() + tokens['Prompt'].sum()) * input_token_cost) + (tokens['Output'].sum() * output_token_cost)

0.31160750000000004

In [17]:
# 121598
tokens['Total'].sum()

121598