In [1]:
import os
import openai
import pandas as pd
import numpy as np
import tiktoken
from dotenv import load_dotenv
from openai import OpenAI
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    f1_score,
    precision_score
)
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

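#### Necessary pip installation commands include:
- pip install openai
- pip install pandas
- pip install numpy
- pip install tiktoken
- pip install python-dotenv
- pip install scikit-learn

(`os` is part of the Python standard library and does not need to be installed.)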


#### <b>Note</b>: a <b>.env</b> file defining `REGULAR_TOXIC_DETECTION_API_KEY` and `FINE_TUNED_TOXIC_DETECTION_API_KEY` must already exist so the API keys can be loaded

In [2]:
# Load the labelled evaluation set that the model is scored against.
EVALUATION_SET_PATH = '../../data/csv/semi_toxic_classification_evaluation_set.csv'
df = pd.read_csv(EVALUATION_SET_PATH)
# Store every label as the string form of its integer value so it can be
# compared directly with the text returned by the model.
df['Toxic'] = [str(np.int64(label)) for label in df['Toxic']]
df

Unnamed: 0,Text,Toxic
0,Date When did this algorithm get invented ?,0
1,answered them many times ! but they didnt answ...,0
2,IAC There is a discussion on article 's talk p...,0
3,Spandam 's laugh ? I 'm listening to it and al...,0
4,AND HAVING A FUCKIN ' BITCH ASS DAY '' ',1
...,...,...
995,"Oh , well Harry finally got his payback on me ...",1
996,this page is about random crap talk .. just ed...,1
997,I 've reverted the editsBOTH OF THEM VIOLATED ...,0
998,"I used the exact same bullshit you wrote , hyp...",1


In [3]:
# Summary statistics for the evaluation set; the output below reports
# count / unique / top / freq for the Text and Toxic columns.
df.describe()

Unnamed: 0,Text,Toxic
count,1000,1000
unique,1000,2
top,Date When did this algorithm get invented ?,0
freq,1,500


In [4]:
# Pull the variables defined in the .env file into the process environment.
load_dotenv()

# API keys read from the environment (os.getenv yields None when a variable is unset).
REGULAR_TOXIC_DETECTION_API_KEY = os.getenv("REGULAR_TOXIC_DETECTION_API_KEY")
FINE_TUNED_TOXIC_DETECTION_API_KEY = os.getenv("FINE_TUNED_TOXIC_DETECTION_API_KEY")

# cl100k_base encoding, used in this notebook to count GPT-3.5 Turbo tokens.
tokenizer = tiktoken.get_encoding("cl100k_base")

# Price of a single token: 1.00 per million input tokens, 2.00 per million output tokens.
input_token_cost = 1.00 / 1_000_000
output_token_cost = 2.00 / 1_000_000

In [5]:
# Client used for the gpt-3.5-turbo-1106 requests below, authenticated with the regular key.
general = OpenAI(api_key=REGULAR_TOXIC_DETECTION_API_KEY)

In [6]:
# Empty result tables, filled one row per evaluated example:
#   original -> the input text and the model's completion for it
#   tokens   -> the token counts recorded for that example
RESULT_COLUMNS = ['Text', 'Toxic']
TOKEN_COLUMNS = ['Prompt', 'Input', 'Output', 'Total']
original, tokens = (
    pd.DataFrame(columns=column_names)
    for column_names in (RESULT_COLUMNS, TOKEN_COLUMNS)
)

In [7]:
# Run the original gpt-3.5-turbo-1106 model over every row of the evaluation
# set, recording its raw reply in `original` and token estimates in `tokens`.

# The system prompt is defined once so the request that is sent and the token
# estimates computed from it can never drift apart.
BASELINE_SYSTEM_PROMPT = "Detect whether either is 1 for toxic or 0 for non-toxic"
# The prompt is identical for every row, so it is tokenized a single time.
prompt_token_count = len(tokenizer.encode(BASELINE_SYSTEM_PROMPT))

for index, row in df.iterrows():
    completion = general.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=[
            {
                "role": "system",
                "content": BASELINE_SYSTEM_PROMPT
            },
            {
                "role": "user",
                "content": row['Text']
            }
        ]
    )
    # The SDK types message.content as optional; fall back to an empty string
    # so a missing reply is recorded as a miss instead of crashing
    # tokenizer.encode part-way through the run.
    reply = completion.choices[0].message.content or ""
    # Rows are written one at a time so that results gathered so far are kept
    # if a later API call fails.
    original.loc[index] = [row['Text'], reply]
    # NOTE(review): these are tiktoken estimates over the plain prompt/text
    # strings, not usage figures reported by the API — confirm this is the
    # intended basis for the cost cells further down.
    tokens_object = {
        'prompt': prompt_token_count,
        'input': len(tokenizer.encode(f"{BASELINE_SYSTEM_PROMPT}. {row['Text']}")),
        'output': len(tokenizer.encode(reply))
    }
    total_tokens = tokens_object['input'] + tokens_object['output']
    tokens.loc[index] = [tokens_object['prompt'], tokens_object['input'], tokens_object['output'], total_tokens]


In [8]:
# Per-example token counts recorded by the loop above (Prompt, Input, Output, Total).
tokens

Unnamed: 0,Prompt,Input,Output,Total
0,15,24,94,118
1,15,36,37,73
2,15,29,35,64
3,15,47,14,61
4,15,29,15,44
...,...,...,...,...
995,15,182,16,198
996,15,28,10,38
997,15,34,10,44
998,15,28,38,66


In [9]:
# Observation: gpt-3.5-turbo-1106 needs a much more specific prompt; more
# instruction is required. gpt-4 answers with just the requested number,
# whereas gpt-3.5 returns a longer free-text reply that was not asked for
# (see the Toxic column in the output below).
original

Unnamed: 0,Text,Toxic
0,Date When did this algorithm get invented ?,"The term ""algorithm"" is derived from the name ..."
1,answered them many times ! but they didnt answ...,It sounds like the situation you're describing...
2,IAC There is a discussion on article 's talk p...,"It seems like you might be trying to type ""I a..."
3,Spandam 's laugh ? I 'm listening to it and al...,"I'm sorry, but I am not able to fulfill that r..."
4,AND HAVING A FUCKIN ' BITCH ASS DAY '' ',"The language used in the input is toxic, and i..."
...,...,...
995,"Oh , well Harry finally got his payback on me ...",The text contains personal attacks and hostile...
996,this page is about random crap talk .. just ed...,"I'm sorry, I cannot fulfill that request."
997,I 've reverted the editsBOTH OF THEM VIOLATED ...,The given statement does not contain any toxic...
998,"I used the exact same bullshit you wrote , hyp...",I'm sorry to hear that you're upset. It's impo...


In [10]:
# Exact-match comparison between each model reply and the reference label.
compare = original['Toxic'] == df['Toxic']
# Accuracy = number of exact matches divided by the number of compared rows.
match_count = compare.to_numpy().sum()
gptthreefive_original_accuracy = match_count / len(compare)

In [11]:
# Row-by-row match flags: True where the model reply equals the label exactly.
compare

0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Name: Toxic, Length: 1000, dtype: bool

In [12]:
# Report the exact-match accuracy of gpt-3.5-turbo-1106 as a percentage
# (0.30% in the recorded run).
accuracy_percent = gptthreefive_original_accuracy * 100
print(f"Accuracy: {accuracy_percent:.2f}%")

Accuracy: 0.30%


In [13]:
# Estimated cost of all input tokens, in dollars
# ($0.08427699999999999 in the recorded run).
input_token_cost * tokens['Input'].sum()

0.08427699999999999

In [14]:
# Estimated cost of all output tokens, in dollars
# ($0.059489999999999994 in the recorded run).
output_token_cost * tokens['Output'].sum()

0.059489999999999994

In [15]:
# Estimated cost of the whole run: input spend plus output spend
# ($0.14376699999999998 in the recorded run).
input_spend = tokens['Input'].sum() * input_token_cost
output_spend = tokens['Output'].sum() * output_token_cost
input_spend + output_spend

0.14376699999999998

In [16]:
# Total number of tokens (input + output) across all examples
# (114022 in the recorded run).
tokens.loc[:, 'Total'].sum()

114022