In [1]:
import os
import openai
import pandas as pd
import numpy as np
import tiktoken
from dotenv import load_dotenv
from openai import OpenAI
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    f1_score,
    precision_score
)
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

#### Necessary pip installation commands include:
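- pip install openai
- pip install pandas
- pip install numpy
- pip install python-dotenv
- pip install tiktoken
- pip install scikit-learn
- `os` is part of the Python standard library and does not need to be installed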


#### <b>Note</b>: create a <b>.env</b> file beforehand that defines the API keys `REGULAR_TOXIC_DETECTION_API_KEY` and `FINE_TUNED_TOXIC_DETECTION_API_KEY`

In [2]:
# Load the evaluation set and store the 'Toxic' labels as integer strings,
# so they can later be compared directly with the model's text replies.
df = pd.read_csv('../../data/csv/semi_toxic_classification_evaluation_set.csv')
df['Toxic'] = df['Toxic'].astype(np.int64).astype(str)
df

Unnamed: 0,Text,Toxic
0,Date When did this algorithm get invented ?,0
1,answered them many times ! but they didnt answ...,0
2,IAC There is a discussion on article 's talk p...,0
3,Spandam 's laugh ? I 'm listening to it and al...,0
4,AND HAVING A FUCKIN ' BITCH ASS DAY '' ',1
...,...,...
995,"Oh , well Harry finally got his payback on me ...",1
996,this page is about random crap talk .. just ed...,1
997,I 've reverted the editsBOTH OF THEM VIOLATED ...,0
998,"I used the exact same bullshit you wrote , hyp...",1


In [3]:
# Load the variables defined in the .env file into the process environment.
load_dotenv()

# tokenizer for GPT-3.5 Turbo
tokenizer = tiktoken.get_encoding("cl100k_base")

# Per-token prices: 0.50 per million input tokens, 1.50 per million output tokens.
input_token_cost = 0.50 / 1_000_000
output_token_cost = 1.50 / 1_000_000

# API keys read from the environment (None when a variable is not set).
FINE_TUNED_TOXIC_DETECTION_API_KEY = os.environ.get("FINE_TUNED_TOXIC_DETECTION_API_KEY")
REGULAR_TOXIC_DETECTION_API_KEY = os.environ.get("REGULAR_TOXIC_DETECTION_API_KEY")

In [4]:
# OpenAI client authenticated with the regular (non-fine-tuned) API key
general = OpenAI(api_key=REGULAR_TOXIC_DETECTION_API_KEY)

In [5]:
# Empty containers for the model completions (`original`) and the per-row
# token counts (`tokens`) used for the comparison and cost figures.
completion_columns = ['Text', 'Toxic']
token_columns = ['Prompt', 'Input', 'Output', 'Total']
original = pd.DataFrame(columns=completion_columns)
tokens = pd.DataFrame(columns=token_columns)

In [6]:
# processing original gpt-3.5-turbo-0125 model across entire dataset
#
# For every row of `df` the text is sent to the model together with a fixed
# system prompt. The reply and tiktoken-based token counts are collected and,
# once the loop has finished, bound to `original` (columns Text/Toxic) and
# `tokens` (columns Prompt/Input/Output/Total), both indexed like `df`.

# Defined once so the prompt that is sent and the prompt that is counted
# cannot drift apart.
system_prompt = "Detect whether either is 1 for toxic or 0 for non-toxic"
# Loop-invariant: the system prompt tokenizes to the same length for every row.
prompt_token_count = len(tokenizer.encode(system_prompt))

# Rows are accumulated in plain lists and converted to DataFrames in one step
# after the loop instead of enlarging a DataFrame one row at a time.
completion_rows = []
token_rows = []
for index, row in df.iterrows():
    completion = general.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": row['Text']},
        ],
    )
    # `message.content` is optional in the API response; fall back to an empty
    # string so a missing reply cannot crash the tokenizer mid-run. An empty
    # reply never equals a '0'/'1' label, so it still counts as a mismatch.
    reply = completion.choices[0].message.content or ""
    completion_rows.append([row['Text'], reply])

    # Local estimates: 'Input' is the system prompt joined to the text with
    # ". ", 'Output' is the reply, 'Total' is their sum.
    # NOTE(review): the API response also carries a `usage` object with the
    # token counts it reports - TODO confirm these estimates against it.
    input_token_count = len(tokenizer.encode(f"{system_prompt}. {row['Text']}"))
    output_token_count = len(tokenizer.encode(reply))
    token_rows.append([
        prompt_token_count,
        input_token_count,
        output_token_count,
        input_token_count + output_token_count,
    ])

# Reuse df's index so each result row lines up with its source row.
original = pd.DataFrame(completion_rows, columns=['Text', 'Toxic'], index=df.index)
tokens = pd.DataFrame(token_rows, columns=['Prompt', 'Input', 'Output', 'Total'], index=df.index)


In [None]:
# display the per-row token counts (Prompt, Input, Output, Total)
tokens

Unnamed: 0,Prompt,Input,Output,Total
0,15,24,1,25
1,15,36,2,38
2,15,29,9,38
3,15,47,9,56
4,15,29,1,30
...,...,...,...,...
995,15,182,2,184
996,15,28,1,29
997,15,34,7,41
998,15,28,5,33


In [None]:
# display each text alongside the model's raw reply
original

Unnamed: 0,Text,Toxic
0,Date When did this algorithm get invented ?,0
1,answered them many times ! but they didnt answ...,Toxic
2,IAC There is a discussion on article 's talk p...,This statement does not contain any toxic lang...
3,Spandam 's laugh ? I 'm listening to it and al...,This text does not contain any toxic content.
4,AND HAVING A FUCKIN ' BITCH ASS DAY '' ',1
...,...,...
995,"Oh , well Harry finally got his payback on me ...",Toxic
996,this page is about random crap talk .. just ed...,1
997,I 've reverted the editsBOTH OF THEM VIOLATED ...,Both sentences are non-toxic.
998,"I used the exact same bullshit you wrote , hyp...",Toxic: 1


In [None]:
# Reduce the completions frame to its 'Toxic' column (the model's raw replies).
# Guarded so the cell can be re-run: once `original` is already a Series,
# indexing it with 'Toxic' again would raise a KeyError.
if isinstance(original, pd.DataFrame):
    original = original['Toxic']
# NOTE: despite its name, `predictions` holds the dataset's own labels
# (df['Toxic']), i.e. the reference values the model replies are checked against.
predictions = df['Toxic']

In [None]:
# Exact, element-wise string comparison of each model reply with its label.
# Any reply that is not literally the label (e.g. "Toxic: 1") counts as a miss.
compare = original == predictions
# Share of matching rows: the mean of a boolean Series is its fraction of True.
gptthreefive_original_accuracy = compare.mean()

In [None]:
# display the per-row match flags (True where the reply equals the label)
compare

0       True
1      False
2      False
3      False
4       True
       ...  
995    False
996     True
997    False
998    False
999    False
Name: Toxic, Length: 1000, dtype: bool

In [None]:
# Report the exact-match accuracy of gpt-3.5-turbo-0125 as a percentage
# (recorded run: 28.70%).
accuracy_percent = gptthreefive_original_accuracy * 100
print(f"Accuracy: {accuracy_percent:.2f}%")

Accuracy: 28.70%


In [None]:
# estimated cost of all input tokens (recorded run: $0.042138499999999995)
input_token_cost * tokens['Input'].sum()

0.042138499999999995

In [None]:
# estimated cost of all output tokens (recorded run: $0.011628)
output_token_cost * tokens['Output'].sum()

0.011628

In [None]:
# estimated combined cost of input and output tokens
# (recorded run: $0.053766499999999995)
input_cost = input_token_cost * tokens['Input'].sum()
output_cost = output_token_cost * tokens['Output'].sum()
input_cost + output_cost

0.053766499999999995

In [None]:
# total number of tokens (input + output) summed over all rows
# (recorded run: 92029)
tokens['Total'].sum()

92029