In [1]:
import os
import re
import openai
import pandas as pd
import tiktoken
import numpy as np
from dotenv import load_dotenv
from openai import OpenAI
from langchain.schema import Document
from langchain_pinecone import PineconeVectorStore
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import Pinecone
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    f1_score,
    precision_score
)
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

#### Necessary pip installation commands include:
- pip install openai
- pip install pandas
- pip install tiktoken
- pip install numpy
- pip install python-dotenv
- pip install langchain
- pip install langchain-pinecone
- pip install langchain-core
- pip install langchain-openai

#### <b>Note</b>: have a <b>.env</b> file already created for accessing API key

In [2]:
# retrieving dataset utilized for evaluation
df = pd.read_csv('../../data/csv/semi_toxic_classification_evaluation_set.csv')
df['Toxic'] = df['Toxic'].apply(lambda x: str(int(x)))
df

Unnamed: 0,Text,Toxic
0,Date When did this algorithm get invented ?,0
1,answered them many times ! but they didnt answ...,0
2,IAC There is a discussion on article 's talk p...,0
3,Spandam 's laugh ? I 'm listening to it and al...,0
4,AND HAVING A FUCKIN ' BITCH ASS DAY '' ',1
...,...,...
995,"Oh , well Harry finally got his payback on me ...",1
996,this page is about random crap talk .. just ed...,1
997,I 've reverted the editsBOTH OF THEM VIOLATED ...,0
998,"I used the exact same bullshit you wrote , hyp...",1


In [3]:
# retrieving API Key from OpenAI platform
load_dotenv()
# Initialize the tokenizer for GPT-3.5 Turbo and GPT-4
tokenizer = tiktoken.get_encoding("cl100k_base")
input_token_cost = ((1.00 / 1000000))
output_token_cost = ((2.00 / 1000000))
FINE_TUNED_TOXIC_DETECTION_API_KEY = os.getenv('FINE_TUNED_TOXIC_DETECTION_API_KEY')
REGULAR_TOXIC_DETECTION_API_KEY = os.getenv("REGULAR_TOXIC_DETECTION_API_KEY")
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc_index = os.getenv('PINECONE_GPT')

In [4]:
# instantiating client with API key
fine_tuned = OpenAI(
    api_key=FINE_TUNED_TOXIC_DETECTION_API_KEY
)

In [5]:
# initializing Pinecone vector database instance
docSearch = Pinecone(
    index_name=pc_index,
    embedding=OpenAIEmbeddings(openai_api_key=REGULAR_TOXIC_DETECTION_API_KEY)
)

  docSearch = Pinecone(


In [6]:
def remove_repetition(text, depth=0):
    # cap = 5
    # check = 0
    # split_line = line.split(' ')
    # new_split = []
    # curr = split_line[0]
    
    # for word in split_line:
    #     if word == curr:
    #         check = check + 1
    #     else:
    #         curr = word
    #         check = 0
            
    #     if check < cap:
    #         new_split.append(word)

    pattern = r'\b(\w+(?:\s+\w+)*)\b(?:\s+\1\b)+'
    new_text = re.sub(pattern, r'\1', text, flags=re.IGNORECASE)
    
    if new_text == text or depth > 10: 
        return new_text
    return remove_repetition(new_text, depth + 1)
    
    # return " ".join(new_split)

In [7]:
# appends one/few shot examples to evaluation prompt
def shot_additions(examples):
   evaluation_prompt = ''
   # Read the evaluation prompt from the text file with utf-8 encoding
   with open("../../data/text/rag_prompts/gpt-3-5-1106-two-shot-prompt.txt", "r", encoding="utf-8") as file:
        evaluation_prompt += file.read() 
   for example in examples:
       # separates example key and values
       split_ex = example.split(' - ')
       comment = split_ex[0]
       label = split_ex[1]
       evaluation_prompt += '\n\nText: \"' + comment + "\"" + '\n\n' + label

   evaluation_prompt += '\n\nClassify the following comment:'

   return evaluation_prompt

In [8]:
# queries vector database for custom,
# with similar examples to user prompt
def rag_prompt(text):
    two_shots = []
    # queries Pinecone database
    search_results = docSearch.max_marginal_relevance_search(text, k=5, fetch_k=15)
    for i in range(len(search_results)):
        content = search_results[i].page_content
        # prevents repetition which will cause errors within OpenAI
        if i != 250:
            two_shots.append(content)
        # two valid examples found
        if len(two_shots) == 2:
            break
    return shot_additions(two_shots)

In [9]:
# retrieving dataset utilized for evaluation
tuned = pd.DataFrame(columns=['Text', 'Toxic'])
tokens = pd.DataFrame(columns=['Prompt', 'Input', 'Output', 'Total'])

In [10]:
evaluation_prompt = ''
# Read the evaluation prompt from the text file with utf-8 encoding
with open("../../data/text/rag_prompts/gpt-3-5-1106-two-shot-prompt.txt", "r", encoding="utf-8") as file:
    evaluation_prompt += file.read()

In [11]:
evaluation_prompt

'Classify these comments as either toxic (1) non-toxic (0). Toxic comments contain offensive, harmful, or abusive language. Label 1 for toxic and 0 for non-toxic\n\nExamples for Reference:'

In [12]:
# processing fine-tuned GPT-3.5-turbo-1106 model across entire dataset 
# with advanced prompting with RAG support
for index, row in df.iterrows():
    text = row['Text']
    rag_eval = rag_prompt(remove_repetition(text))
    print(f"Starting Completion at Index: {index}")
    print(f'Text: {text}')
    completion = fine_tuned.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=[
            {
                "role": "system", 
                "content": rag_eval
            },
            {
                "role": "user",
                "content": text
            }
        ]
    )
    print(f'Finished Completion {index}!')
    tuned.loc[index] = [row['Text'], completion.choices[0].message.content]
        # tokenize the string
    tokens_object = {
        'prompt': len(tokenizer.encode(rag_eval)),
        'input': len(tokenizer.encode(row['Text'])),
        'output': len(tokenizer.encode(completion.choices[0].message.content))
    }
    total_tokens = tokens_object['prompt'] + tokens_object['input'] + tokens_object['output']
    tokens.loc[index] = [tokens_object['prompt'], tokens_object['input'], tokens_object['output'], total_tokens]


Starting Completion at Index: 0
Text: Date When did this algorithm get invented ?
Finished Completion 0!
Starting Completion at Index: 1
Text: answered them many times ! but they didnt answer me , they just doing their vandalism and changing the correct
Finished Completion 1!
Starting Completion at Index: 2
Text: IAC There is a discussion on article 's talk page .
Finished Completion 2!
Starting Completion at Index: 3
Text: Spandam 's laugh ? I 'm listening to it and all I hear is da . Was it written as wa in the manga or something ?
Finished Completion 3!
Starting Completion at Index: 4
Text: AND HAVING A FUCKIN ' BITCH ASS DAY '' '
Finished Completion 4!
Starting Completion at Index: 5
Text: I mean it . Please adjust this box to work with the rest of this page . And as for the time of these comments , ignore them . My computer clock is screwy .
Finished Completion 5!
Starting Completion at Index: 6
Text: I have added the 2003 invasion of Iraq , since from a technical stand point as w

In [13]:
tokens

Unnamed: 0,Prompt,Input,Output,Total
0,648,8,1,657
1,131,20,1,152
2,174,13,1,188
3,460,31,1,492
4,1342,13,1,1356
...,...,...,...,...
995,195,166,1,362
996,154,12,1,167
997,140,18,1,159
998,195,12,1,208


In [14]:
tuned

Unnamed: 0,Text,Toxic
0,Date When did this algorithm get invented ?,0
1,answered them many times ! but they didnt answ...,1
2,IAC There is a discussion on article 's talk p...,0
3,Spandam 's laugh ? I 'm listening to it and al...,0
4,AND HAVING A FUCKIN ' BITCH ASS DAY '' ',1
...,...,...
995,"Oh , well Harry finally got his payback on me ...",1
996,this page is about random crap talk .. just ed...,1
997,I 've reverted the editsBOTH OF THEM VIOLATED ...,1
998,"I used the exact same bullshit you wrote , hyp...",1


In [15]:
tuned = tuned[['Toxic']]
predictions = df[['Toxic']]

In [16]:
# comparing results of model to dataset
compare = tuned['Toxic'] == df['Toxic']
gptthreefive_original_accuracy = compare.values.sum() / compare.size

In [17]:
compare

0       True
1      False
2       True
3       True
4       True
       ...  
995     True
996     True
997    False
998     True
999     True
Name: Toxic, Length: 1000, dtype: bool

In [18]:
# fine-tuned GPT-3.5-turbo-0125 accuracy with advanced prompted engineering
# (role prompting, two-shot examples) and RAG for few-shot support 86.20%
# k=35, fetch_k=100 86.70%
# k=50, fetch_k=100 85.80%
# k=10, fetch_k=100 86.50%
# k=25, fetch_k=50
print(f"Accuracy: {gptthreefive_original_accuracy * 100:.2f}%")

Accuracy: 86.20%


In [19]:
# total token input cost: $0.345778
((tokens['Input'].sum() + tokens['Prompt'].sum()) * input_token_cost)

0.345778

In [20]:
# total token output cost: $0.002
(tokens['Output'].sum() * output_token_cost)

0.002

In [21]:
# complete total token cost: $0.347778
((tokens['Input'].sum() + tokens['Prompt'].sum()) * input_token_cost) + (tokens['Output'].sum() * output_token_cost)

0.347778

In [22]:
# 346778
tokens['Total'].sum()

346778