In [1]:
# API KEY: <redacted> -- load it from the .env file; never commit real keys (the key previously pasted here must be revoked and rotated)
# ORG ID: <redacted> -- load it from the .env file
import json
import re
import math
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stopwords collected into a set
stop_words = {*stopwords.words('english')}

#### Necessary pip installation commands include:
- pip install pandas
- pip install nltk
#### <b>Note</b>: a <b>.env</b> file containing the API key must already exist before running this notebook


In [2]:

# general-purpose text cleaner used for preprocessing data
def preprocess(data):
    """Strip punctuation, tokenize, and drop stopwords from ``data``.

    ``data`` is coerced with ``str()``, so non-string values are processed as
    their string representation. Every character that is neither a word
    character nor whitespace is deleted (not replaced by a space), so a
    contraction such as "don't" becomes "dont" before tokenization.

    Returns the remaining tokens joined by single spaces.
    """
    # delete everything except word characters and whitespace
    cleaned = re.sub(r'[^\w\s]', '', str(data))

    # tokenize, then keep only tokens whose lowercase form is not a stopword
    kept = []
    for token in word_tokenize(str(cleaned)):
        if token.lower() in stop_words:
            continue
        kept.append(str(token))

    # hand the tokens back as one space-separated sentence
    return " ".join(kept)

In [3]:
# loads the Kaggle toxic classification challenge dataset from the local CSV
df = pd.read_csv(filepath_or_buffer='./data/csv/train.csv')
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \r\n\r\nThat...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \r\n\r\nUmm, theres no actual article ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [25]:
# draw 10,500 rows per class (toxic == 1 / toxic == 0) with a fixed seed
toxic_set = df.loc[df['toxic'].eq(1)].sample(n=10500, random_state=42)
non_toxic_set = df.loc[df['toxic'].eq(0)].sample(n=10500, random_state=42)

In [28]:
# per class: the first 10,000 sampled rows form the training split; the
# remaining rows are reshuffled with a fixed seed, re-indexed from 0, and
# kept as the evaluation split
toxic_train = toxic_set.iloc[:10000]
toxic_eval = (
    toxic_set.iloc[10000:]
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
non_toxic_train = non_toxic_set.iloc[:10000]
non_toxic_eval = (
    non_toxic_set.iloc[10000:]
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [4]:
# Keeps only the needed columns (comment text and toxic label), applies the
# preprocessing function to the text, and renames the columns to Text / Toxic.
# The result is built as a single chain that returns a new DataFrame:
# assigning a column directly on a df[[...]] selection triggers pandas'
# SettingWithCopyWarning, and this form also leaves `df` untouched so the
# cell can be re-run safely.
toxic_classification = (
    df[['comment_text', 'toxic']]
    .assign(comment_text=lambda frame: frame['comment_text'].apply(preprocess))
    .rename(columns={'comment_text': 'Text', 'toxic': 'Toxic'})
)
toxic_classification

Unnamed: 0,Text,Toxic
119105,"Geez, are you forgetful! We've already discus...",0
131631,Carioca RFA \r\n\r\nThanks for your support on...,0
125326,"""\r\n\r\n Birthday \r\n\r\nNo worries, It's wh...",0
111256,Pseudoscience category? \r\n\r\nI'm assuming t...,0
83590,"(and if such phrase exists, it would be provid...",0
...,...,...
156920,I also just noticed: he simultaneously went af...,0
121162,""" Would you claim them to be part of the """"ig...",0
34019,"The lyrics is found in the German version, so ...",0
83938,Encyclopedia Titanica references do not source...,0


In [5]:
# persists the full preprocessed dataset (without the row index) for later usage
toxic_classification.to_csv(
    path_or_buf='./data/csv/processed_toxic_classification_dataset_full.csv',
    index=False,
)

In [6]:
# keeps the first 1000 preprocessed rows as a shorter dataset for training
# comparison and saves it (without the row index)
short = toxic_classification.head(1000)
short.to_csv(
    path_or_buf='./data/csv/processed_toxic_classification_dataset_short.csv',
    index=False,
)

In [7]:
# Builds one chat-format example per row for fine-tuning: a fixed system
# prompt, the row's Text as the user turn, and the row's Toxic value
# (converted to a string) as the assistant turn.
# NOTE(review): the system prompt asks for a "toxic or non-toxic" label while
# the assistant turn carries the stringified Toxic value -- confirm that this
# mismatch is intended.
training_json = [
    {
        "messages": [
            {
                "role": "system",
                "content": "Your job is to detect toxicity from a non-biased viewpoint and label prompted content as either toxic or non-toxic.",
            },
            {"role": "user", "content": text},
            {"role": "assistant", "content": str(label)},
        ]
    }
    for text, label in zip(toxic_classification['Text'], toxic_classification['Toxic'])
]

In [8]:
# midpoint of the example list (integer division rounds down)
half = len(training_json) // 2
half

15957

In [9]:
# First half of the examples is used for training, second half for validation.
# The validation slice starts at `half` (not `half + 1`) so that the example
# at index `half` is not dropped from both splits.
training = training_json[:half]
validation = training_json[half:]

In [10]:
# writes the training and validation examples to their own JSON Lines files
# (one JSON object per line) for finetuning and validation
for path, entries in (
    ('./data/jsonl/training_kaggle_toxic_classification_dataset.jsonl', training),
    ('./data/jsonl/validation_kaggle_toxic_classification_dataset.jsonl', validation),
):
    with open(path, 'w') as f:
        for entry in entries:
            f.write(json.dumps(entry) + "\n")