In [1]:
# API KEY: sk-82F0fUi3hSi_drnaRG9Z72jEbFbIqPdYHHClASd7kET3BlbkFJJqxRZ40hP-p3XxjBMCQK3qUT40AmziQLi9Vz5WIcIA
# ORG ID: org-rjJCwF3sTsfOnw3I2Eaj4zdP
import json
import re
import math
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

# This notebook prepares our dataset before fine-tuning. This stage
# focuses on preprocessing our dataset by utilizing various techniques

In [2]:

# general function for preprocessing data
def preprocess(data):
    # regex removes punctuation and special characters
    no_punctuation_and_specials = re.sub(r'[^\w\s]', '', str(data))

    # tokenizing step
    tokens = word_tokenize(str(no_punctuation_and_specials))

    # stopword removal
    filtered_tokens = [str(word) for word in tokens if word.lower() not in stop_words]

    # returns tokenized text in sentence format
    return " ".join(filtered_tokens)

In [3]:
# reads kaggle toxic classification challenge dataset
df = pd.read_csv('../data/train.csv')
df = df.sample(frac=0.2, random_state=42) # chooses a random 20% of dataset
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
119105,7ca72b5b9c688e9e,"Geez, are you forgetful! We've already discus...",0,0,0,0,0,0
131631,c03f72fd8f8bf54f,Carioca RFA \r\n\r\nThanks for your support on...,0,0,0,0,0,0
125326,9e5b8e8fc1ff2e84,"""\r\n\r\n Birthday \r\n\r\nNo worries, It's wh...",0,0,0,0,0,0
111256,5332799e706665a6,Pseudoscience category? \r\n\r\nI'm assuming t...,0,0,0,0,0,0
83590,dfa7d8f0b4366680,"(and if such phrase exists, it would be provid...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
156920,d5ab78002a95480c,I also just noticed: he simultaneously went af...,0,0,0,0,0,0
121162,8837ad52121033bc,""" Would you claim them to be part of the """"ig...",0,0,0,0,0,0
34019,5ac2cc7bc20cc0cc,"The lyrics is found in the German version, so ...",0,0,0,0,0,0
83938,e09583af9fd6534e,Encyclopedia Titanica references do not source...,0,0,0,0,0,0


In [4]:
# includes only needed columns: text and toxic values
toxic_classification = df[['comment_text', 'toxic']]
toxic_classification['comment_text'] = toxic_classification['comment_text'].apply(preprocess) # applies preprocessing function
toxic_classification.columns = ['Text', 'Toxic'] # renames column names
toxic_classification

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  toxic_classification['comment_text'] = toxic_classification['comment_text'].apply(preprocess)


Unnamed: 0,Text,Toxic
119105,Geez forgetful Weve already discussed Marx ana...,0
131631,Carioca RFA Thanks support request adminship f...,0
125326,Birthday worries Enjoy ur daytalke,0
111256,Pseudoscience category Im assuming article pse...,0
83590,phrase exists would provided search engine eve...,0
...,...,...
156920,also noticed simultaneously went articles fren...,0
121162,Would claim part ignorant majority,0
34019,lyrics found German version assume usable,0
83938,Encyclopedia Titanica references source origin...,0


In [5]:
# saves preprocessed data for later usage
toxic_classification.to_csv('../data/processed_toxic_classification_dataset_full.csv', index=False)

In [7]:
# creates and saves shorter version of preprocesses data for training comparison
short = toxic_classification[:1000]
short.to_csv('../data/processed_toxic_classification_dataset_short.csv', index=False)

In [15]:
training_json = []

# appends text and toxic values to a
# readable format for GPT-40 model
for index, row in toxic_classification.iterrows():
    training_json.append({
        "messages" : [
            {
                "role": "system", 
                "content": "Your job is to detect toxicity from a non-biased viewpoint and label prompted content as either toxic or non-toxic."
            },
            {
                "role": "user",
                "content": row['Text']
            },
            {
                "role": "assistant",
                "content": str(row['Toxic'])
            },
        ]
    })

In [23]:
# half length of dataset
half = math.floor(len(training_json) / 2)
half

7978

In [25]:
# trains first half and validates second half
training = training_json[:half]
validation = training_json[half + 1:]

In [16]:
# saves training and validation data to jsonl format for finetuning and validation
with open('../data/training_kaggle_toxic_classification_dataset.jsonl', 'w') as f:
    for entry in training:
        f.write(f"{json.dumps(entry)}\n")

with open('../data/validation_kaggle_toxic_classification_dataset.jsonl', 'w') as f:
    for entry in validation:
        f.write(f"{json.dumps(entry)}\n")