In [8]:
# API KEY: sk-82F0fUi3hSi_drnaRG9Z72jEbFbIqPdYHHClASd7kET3BlbkFJJqxRZ40hP-p3XxjBMCQK3qUT40AmziQLi9Vz5WIcIA
# ORG ID: org-rjJCwF3sTsfOnw3I2Eaj4zdP
import json
import re
import math
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

#### Necessary pip installation commands include:
- pip install pandas
- pip install nltk
#### <b>Note</b>: have a <b>.env</b> file already created for accessing API key


In [9]:
# function for tokenization and special character and stopword removal
def clean(data):
    # regex removes punctuation and special characters
    no_punctuation_and_specials = re.sub(r'[^\w\s]', '', str(data))

    # tokenizing step
    tokens = word_tokenize(str(no_punctuation_and_specials))

    # stopword removal
    filtered_tokens = [str(word) for word in tokens if word.lower() not in stop_words]

    # returns tokenized text in sentence format
    return " ".join(filtered_tokens)

In [10]:
# general function for preprocessing data
def preprocess(data):
    # includes only needed columns: text and toxic values
    data = data[['comment_text', 'toxic']]
    data.loc[:,'comment_text'] = data['comment_text'].apply(clean) # applies preprocessing function
    data = data.reset_index(drop=True)
    data.columns = ['Text', 'Toxic'] # renames column names
    return data

In [11]:
def combined_randomized_preprocessing(data_one, data_two):
    data_one_pre = preprocess(data_one)
    data_two_pre = preprocess(data_two)

    complete_data = pd.concat([data_one_pre, data_two_pre]).sample(frac=1, random_state=42).reset_index(drop=True)

    return complete_data

In [12]:
def check_repetition(text):
  """Counts the number of repeated words in a string."""

  words = text.lower().split()
  word_counts = Counter(words)
  repeated_words = {word: count for word, count in word_counts.items() if count > 1}
  for value in repeated_words:
    if repeated_words[value] >= 150:
      return True
  return False


In [13]:
# reads kaggle toxic classification challenge dataset
df = pd.read_csv('./data/csv/train.csv')
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \r\n\r\nThat...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \r\n\r\nUmm, theres no actual article ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [14]:
training_size = 10000
evaluation_size = 500
toxic_set = df[df['toxic'] == 1 &
    ~df['comment_text'].apply(check_repetition)
].sample(n=(training_size + evaluation_size),random_state=42)
non_toxic_set = df[df['toxic'] == 0 &
    ~df['comment_text'].apply(check_repetition)
].sample(n=(training_size + evaluation_size),random_state=42)

In [15]:
toxic_train, toxic_eval = toxic_set[:training_size], toxic_set[training_size:].sample(frac=1, random_state=42).reset_index(drop=True)
non_toxic_train, non_toxic_eval = non_toxic_set[:training_size], non_toxic_set[training_size:].sample(frac=1, random_state=42).reset_index(drop=True)

In [16]:
toxic_train

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
18306,305445bb56244ce6,Ur Mom Kicks Skibbles,1,0,0,0,0,0
20859,370b5e98c3f1ef0c,"dumbass\r\ndont call my shit a garage band, yo...",1,0,1,0,1,0
28793,4c48f1a5e4138946,You need to stop changing everything Im doing ...,1,0,1,0,1,0
9685,19a5a823d7f2d857,-I don't think it should be 1 fucking week! -,1,0,1,0,0,0
78740,d2aa179222d13790,No worries mate. I hope my new signature is m...,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...
114996,66f448f21a64c810,"You are a bad liar, Kevin. And a cheat as well...",1,0,0,0,1,0
71010,be0cdab4cbd7a85d,yo will i am sorry \r\n\r\nwill i would like t...,1,0,1,0,1,1
59578,9f8b36a0b661dae7,Haha\r\nMoron.. I told you to shut up.. - \r...,1,0,1,0,1,0
6573,118ee46d69d1e00b,You would \r\n\r\nyou would too \r\n\r\nYou wo...,1,0,0,0,0,0


In [17]:
full_training_set = combined_randomized_preprocessing(toxic_train, non_toxic_train)
full_evaluation_set = combined_randomized_preprocessing(toxic_eval, non_toxic_eval)


In [None]:
# saves preprocessed fine_tuning data for later usage
full_training_set.to_csv('./data/csv/toxic_classification_training_set.csv', index=False)

In [None]:
# saves preprocessed evaluation data to csv file
full_evaluation_set.to_csv('./data/csv/toxic_classification_evaluation_set.csv', index=False)