In [1]:
# API KEY: sk-82F0fUi3hSi_drnaRG9Z72jEbFbIqPdYHHClASd7kET3BlbkFJJqxRZ40hP-p3XxjBMCQK3qUT40AmziQLi9Vz5WIcIA
# ORG ID: org-rjJCwF3sTsfOnw3I2Eaj4zdP
import json
import re
import math
import pandas as pd
from algorithms import *
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

#### Necessary pip installation commands include:
- pip install pandas
- pip install nltk
#### <b>Note</b>: have a <b>.env</b> file already created for accessing API key


In [None]:
# function for tokenization and special character and stopword removal
def clean(data):
    # regex removes certain punctuation and special characters that are less important
    no_punctuation_and_specials = re.sub(r"[^\w\s\.\,\!\?\']", "", str(data))
    # tokenizing step
    tokens = word_tokenize(str(no_punctuation_and_specials))

    # returns tokenized text in sentence format
    return " ".join(tokens)

In [None]:
# general function for preprocessing data
def preprocess(data):
    # includes only needed columns: text and toxic values
    data = data[['comment_text', 'toxic']]
    data.loc[:,'comment_text'] = data['comment_text'].apply(clean) # applies preprocessing function
    data = data.reset_index(drop=True)
    data.columns = ['Text', 'Toxic'] # renames column names
    return data

In [16]:
# preprocesses two datesets, combines them, and randomizes their order
def combined_randomized_preprocessing(data_one, data_two):
    data_one_pre = preprocess(data_one)
    data_two_pre = preprocess(data_two)

    # randomizes row organization within data
    complete_data = pd.concat([data_one_pre, data_two_pre]).sample(frac=1, random_state=42).reset_index(drop=True)

    return complete_data

In [None]:
def check_repetition(text):
  """Counts the number of repeated words in a string."""

  words = text.lower().split()
  word_counts = Counter(words)
  repeated_words = {word: count for word, count in word_counts.items() if count > 1}
  for value in repeated_words:
    if repeated_words[value] >= 25:
      return True
  return False


In [18]:
# reads kaggle toxic classification challenge dataset
df = pd.read_csv('../data/csv/train.csv')
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \r\n\r\nThat...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \r\n\r\nUmm, theres no actual article ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [19]:
training_size = 2500
evaluation_size = 500

# filters out for specific label value and removes
# error introducing repetitive comments
toxic_set = df[df['toxic'] == 1 &
    ~df['comment_text'].apply(check_repetition)
].sample(n=(training_size + evaluation_size),random_state=42)
non_toxic_set = df[df['toxic'] == 0 &
    ~df['comment_text'].apply(check_repetition)
].sample(n=(training_size + evaluation_size),random_state=42)

In [20]:
# split into toxic and non-toxic training and validation sets
toxic_train, toxic_eval = toxic_set[:training_size], toxic_set[training_size:].sample(frac=1, random_state=42).reset_index(drop=True)
non_toxic_train, non_toxic_eval = non_toxic_set[:training_size], non_toxic_set[training_size:].sample(frac=1, random_state=42).reset_index(drop=True)

In [21]:
toxic_train

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
145663,2040644271b7cd54,shut your mouth.........you grabage dump,1,0,0,0,1,0
133605,cac3c5d735c3e2ee,"it is a constructive edit you idiot, every kid...",1,0,1,0,1,0
79118,d3c53368eac8f6e1,"SlimVirgin, show me the racial insult that I h...",1,0,0,0,0,0
57756,9a90d25059773747,"Cumulus Cloud: get a life, you loser \r\n\r\nC...",1,0,0,0,0,0
3867,0a579853eec7eec3,""" North Korea actually gained approximately 23...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
133448,c9ee25227958cc1b,you dirty dirty hypocrite - there you are lect...,1,0,0,0,1,0
40967,6d55911850a8c655,jews and Iasi\r\nduring the iasi history many ...,1,0,1,0,1,1
108248,42a4fe697cc4a043,"""\r\n\r\nu are 1 ****ing cheeky **** mate i sw...",1,0,0,0,0,0
158480,ee8f29acbafd7148,Comment\r\n\r\nWhat the heck are you talking a...,1,0,0,0,0,0


In [22]:
# create training and validation sets
full_training_set = combined_randomized_preprocessing(toxic_train, non_toxic_train)
full_evaluation_set = combined_randomized_preprocessing(toxic_eval, non_toxic_eval)


In [None]:
# saves preprocessed fine_tuning data for later usage
full_training_set.to_csv('../data/csv/toxic_classification_training_set.csv', index=False)

In [None]:
# saves preprocessed evaluation data to csv file
full_evaluation_set.to_csv('../data/csv/toxic_classification_evaluation_set.csv', index=False)