In [35]:
import json
import re
import math
import pandas as pd
from algorithms import *
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop words from NLTK, stored as a set.
# NOTE(review): stopwords.words() needs the NLTK 'stopwords' corpus to be available locally — confirm setup.
stop_words = set(stopwords.words('english'))

#### Necessary installation commands:
- pip install pandas
- pip install nltk
- python -m nltk.downloader stopwords (the import cell loads the NLTK `stopwords` corpus)
#### <b>Note</b>: a <b>.env</b> file must already exist so the API key can be read from it

In [36]:
def combined_randomized_preprocessing(data_one, data_two, random_state=42):
    """Preprocess two datasets, stack them, and return the rows in shuffled order.

    Each input is passed through ``sugarai_preprocess`` on its own, the two
    results are concatenated with ``pd.concat``, and the combined rows are
    shuffled with a seeded ``sample(frac=1)`` so the ordering is reproducible.

    Args:
        data_one: First dataset; handed as-is to ``sugarai_preprocess``.
        data_two: Second dataset; handed as-is to ``sugarai_preprocess``.
        random_state: Seed forwarded to ``DataFrame.sample``. Defaults to 42,
            the value that used to be hard-coded, so existing two-argument
            calls produce the same shuffle as before.

    Returns:
        The concatenated, shuffled frame with a fresh 0..n-1 index.

    Note:
        Assumes ``sugarai_preprocess`` returns pandas objects that
        ``pd.concat`` accepts (presumably DataFrames) — confirm against
        the ``algorithms`` module.
    """
    preprocessed_parts = [sugarai_preprocess(data_one), sugarai_preprocess(data_two)]

    # frac=1 draws every row exactly once, i.e. a full permutation of the rows.
    shuffled = pd.concat(preprocessed_parts).sample(frac=1, random_state=random_state)

    # drop=True discards the pre-shuffle index instead of keeping it as a column.
    return shuffled.reset_index(drop=True)

In [37]:
# Read the Sugar AI toxicity dataset (English) into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv('../data/secondarydatasets/SugarAI/toxicity_en.csv')

#### First, preprocess the Sugar AI Toxicity Dataset (source: https://github.com/surge-ai/toxicity/blob/main/toxicity_en.csv)

In [38]:
# Encode the label column as integers: 1 for toxic rows, 0 for everything else.
# Accepting 1 alongside the raw 'Toxic' string keeps this cell idempotent: running it
# again after the column is already encoded leaves the labels intact instead of
# turning every row into 0.
df['is_toxic'] = df['is_toxic'].isin(['Toxic', 1]).astype('int64')
df

Unnamed: 0,text,is_toxic
0,"Elon Musk is a piece of shit, greedy capitalis...",1
1,The senile credit card shrill from Delaware ne...,1
2,He does that a lot -- makes everyone look good...,1
3,F*ck Lizzo,1
4,Epstein and trump were best buds!!! Pedophiles...,1
...,...,...
995,My maternal abuelita taught me how to make pla...,0
996,Funnily enough I was looking online last week ...,0
997,I can't bear how nice this is.\n \n I guess it...,0
998,Going to buy a share of Tesla just to ensure i...,0


In [39]:
# Sanity check of the class balance: count the rows carrying each label.
# In the output below the toxic class has two more rows than the non-toxic class (501 vs. 499).
print(f"Length of Toxic values sanity check within SugarAI dataset: {len(df[df['is_toxic'] == 1])}")
print(f"Length of Non-Toxic values sanity check within SugarAI dataset: {len(df[df['is_toxic'] == 0])}")

Length of Toxic values sanity check within SugarAI dataset: 501
Length of Non-Toxic values sanity check within SugarAI dataset: 499


In [40]:
# Create equally sized toxic and non-toxic sets for evaluation.
# The size is derived from the smaller class rather than hard-coded, so the two sets
# stay balanced if the CSV changes (it is 499 for the 501 / 499 split printed above).
toxic_rows = df[df['is_toxic'] == 1]
non_toxic_rows = df[df['is_toxic'] == 0]
balanced_size = min(len(toxic_rows), len(non_toxic_rows))

toxic_set = toxic_rows.iloc[:balanced_size]
non_toxic_set = non_toxic_rows.iloc[:balanced_size]

In [41]:
# Display the toxic subset (rows with is_toxic == 1) to verify the selection
toxic_set

Unnamed: 0,text,is_toxic
0,"Elon Musk is a piece of shit, greedy capitalis...",1
1,The senile credit card shrill from Delaware ne...,1
2,He does that a lot -- makes everyone look good...,1
3,F*ck Lizzo,1
4,Epstein and trump were best buds!!! Pedophiles...,1
...,...,...
494,We need religion tho...to control the dumbfuck...,1
495,Fucking people are nuts. That's nearly as bad ...,1
496,By now the Americans knew that the roots cause...,1
497,For fentanyl floyed,1


In [42]:
# Display the non-toxic subset (rows with is_toxic == 0) to verify the selection
non_toxic_set

Unnamed: 0,text,is_toxic
501,"They’re shallots, actually",0
502,me thinking in my head: mmm pizzaaaa...,0
503,I have never seen a dice hammer that is so fuc...,0
504,Got a question for you guys out there: did rev...,0
505,This sub’s notorious for it. It might be the m...,0
...,...,...
995,My maternal abuelita taught me how to make pla...,0
996,Funnily enough I was looking online last week ...,0
997,I can't bear how nice this is.\n \n I guess it...,0
998,Going to buy a share of Tesla just to ensure i...,0


In [43]:
# Preprocess both subsets and merge them into a single, randomly ordered evaluation frame
full_evaluation_set = combined_randomized_preprocessing(
    data_one=toxic_set,
    data_two=non_toxic_set,
)

In [44]:
# Save the preprocessed evaluation set for later use; index=False keeps the row index out of the CSV
full_evaluation_set.to_csv('../data/csv/sugar_ai_toxicity_evaluation_set.csv', index=False)