In [1]:
import json
import re
import math
import pandas as pd
from algorithms import *
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

In [2]:
def combined_randomized_preprocessing(data_one, data_two):
    data_one_pre = unintended_bias_preprocess(data_one)
    data_two_pre = unintended_bias_preprocess(data_two)

    complete_data = pd.concat([data_one_pre, data_two_pre]).sample(frac=1, random_state=42).reset_index(drop=True)
    print(complete_data['Text'].isna().any())
    return complete_data

#### Necessary pip installation commands include:
- pip install pandas
- pip install nltk
#### <b>Note</b>: have a <b>.env</b> file already created for accessing API key


In [3]:
# reading of Jigsaw Unintended Bias in Toxicity Classification datasets
jigsaw_unintended_train = pd.read_csv('../data/secondarydatasets/UnintendedBiasInToxicityClassification/train.csv')
jigsaw_unintended_inidividual_annotations = pd.read_csv('../data/secondarydatasets/UnintendedBiasInToxicityClassification/toxicity_individual_annotations.csv')

#### Preprocessing the Jigsaw Unintended Bias in Toxicity Classification dataset first: https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification

In [4]:
# merges datasets by 'id' column
merged_data = pd.merge(jigsaw_unintended_train, jigsaw_unintended_inidividual_annotations, on='id', how='inner')

In [5]:
merged_data

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene_x,identity_attack_x,insult_x,threat_x,asian,atheist,...,identity_annotator_count,toxicity_annotator_count,worker,toxic,severe_toxic,identity_attack_y,insult_y,obscene_y,sexual_explicit_y,threat_y
0,59848,0.000000,"This is so cool. It's like, 'would you want yo...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,3107,0,0,0,0,0,0,0
1,59848,0.000000,"This is so cool. It's like, 'would you want yo...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,5418,0,0,0,0,0,0,0
2,59848,0.000000,"This is so cool. It's like, 'would you want yo...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,1667,0,0,0,0,0,0,0
3,59848,0.000000,"This is so cool. It's like, 'would you want yo...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,5094,0,0,0,0,0,0,0
4,59849,0.000000,Thank you!! This would make my life a lot less...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,144,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15855261,6334009,0.621212,Anyone who is quoted as having the following e...,0.030303,0.030303,0.045455,0.621212,0.0,,,...,0,66,740,1,0,0,1,0,0,0
15855262,6334010,0.000000,Students defined as EBD are legally just as di...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,5698,0,0,0,0,0,0,0
15855263,6334010,0.000000,Students defined as EBD are legally just as di...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,7221,0,0,0,0,0,0,0
15855264,6334010,0.000000,Students defined as EBD are legally just as di...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,1166,0,0,0,0,0,0,0


In [6]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15855266 entries, 0 to 15855265
Data columns (total 53 columns):
 #   Column                               Dtype  
---  ------                               -----  
 0   id                                   int64  
 1   target                               float64
 2   comment_text                         object 
 3   severe_toxicity                      float64
 4   obscene_x                            float64
 5   identity_attack_x                    float64
 6   insult_x                             float64
 7   threat_x                             float64
 8   asian                                float64
 9   atheist                              float64
 10  bisexual                             float64
 11  black                                float64
 12  buddhist                             float64
 13  christian                            float64
 14  female                               float64
 15  heterosexual                  

In [7]:
len(merged_data[merged_data['toxic'] == 1])

5294254

In [8]:
len(merged_data[merged_data['toxic'] == 0])

10561012

In [9]:
complete = merged_data.drop_duplicates(subset=['comment_text'])
complete

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene_x,identity_attack_x,insult_x,threat_x,asian,atheist,...,identity_annotator_count,toxicity_annotator_count,worker,toxic,severe_toxic,identity_attack_y,insult_y,obscene_y,sexual_explicit_y,threat_y
0,59848,0.000000,"This is so cool. It's like, 'would you want yo...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,3107,0,0,0,0,0,0,0
4,59849,0.000000,Thank you!! This would make my life a lot less...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,144,0,0,0,0,0,0,0
8,59852,0.000000,This is such an urgent design problem; kudos t...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,801,0,0,0,0,0,0,0
12,59855,0.000000,Is this something I'll be able to install on m...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,7664,0,0,0,0,0,0,0
16,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.000000,0.021277,0.872340,0.0,0.0,0.0,...,4,47,1019,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15855184,6333967,0.000000,"Maybe the tax on ""things"" would be collected w...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,4020,0,0,0,0,0,0,0
15855188,6333969,0.000000,What do you call people who STILL think the di...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,381,0,0,0,0,0,0,0
15855192,6333982,0.000000,"thank you ,,,right or wrong,,, i am following ...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,757,0,0,0,0,0,0,0
15855196,6334009,0.621212,Anyone who is quoted as having the following e...,0.030303,0.030303,0.045455,0.621212,0.0,,,...,0,66,266,1,0,0,1,0,0,0


In [10]:
len(complete[complete['toxic'] == 1])

145431

In [11]:
len(complete[complete['toxic'] == 0])

1635392

In [12]:
evaluation_size = 500
toxic_set = complete[complete['toxic'] == 1].sample(n=(evaluation_size),random_state=45)
non_toxic_set = complete[complete['toxic'] == 0].sample(n=(evaluation_size),random_state=45)

In [13]:
# checking length of sets
print(f"Toxic set length: {len(toxic_set)}")
print(f"Non-Toxic set length: {len(non_toxic_set)}")

Toxic set length: 500
Non-Toxic set length: 500


In [14]:
toxic_set[toxic_set['toxic'] == 1]

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene_x,identity_attack_x,insult_x,threat_x,asian,atheist,...,identity_annotator_count,toxicity_annotator_count,worker,toxic,severe_toxic,identity_attack_y,insult_y,obscene_y,sexual_explicit_y,threat_y
7634394,5204788,0.4,Military veterans suffering from PTSD should n...,0.1,0.1,0.0,0.3,0.0,,,...,0,10,4339,1,0,0,1,0,0,0
5323137,983220,0.5,you really need to get some help for that tota...,0.0,0.0,0.1,0.5,0.0,,,...,0,10,519,1,0,0,1,0,0,0
2668646,609607,0.2,"As a born and bred (until age 22) Nutmeger, I ...",0.0,0.0,0.2,0.0,0.0,0.0,0.0,...,4,5,17,1,0,1,0,0,0,0
9932396,5532195,0.3,"Gee, Froma, if Obamacare's problems were ""fixa...",0.0,0.0,0.2,0.1,0.0,,,...,0,10,416,1,0,1,0,0,0,0
14661794,6174023,0.8,Funny how she says that Clinton's problems wer...,0.0,0.0,0.0,0.8,0.0,0.0,0.0,...,4,10,2116,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1139623,402369,0.6,"""Billary"" : Two of the most corrupt, self serv...",0.0,0.0,0.0,0.5,0.0,,,...,0,10,192,1,0,0,1,0,0,0
5651757,1033348,0.2,Yes and its those segments of the population t...,0.1,0.3,0.0,0.1,0.0,,,...,0,10,228,1,0,0,0,1,0,0
610002,331548,0.7,"Darn, your last post was immediately deleted, ...",0.0,0.8,0.0,0.4,0.0,,,...,0,10,3668,1,0,0,1,1,0,0
5958583,1078602,0.4,Under Trump Murica has become a global joke.,0.0,0.0,0.0,0.4,0.0,,,...,0,10,642,1,0,0,1,0,0,0


In [15]:
non_toxic_set[non_toxic_set['toxic'] == 0]


Unnamed: 0,id,target,comment_text,severe_toxicity,obscene_x,identity_attack_x,insult_x,threat_x,asian,atheist,...,identity_annotator_count,toxicity_annotator_count,worker,toxic,severe_toxic,identity_attack_y,insult_y,obscene_y,sexual_explicit_y,threat_y
9931271,5532054,0.712500,Republican party FILTHY w super fakey 'christi...,0.025,0.025,0.475000,0.45,0.425,0.0,0.0,...,4,80,244,0,0,0,0,0,0,0
6429405,5029275,0.000000,They are cheaper because they operate illegall...,0.000,0.000,0.000000,0.00,0.000,,,...,0,4,2175,0,0,0,0,0,0,0
8601398,5343001,0.166667,What nation in the world is even close to Amer...,0.000,0.000,0.166667,0.00,0.000,,,...,0,6,6152,0,0,0,0,0,0,0
1139336,402335,0.100000,I can appreciate a person who stands up for wh...,0.000,0.000,0.200000,0.00,0.000,0.0,0.0,...,10,10,815,0,0,0,0,0,0,0
15435032,6277227,0.000000,Your concern should be getting shot by an offi...,0.000,0.000,0.000000,0.00,0.000,0.0,0.0,...,4,4,142,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8174485,5283045,0.000000,https://www.youtube.com/watch?v=7z_ORNy8tFo\n\...,0.000,0.000,0.000000,0.00,0.000,0.0,0.0,...,6,4,2380,0,0,0,0,0,0,0
4137964,816628,0.600000,Bang on!,0.000,0.000,0.000000,0.00,0.600,0.0,0.0,...,4,10,4479,0,0,0,0,0,0,0
13484085,6018365,0.000000,"A bit surprised at you, peterpi - you usually ...",0.000,0.000,0.000000,0.00,0.000,0.0,0.0,...,10,4,859,0,0,0,0,0,0,0
1587177,461806,0.000000,"Overall about 420 applications, I don't believ...",0.000,0.000,0.000000,0.00,0.000,,,...,0,4,4209,0,0,0,0,0,0,0


In [16]:
# combining toxic and non-toxic sets through randomized process
full_evaluation_set = combined_randomized_preprocessing(toxic_set, non_toxic_set)

False


In [17]:
# saves preprocessed data for late usage
full_evaluation_set.to_csv('../data/csv/semi_unintended_bias_toxicity_classification_set.csv', index=False)