In [1]:
import json
import re
import math
import pandas as pd
from algorithms import *
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop-word list from NLTK, stored as a set.
# NOTE(review): no other cell visible in this notebook references `stop_words` — confirm it is still needed.
stop_words = set(stopwords.words('english'))

In [2]:
def combined_randomized_preprocessing(data_one, data_two, random_state=42):
    """Preprocess two frames, stack them, and return the rows in shuffled order.

    Each input is passed through ``unintended_bias_preprocess`` on its own
    (``data_one`` first, then ``data_two``). The two results are concatenated,
    shuffled with a seeded ``sample(frac=1)``, and given a fresh 0..n-1 index.

    Parameters
    ----------
    data_one, data_two : pandas.DataFrame
        Raw frames to preprocess and combine.
        NOTE(review): the preprocessing helper is assumed to return frames
        that contain a ``'Text'`` column, because that column is checked
        below — confirm against the helper's implementation.
    random_state : int, default 42
        Seed used for the shuffle. The default reproduces the row order this
        function produced when the seed was hard-coded.

    Returns
    -------
    pandas.DataFrame
        Every preprocessed row from both inputs, shuffled and reindexed.
    """
    preprocessed = [unintended_bias_preprocess(frame) for frame in (data_one, data_two)]

    complete_data = (
        pd.concat(preprocessed)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )
    # Diagnostic: prints True when any 'Text' value is missing after preprocessing.
    print(complete_data['Text'].isna().any())
    return complete_data

#### Necessary pip installation commands include:
- pip install pandas
- pip install nltk
#### <b>Note</b>: create a <b>.env</b> file beforehand; it is needed for accessing the API key


In [3]:
# Load the two Jigsaw Unintended Bias in Toxicity Classification CSVs:
# the training file first, then the individual-annotations file.
jigsaw_unintended_train, jigsaw_unintended_inidividual_annotations = map(
    pd.read_csv,
    (
        '../data/secondarydatasets/UnintendedBiasInToxicityClassification/train.csv',
        '../data/secondarydatasets/UnintendedBiasInToxicityClassification/toxicity_individual_annotations.csv',
    ),
)

#### Preprocessing the Jigsaw Unintended Bias in Toxicity Classification dataset first: https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification

In [4]:
# inner-join the training rows with the individual-annotation rows on their shared 'id'
merged_data = jigsaw_unintended_train.merge(
    jigsaw_unintended_inidividual_annotations,
    how='inner',
    on='id',
)

In [5]:
# Preview the merged frame; the join repeats a comment once per matching annotation row.
merged_data

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene_x,identity_attack_x,insult_x,threat_x,asian,atheist,...,identity_annotator_count,toxicity_annotator_count,worker,toxic,severe_toxic,identity_attack_y,insult_y,obscene_y,sexual_explicit_y,threat_y
0,59848,0.000000,"This is so cool. It's like, 'would you want yo...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,3107,0,0,0,0,0,0,0
1,59848,0.000000,"This is so cool. It's like, 'would you want yo...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,5418,0,0,0,0,0,0,0
2,59848,0.000000,"This is so cool. It's like, 'would you want yo...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,1667,0,0,0,0,0,0,0
3,59848,0.000000,"This is so cool. It's like, 'would you want yo...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,5094,0,0,0,0,0,0,0
4,59849,0.000000,Thank you!! This would make my life a lot less...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,144,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15855261,6334009,0.621212,Anyone who is quoted as having the following e...,0.030303,0.030303,0.045455,0.621212,0.0,,,...,0,66,740,1,0,0,1,0,0,0
15855262,6334010,0.000000,Students defined as EBD are legally just as di...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,5698,0,0,0,0,0,0,0
15855263,6334010,0.000000,Students defined as EBD are legally just as di...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,7221,0,0,0,0,0,0,0
15855264,6334010,0.000000,Students defined as EBD are legally just as di...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,1166,0,0,0,0,0,0,0


In [6]:
# Summarise the merged frame: index range, column names and dtypes.
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15855266 entries, 0 to 15855265
Data columns (total 53 columns):
 #   Column                               Dtype  
---  ------                               -----  
 0   id                                   int64  
 1   target                               float64
 2   comment_text                         object 
 3   severe_toxicity                      float64
 4   obscene_x                            float64
 5   identity_attack_x                    float64
 6   insult_x                             float64
 7   threat_x                             float64
 8   asian                                float64
 9   atheist                              float64
 10  bisexual                             float64
 11  black                                float64
 12  buddhist                             float64
 13  christian                            float64
 14  female                               float64
 15  heterosexual                  

In [7]:
# number of merged rows whose 'toxic' flag equals 1 (counted on the mask, no filtered copy)
int(merged_data['toxic'].eq(1).sum())

5294254

In [8]:
# number of merged rows whose 'toxic' flag equals 0 (counted on the mask, no filtered copy)
int(merged_data['toxic'].eq(0).sum())

10561012

In [9]:
# Keep one row per distinct comment text (drop_duplicates retains the first occurrence).
# NOTE(review): merged_data holds several rows per comment (one per `worker`), so the
# surviving `toxic` / `*_y` values come from whichever annotation row happened to be
# first rather than from an aggregate — e.g. the sampled "toxic" rows displayed further
# down include comments whose `target` is only 0.1–0.2. Confirm this labelling is intended.
complete = merged_data.drop_duplicates(subset=['comment_text'])
complete

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene_x,identity_attack_x,insult_x,threat_x,asian,atheist,...,identity_annotator_count,toxicity_annotator_count,worker,toxic,severe_toxic,identity_attack_y,insult_y,obscene_y,sexual_explicit_y,threat_y
0,59848,0.000000,"This is so cool. It's like, 'would you want yo...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,3107,0,0,0,0,0,0,0
4,59849,0.000000,Thank you!! This would make my life a lot less...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,144,0,0,0,0,0,0,0
8,59852,0.000000,This is such an urgent design problem; kudos t...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,801,0,0,0,0,0,0,0
12,59855,0.000000,Is this something I'll be able to install on m...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,7664,0,0,0,0,0,0,0
16,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.000000,0.021277,0.872340,0.0,0.0,0.0,...,4,47,1019,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15855184,6333967,0.000000,"Maybe the tax on ""things"" would be collected w...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,4020,0,0,0,0,0,0,0
15855188,6333969,0.000000,What do you call people who STILL think the di...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,381,0,0,0,0,0,0,0
15855192,6333982,0.000000,"thank you ,,,right or wrong,,, i am following ...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,757,0,0,0,0,0,0,0
15855196,6334009,0.621212,Anyone who is quoted as having the following e...,0.030303,0.030303,0.045455,0.621212,0.0,,,...,0,66,266,1,0,0,1,0,0,0


In [10]:
# number of de-duplicated rows whose 'toxic' flag equals 1
int(complete['toxic'].eq(1).sum())

145431

In [11]:
# number of de-duplicated rows whose 'toxic' flag equals 0
int(complete['toxic'].eq(0).sum())

1635392

In [12]:
# rows to draw from each class; both draws use the same fixed seed
evaluation_size = 500
toxic_set, non_toxic_set = (
    complete[complete['toxic'] == label].sample(n=evaluation_size, random_state=43)
    for label in (1, 0)  # 1 -> toxic_set, 0 -> non_toxic_set
)

In [13]:
# report how many rows each sampled subset holds
print(
    f"Toxic set length: {len(toxic_set)}",
    f"Non-Toxic set length: {len(non_toxic_set)}",
    sep="\n",
)

Toxic set length: 500
Non-Toxic set length: 500


In [14]:
# Sanity check: display the sampled rows whose 'toxic' flag is 1.
toxic_set[toxic_set['toxic'] == 1]

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene_x,identity_attack_x,insult_x,threat_x,asian,atheist,...,identity_annotator_count,toxicity_annotator_count,worker,toxic,severe_toxic,identity_attack_y,insult_y,obscene_y,sexual_explicit_y,threat_y
5364444,989940,0.600000,We'll have a Civil war in this country if they...,0.100000,0.200000,0.1,0.600000,0.1000,,,...,0,10,626,1,0,1,1,0,0,0
2610280,601661,0.800000,"Damn, he likes us and we like him.\n\nEveryone...",0.100000,0.900000,0.0,0.000000,0.0000,,,...,0,10,2105,1,0,0,0,1,0,0
7398345,5171493,0.725000,Marine Le Pen was born fascist. She will die ...,0.050000,0.025000,0.1,0.475000,0.3125,,,...,0,80,280,1,0,0,1,1,0,0
6774665,5080933,0.500000,Canadian version of Putinism.,0.000000,0.100000,0.4,0.200000,0.1000,,,...,0,10,5208,1,0,1,0,0,0,0
9325320,5448020,0.100000,"""Maybe governments should only spend money the...",0.000000,0.000000,0.0,0.200000,0.0000,0.0,0.0,...,4,10,353,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11864761,5800082,0.700000,but I thought this was about children. Does bu...,0.000000,0.000000,0.5,0.400000,0.5000,0.0,0.0,...,10,10,1399,1,0,1,0,0,0,1
5794157,1054310,0.200000,I suspect Trump will take a serious look at so...,0.000000,0.100000,0.0,0.200000,0.0000,,,...,0,10,830,1,0,0,1,0,0,0
9042065,5404335,0.166667,Was that intended to be sarcastic? Preet Bhar...,0.000000,0.000000,0.0,0.166667,0.0000,,,...,0,6,1731,1,0,0,1,0,0,0
7244719,5149263,0.600000,"Nobody likes taxes, but it's good to know that...",0.000000,0.200000,0.0,0.500000,0.0000,,,...,0,10,3053,1,0,0,0,1,0,0


In [15]:
# Sanity check: display the sampled rows whose 'toxic' flag is 0.
non_toxic_set[non_toxic_set['toxic'] == 0]

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene_x,identity_attack_x,insult_x,threat_x,asian,atheist,...,identity_annotator_count,toxicity_annotator_count,worker,toxic,severe_toxic,identity_attack_y,insult_y,obscene_y,sexual_explicit_y,threat_y
13018271,5957153,0.0,"I largely agree with what you say, but the sit...",0.0,0.0,0.0,0.0,0.0,,,...,0,4,7848,0,0,0,0,0,0,0
15461001,6280264,0.0,So you don't agree with the research. \nCan yo...,0.0,0.0,0.0,0.0,0.0,,,...,0,4,7958,0,0,0,0,0,0,0
5807597,1056171,0.0,"No Donald, pretty much only you failed to reco...",0.0,0.0,0.0,0.0,0.0,,,...,0,4,7800,0,0,0,0,0,0,0
2115748,534483,0.0,But no real rebuttal. \nSure. If Trump was mo...,0.0,0.0,0.0,0.0,0.0,,,...,0,4,740,0,0,0,0,0,0,0
6034174,4967625,0.0,Apparently 70's style Soviet communal housing ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4,4,642,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15419717,6275150,0.0,"SS is not a tax, its more like a gov 401k.",0.0,0.0,0.0,0.0,0.0,,,...,0,4,957,0,0,0,0,0,0,0
9511437,5474571,0.0,If you hover over the link that is the referen...,0.0,0.0,0.0,0.0,0.0,,,...,0,4,7373,0,0,0,0,0,0,0
5912652,1071467,0.0,"Why am I thinking of Liberation Theology's ""ba...",0.0,0.0,0.0,0.0,0.0,,,...,0,4,6735,0,0,0,0,0,0,0
10187419,5566234,0.0,"If it's to be a woman, Susan Bonner. Men will ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4,4,7254,0,0,0,0,0,0,0


In [12]:
# Preprocess both subsets, concatenate them, and shuffle the rows with a fixed seed.
# The helper also prints whether any 'Text' value is missing in the combined frame.
full_evaluation_set = combined_randomized_preprocessing(toxic_set, non_toxic_set)

False


In [13]:
# Save the preprocessed evaluation set to CSV (without the index column) for later use.
full_evaluation_set.to_csv('../data/csv/unintended_bias_toxicity_classification_set.csv', index=False)