In [61]:
import json
import re
import math
import pandas as pd
from algorithms import *
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop-word list from NLTK, converted to a set.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded beforehand — confirm.
stop_words = set(stopwords.words('english'))

In [73]:
def combined_randomized_preprocessing(data_one, data_two):
    """Preprocess two datasets, stack them, and return the rows in shuffled order.

    Each input is passed through ``unintended_bias_preprocess`` (not defined in
    this notebook; presumably provided by ``from algorithms import *`` — verify).
    The two results are concatenated, shuffled with a fixed seed (42) so the
    ordering is reproducible, and re-indexed from 0.

    As a sanity check, prints whether any value in the ``'Text'`` column of the
    combined frame is missing (``True`` means at least one NaN is present).

    Parameters
    ----------
    data_one, data_two
        Inputs accepted by ``unintended_bias_preprocess``.

    Returns
    -------
    The combined, shuffled frame with a fresh 0..n-1 index.
    """
    preprocessed_parts = [unintended_bias_preprocess(part) for part in (data_one, data_two)]

    stacked = pd.concat(preprocessed_parts)
    # frac=1 draws every row exactly once, i.e. a full shuffle
    shuffled = stacked.sample(frac=1, random_state=42)
    shuffled = shuffled.reset_index(drop=True)

    text_has_missing = shuffled['Text'].isna().any()
    print(text_has_missing)
    return shuffled

#### Necessary pip installation commands include:
- pip install pandas
- pip install nltk
#### <b>Note</b>: make sure a <b>.env</b> file has already been created so that the API key can be accessed


In [63]:
# load the Jigsaw Unintended Bias in Toxicity Classification files:
# the training comments and the per-annotator toxicity annotations
jigsaw_unintended_train = pd.read_csv(
    filepath_or_buffer='../data/secondarydatasets/UnintendedBiasInToxicityClassification/train.csv'
)
jigsaw_unintended_inidividual_annotations = pd.read_csv(
    filepath_or_buffer='../data/secondarydatasets/UnintendedBiasInToxicityClassification/toxicity_individual_annotations.csv'
)

#### Preprocessing the Jigsaw Unintended Bias in Toxicity Classification dataset first: https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification

In [64]:
# inner join on 'id': only ids present in both frames are kept
merged_data = jigsaw_unintended_train.merge(
    jigsaw_unintended_inidividual_annotations,
    how='inner',
    on='id',
)

In [65]:
# display the merged frame (the notebook renders a truncated preview of its first and last rows)
merged_data

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene_x,identity_attack_x,insult_x,threat_x,asian,atheist,...,identity_annotator_count,toxicity_annotator_count,worker,toxic,severe_toxic,identity_attack_y,insult_y,obscene_y,sexual_explicit_y,threat_y
0,59848,0.000000,"This is so cool. It's like, 'would you want yo...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,3107,0,0,0,0,0,0,0
1,59848,0.000000,"This is so cool. It's like, 'would you want yo...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,5418,0,0,0,0,0,0,0
2,59848,0.000000,"This is so cool. It's like, 'would you want yo...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,1667,0,0,0,0,0,0,0
3,59848,0.000000,"This is so cool. It's like, 'would you want yo...",0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,5094,0,0,0,0,0,0,0
4,59849,0.000000,Thank you!! This would make my life a lot less...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,144,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15855261,6334009,0.621212,Anyone who is quoted as having the following e...,0.030303,0.030303,0.045455,0.621212,0.0,,,...,0,66,740,1,0,0,1,0,0,0
15855262,6334010,0.000000,Students defined as EBD are legally just as di...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,5698,0,0,0,0,0,0,0
15855263,6334010,0.000000,Students defined as EBD are legally just as di...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,7221,0,0,0,0,0,0,0
15855264,6334010,0.000000,Students defined as EBD are legally just as di...,0.000000,0.000000,0.000000,0.000000,0.0,,,...,0,4,1166,0,0,0,0,0,0,0


In [84]:
# number of rows drawn for each of the toxic and non-toxic sets
evaluation_size = 500
# NOTE(review): merged_data appears to hold one row per (comment, worker) annotation, so the same
# comment could be drawn more than once or end up in both sets — confirm this is intended.
toxic_set = merged_data.loc[merged_data['toxic'] == 1].sample(
    n=evaluation_size,
    random_state=43,
)
non_toxic_set = merged_data.loc[merged_data['toxic'] == 0].sample(
    n=evaluation_size,
    random_state=43,
)

In [85]:
# report how many rows each sampled set contains (one line per set)
print(
    f"Toxic set length: {len(toxic_set)}",
    f"Non-Toxic set length: {len(non_toxic_set)}",
    sep="\n",
)

Toxic set length: 500
Non-Toxic set length: 500


In [86]:
# preprocess both sampled sets and merge them into one shuffled evaluation frame
full_evaluation_set = combined_randomized_preprocessing(
    data_one=toxic_set,
    data_two=non_toxic_set,
)

False


In [88]:
# persist the preprocessed evaluation set for later use (row index is not written)
full_evaluation_set.to_csv(
    path_or_buf='../data/csv/unintended_bias_toxicity_classification_set.csv',
    index=False,
)