# Data Cleaning: Exploratory Analysis

In [1]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt


# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

warnings.filterwarnings("ignore")


# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

### Load Data

In [2]:
# Gabe Hate Corpus
fp: str = "../data/ghc_data/**/*.tsv"
ghc_data: pl.DataFrame = pl.read_csv(source=fp, separator="\t")

ghc_data.head()

text,hd,cv,vo
str,i64,i64,i64
"""https://www.yo…",0,0,0
"""Very nice! I t…",0,0,0
"""Watch today. h…",0,0,0
""""" Thinking Ven…",0,0,0
"""What about dea…",0,0,0


In [3]:
# Add target label
ghc_data = ghc_data.with_columns(
    hateful_or_vulgar_or_cyberbully=pl.any_horizontal("hd", "cv", "vo")
)
ghc_data.group_by("hateful_or_vulgar_or_cyberbully").agg(pl.len())

hateful_or_vulgar_or_cyberbully,len
bool,u32
True,3377
False,24169


In [4]:
ghc_data.sample(n=20, seed=3).to_pandas()

Unnamed: 0,text,hd,cv,vo,hateful_or_vulgar_or_cyberbully
0,"It's Jordanetics Week at the Daily Meme Wars, where we'll be applying Dr. Jordan Peterson's anti-nationalist philosophy of extreme atomic individualism to some of the great moments in history. Today, we have General Peterson's advice on how to win a war.",0,0,0,False
1,"That depends what you mean by a ""bad"" joke. If it just falls flat, that's in the ear of the listener. If you think it's inherently blasphemous, I disagree. I told it to a captain of the sodality at St. Thomas the Apostle parish this morning, and she thought it was funny.",0,0,0,False
2,How about a Hyperloop instead?,0,0,0,False
3,Stein Mart cardholders could sign on to their credit card account online and also handle their - http://www.clipsit.net/www-steinmartcredit-com/,0,0,0,False
4,"A homosexual character hits on straight guy and then wonders why straights are mad at him, there is full on sex with the sea monster, sadistic chained torture, many abusive white males and of course evil Russians. Total NWO propaganda but filmed exquisitely.",1,0,0,True
5,"Swedish Activist: ""Everyone's Afraid To Be Branded A Racist"" | Zero Hedge https://www.zerohedge.com/news/2018-04-23/swedish-activist-everyones-afraid-be-branded-racist",0,0,0,False
6,Let's hope Conrad Black is right.,0,0,0,False
7,"I had a lot of fun with her, but she ended up leaving 🙁. I would have loved to keep her, but I didn't know if she was a stray of if she was someone's pet. My cat also probably wouldn't have liked her too. Oh well, I had a lot of fun with Q'yle while she stayed.",0,0,0,False
8,"shit is getting real right now, peoples are being murdered left, right and center.",0,0,1,True
9,"When it comes to wearing a leather jacket, many people feel at a loss or self-conscious in the - https://www.independencebrothers.com/blog/layering-with-a-jacket #leather_jacket #fashion #jacket #style",0,0,0,False


## To Do

-  Remove punctuations.
-  Normalize (lowercase) data.
-  Remove stop words.
-  Remove non letter characters (numbers, symbols, emojis, etc).
-  Remove URLs.
-  Remove hashtags.
-  Remove mentions and usernames.
-  Remove HTML tags.
-  Implement spell correction.

In [5]:
ghc_data = ghc_data.select(["text", "hateful_or_vulgar_or_cyberbully"])
ghc_data.head()

text,hateful_or_vulgar_or_cyberbully
str,bool
"""https://www.yo…",False
"""Very nice! I t…",False
"""Watch today. h…",False
""""" Thinking Ven…",False
"""What about dea…",False


## Toxic Comments Classification Data

In [6]:
test_fp: str = "../data/toxic_comment_data/test_labels.csv"
toxic_comments_test_labels: pl.DataFrame = pl.read_csv(source=test_fp)
toxic_comments_test_labels.head()

id,toxic,severe_toxic,obscene,threat,insult,identity_hate
str,i64,i64,i64,i64,i64,i64
"""00001cee341fdb…",-1,-1,-1,-1,-1,-1
"""0000247867823e…",-1,-1,-1,-1,-1,-1
"""00013b17ad220c…",-1,-1,-1,-1,-1,-1
"""00017563c3f791…",-1,-1,-1,-1,-1,-1
"""00017695ad8997…",-1,-1,-1,-1,-1,-1


### Comment

- From the data dict, `-1` means it was **NOT** used for scoring/grading the performance of the model.
- I'll drop rows with `-1` labels.

In [7]:
toxic_comments_test_labels = toxic_comments_test_labels.with_columns(
    contains_scoring_data=pl.mean_horizontal(
        "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
    ).round(2)
)
toxic_comments_test_labels.group_by("contains_scoring_data").agg(pl.len())

contains_scoring_data,len
f64,u32
0.17,1842
0.5,2081
0.67,611
0.0,57735
-1.0,89186
0.83,165
1.0,14
0.33,1530


In [8]:
toxic_comments_test_labels.head()

id,toxic,severe_toxic,obscene,threat,insult,identity_hate,contains_scoring_data
str,i64,i64,i64,i64,i64,i64,f64
"""00001cee341fdb…",-1,-1,-1,-1,-1,-1,-1.0
"""0000247867823e…",-1,-1,-1,-1,-1,-1,-1.0
"""00013b17ad220c…",-1,-1,-1,-1,-1,-1,-1.0
"""00017563c3f791…",-1,-1,-1,-1,-1,-1,-1.0
"""00017695ad8997…",-1,-1,-1,-1,-1,-1,-1.0


In [9]:
toxic_comments_test_labels.filter(pl.col("contains_scoring_data").eq(0.17))

id,toxic,severe_toxic,obscene,threat,insult,identity_hate,contains_scoring_data
str,i64,i64,i64,i64,i64,i64,f64
"""00091c35fa9d04…",1,0,0,0,0,0,0.17
"""001d739c97bc2a…",1,0,0,0,0,0,0.17
"""005f47397e07e1…",1,0,0,0,0,0,0.17
"""00b3813b966af7…",1,0,0,0,0,0,0.17
"""00bd66c9ef023f…",1,0,0,0,0,0,0.17
…,…,…,…,…,…,…,…
"""feb3e338da2c94…",1,0,0,0,0,0,0.17
"""ff2b1797bd5532…",1,0,0,0,0,0,0.17
"""ff2b94f0258d8e…",1,0,0,0,0,0,0.17
"""ffd49b8defd069…",0,0,0,0,1,0,0.17


In [10]:
# Drop the labels with `-1`
toxic_comments_test_labels = toxic_comments_test_labels.filter(
    pl.col("contains_scoring_data").ne(-1.0)
)
toxic_comments_test_labels.sample(n=10)

id,toxic,severe_toxic,obscene,threat,insult,identity_hate,contains_scoring_data
str,i64,i64,i64,i64,i64,i64,f64
"""1026dc2f9e5424…",1,1,1,1,1,0,0.83
"""7f610c7c7609db…",0,0,0,0,0,0,0.0
"""8573f801fef780…",0,0,0,0,0,0,0.0
"""a7e038cf7f2459…",0,0,0,0,0,0,0.0
"""1f1ae7e624b2e7…",0,0,0,0,0,0,0.0
"""2fe07ae742f220…",0,0,0,0,0,0,0.0
"""da3a88c7bd16bc…",0,0,0,0,0,0,0.0
"""e1951a110f0d41…",0,0,0,0,0,0,0.0
"""176a3211904adf…",0,0,0,0,0,0,0.0
"""59a2c48630fdf3…",0,0,0,0,0,0,0.0


In [11]:
toxic_comments_test_labels.group_by("contains_scoring_data").agg(pl.len())

contains_scoring_data,len
f64,u32
0.67,611
0.0,57735
0.5,2081
0.33,1530
0.83,165
1.0,14
0.17,1842


### Comment

- `contains_scoring_data`=0.0 means NOT hateful/vulgar.
- `contains_scoring_data`!=0.0 means hateful/vulgar.

In [12]:
toxic_comments_test_labels = toxic_comments_test_labels.with_columns(
    hateful_or_vulgar_or_cyberbully=pl.when(pl.col("contains_scoring_data").eq(0.0))
    .then(pl.lit(False))
    .otherwise(pl.lit(True))
).select(["id", "hateful_or_vulgar_or_cyberbully"])

toxic_comments_test_labels.sample(n=10, seed=1)

id,hateful_or_vulgar_or_cyberbully
str,bool
"""b5db51ef0db2da…",False
"""09fbcd08da930e…",False
"""2e7d4ad66d8d3c…",False
"""574782f0c5334a…",False
"""7967ef54886d45…",True
"""f50da7f970b6bf…",True
"""1c93d9179e4739…",False
"""68bc1bec1bb779…",False
"""4a2fbf97936421…",False
"""b629995dd0f0f0…",True


In [13]:
fp: str = "../data/toxic_comment_data/test.csv"
toxic_comments_test_data: pl.DataFrame = pl.read_csv(source=fp)
toxic_comments_test_data.head()

id,comment_text
str,str
"""00001cee341fdb…","""Yo bitch Ja Ru…"
"""0000247867823e…","""== From RfC ==…"
"""00013b17ad220c…",""""" == Source…"
"""00017563c3f791…",""":If you have a…"
"""00017695ad8997…","""I don't anonym…"


In [14]:
# Merge the data
toxic_comments_test_data = toxic_comments_test_data.join(
    toxic_comments_test_labels, on="id"
)
toxic_comments_test_data.sample(n=10, seed=5).to_pandas()

Unnamed: 0,id,comment_text,hateful_or_vulgar_or_cyberbully
0,bab1b5c4286c7364,""" \n {| class=""""messagebox protected"""" style=""""border: 1px solid #8888aa; padding: 0px; font-size:9pt;"""" \n |align=""""center""""| \n |align=""""left""""| This user or user talk page has been temporarily protected from editing by new or unregistered users to prevent blocked users from removing warnings or introducing vandalism to it. (Administrators, please remember to add the user to Wikipedia:List of protected pages/User pages.) \n |} \n\n """,False
1,b597d458feea1b38,. One of the books is in English and the other directly claims that the translation 'Triglav cap' can be used,False
2,56ba22742fb29153,Pies taste like butt holes!!!,False
3,9d2133158807b8f8,"Filll]] 05:32, 26 December 2006",False
4,5b509d4b41475d50,fucks you in your ear pussies,True
5,bfbd18e2b7189192,== hey == \n\n what the fuck is your problem nigga?,True
6,8645bcd76ca16f24,== Dennis Bayens == \n\n Likes to have ass sex.. \n\n \n with men.,True
7,bd26f3dc84c75530,"== Copyright tagging == \n\n When you add a db-copyvio, please indicate what the text is a copy of. If there is no url that you can use with the URL parameter, please use the edit summary, a note on the discussion page, or something. Thank you, |",False
8,c1d2d6a2bd5e9e84,""" March 2015 (UTC) \n ::::::Very cluttered? We currently show five duals. The worst case, then, is five additional rows, or a 25% increase, somewhat lower if you consider caption and heading. The way to prevent mistakes is to be careful, and any mistakes would be correctable in any case. ― 00:03, 27""",False
9,5d2861330fe88700,== ya what son == \n\n ya well owning with meteor shower is way more important than stupid greek history biatch,True


In [15]:
# Train data
fp: str = "../data/toxic_comment_data/train.csv"
toxic_comments_train_data: pl.DataFrame = pl.read_csv(source=fp)

toxic_comments_train_data.head()

id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
str,str,i64,i64,i64,i64,i64,i64
"""0000997932d777…","""Explanation Wh…",0,0,0,0,0,0
"""000103f0d9cfb6…","""D'aww! He matc…",0,0,0,0,0,0
"""000113f07ec002…","""Hey man, I'm r…",0,0,0,0,0,0
"""0001b41b1c6bb3…",""""" More I can't…",0,0,0,0,0,0
"""0001d958c54c6e…","""You, sir, are …",0,0,0,0,0,0


In [16]:
toxic_comments_train_data = toxic_comments_train_data.with_columns(
    hateful_or_vulgar_or_cyberbully=pl.mean_horizontal(
        "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
    ).round(2)
)
toxic_comments_train_data.group_by("hateful_or_vulgar_or_cyberbully").agg(pl.len())

hateful_or_vulgar_or_cyberbully,len
f64,u32
0.67,1760
0.0,143346
0.17,6360
0.5,4209
0.83,385
0.33,3480
1.0,31


In [17]:
toxic_comments_train_data = toxic_comments_train_data.with_columns(
    hateful_or_vulgar_or_cyberbully=pl.when(
        pl.col("hateful_or_vulgar_or_cyberbully").eq(0.0)
    )
    .then(pl.lit(False))
    .otherwise(pl.lit(True))
).select(["id", "comment_text", "hateful_or_vulgar_or_cyberbully"])

toxic_comments_train_data.sample(n=10, seed=1).to_pandas()

Unnamed: 0,id,comment_text,hateful_or_vulgar_or_cyberbully
0,107038eee0ac312c,OGB \n\nHi. I just wanted to let you know I've got plenty of non-chess edits in my history. Just check. Thanks.,False
1,cb33bcbaf63bc0a9,"BISE \n\nLemonMonday, I thought I would repeat here what I have already said to you on other article talk pages. As far as I am concerned the WP:BISE project is dead. I was the last person to post there on 6 February and before that it was the 3 December 2010 that there was any activity. There are many item there which have not been actioned and both the Admin's that were involved have disappeared. For my part I will continue to abide by the general WP guidelines and if I see articles that uses the term BI incorrectly I will discuss on the talk page and look at resolving the issue within W...",False
2,94c3722172bdd1a8,"should unless someone can provide additional citation. \n\nThe baboon incident, while no one seems to claim it didn't happen, may be classed as WP:FRINGE and hence non-encyclopedic example due to the fact that it was not published in a scholarly journal (notwithstanding the researcher's reputation for unethical behavior)",False
3,2ed0945470fc151e,By the way Collectonian - I'm a HE not a SHE. If it makes any difference at this point.,False
4,af8388e11b823826,"""\n\nBridget Marquardt being divorced\nAfternoon,\nU need a DIRECT link IN THE ARTICLE not on the talk page. \nPlus when u click on the 1st link 1 is taken 2 a page where 1 must make a choice; how does 1 know whether or not 2 choose m/c/p/r or w?\nMaybe if u know how 2 access the info, take a screenshot & upload it to the web, then link that. \n""""snarkpit"""" I dont think that is a reliable source. 70.108.133.72 """,False
5,f93be8c67167fcfa,"""\n\n Little Tidbit of information needs citation \n\n""""The album's third single will be released after the album's release and Gaga has stated she wants fans to choose the third single. However, Gaga herself would like to see the next single to be 'Marry the Night.'""""\n\nIs there any confirmation of this? 70.131.151.38 """,False
6,cbf514a48998575f,"(The dialog would not be modal in the Raskin's sense while the user is dragging the window, for example; but that is a marginal case).",False
7,4d50db1fa0d90de8,"Stop my Vandalizing??? \n\nI don't think so, stay out of my damn business you shithead or I'll kick your ass!!!",True
8,4b410f8634d4f3ae,th 2005\nArchive 3 — November 22nd 2005 - December 29,False
9,9af4f7245107f90d,"""\n\n Matrixism \n\nRemoved Matrixism from this list as it does not appear to be a """"fictional"""" religion. """,False


In [18]:
# Concatenate the data
toxic_comments_data: pl.DataFrame = pl.concat(
    [toxic_comments_train_data, toxic_comments_test_data], how="vertical"
)

toxic_comments_data.head()

id,comment_text,hateful_or_vulgar_or_cyberbully
str,str,bool
"""0000997932d777…","""Explanation Wh…",False
"""000103f0d9cfb6…","""D'aww! He matc…",False
"""000113f07ec002…","""Hey man, I'm r…",False
"""0001b41b1c6bb3…",""""" More I can't…",False
"""0001d958c54c6e…","""You, sir, are …",False


### Cyberbullying Data

In [19]:
fp: str = "../data/cyberbully_data/cyberbullying_tweets.csv"
cyberbullying_data: pl.DataFrame = pl.read_csv(source=fp)

cyberbullying_data

tweet_text,cyberbullying_type
str,str
"""In other words…","""not_cyberbully…"
"""Why is #aussie…","""not_cyberbully…"
"""@XochitlSuckkk…","""not_cyberbully…"
"""@Jason_Gio meh…","""not_cyberbully…"
"""@RudhoeEnglish…","""not_cyberbully…"
…,…
"""Black ppl aren…","""ethnicity"""
"""Turner did not…","""ethnicity"""
"""I swear to God…","""ethnicity"""
"""Yea fuck you R…","""ethnicity"""


In [23]:
cyberbullying_data.group_by("cyberbullying_type").agg(pl.len()).to_pandas()

Unnamed: 0,cyberbullying_type,len
0,other_cyberbullying,7823
1,not_cyberbullying,7945
2,religion,7998
3,age,7992
4,gender,7973
5,ethnicity,7961


In [21]:
cyberbullying_data.filter(pl.col("cyberbullying_type").eq("religion")).to_pandas()

Unnamed: 0,tweet_text,cyberbullying_type
0,"Sudeep, did she invite him though? No right? Why are you getting worded up? You're okay with Parvesh Verma cause he speaks against Muslims but against an idiot like Imam because he called for chakka jam?",religion
1,@discerningmumin Islam has never been a resistance to oppression. It has always been source of oppression to both believers and non believer,religion
2,"Boy, your comment about Journalists wanting to keep churches closed is beneath you. As a Christian woman and human being your bosses filth is brushing off on you. Not at all unbiased and a down right lie. SHAME ON YOU.",religion
3,@ShashiTharoor @INCIndia Hindus were and are getting killed by Muslims terriorists in Kashmir. Congress mukt bharat will certainly happen if congressmen like you don’t change their idealogies and keep sounding like idiots. You actually don’t need enemies.,religion
4,"White supremicists? How many do you know? There a few idiots in all races. Where is anti-semitism coming from? Dems, BLM, Antifa, Muslims. You won’t appease them by throwing white supremacy. They hate you &amp; want to destroy Israel &amp; all Jews &amp; you know it.",religion
...,...,...
7993,Can you imagine if Christians came together like that 5 times a day? As long as New Yorkers continue to vote Corrupt Dirtbags into office this situation will only get worse.Ppl need to realize Radical Muslims dont give a damn about Americans regardless of which party you support.,religion
7994,So how to support justice from the initial problem? It morphed into all that and became radical. People and Christians want to support legitimate justice.,religion
7995,RT @TRobinsonNewEra: If you harbour any doubts about what % Muslims believe sharia note this data 36%/310.5 million nontrivial http://t.co/…,religion
7996,"@dankmtl @PeaceNotHate_ One thing about Muslims, they want to exterminate everyone who is not a Muslim. They are doing it around the world.",religion


In [24]:
cyberbullying_data.filter(
    pl.col("cyberbullying_type").eq("not_cyberbullying")
).to_pandas()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was crapilicious! #mkr",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImACelebrityAU #today #sunrise #studio10 #Neighbours #WonderlandTen #etc,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red velvet cupcakes?,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, but not too concerned about another angry dude on twitter.",not_cyberbullying
4,"@RudhoeEnglish This is an ISIS account pretending to be a Kurdish account. Like Islam, it is all lies.",not_cyberbullying
...,...,...
7940,I don't know what I want to wear#ugh,not_cyberbullying
7941,Argh another round of instant restaurants....over it!!!! #mkr,not_cyberbullying
7942,Teacher sets up new charity to tackle anti-gay bullying http://t.co/3GCzf1x,not_cyberbullying
7943,"I can barely tolerate Kat and Andre, Katie and Nikki are annoying but I'd rather see them than Kat and Andre #mkr",not_cyberbullying


In [26]:
cyberbullying_data = cyberbullying_data.with_columns(
    hateful_or_vulgar_or_cyberbully=pl.when(
        pl.col("cyberbullying_type").eq("not_cyberbullying")
    )
    .then(pl.lit(False))
    .otherwise(pl.lit(True))
).select(["tweet_text", "cyberbullying_type"])

cyberbullying_data.sample(n=10, seed=1).to_pandas()

Unnamed: 0,tweet_text,cyberbullying_type
0,Bitches Like there was this girl in high school who bullied me and then we became moms at the same time so now she thinks we can be friends like go away lol,age
1,"@cortanav yo, you're enforcing at PAX this year, right? can you find a person? https://t.co/uKxe6fVVfc",not_cyberbullying
2,RT @Jean_D_Almeida: Im not sexist but its just amazing when a woman knows how to cook! #lovrmesomefood,gender
3,@ZakaWaqar kaash kbhi ksi basic rights k lye is generation ko D Chowk pe laya hota... Shame on you idiot. People like you are curse on the Muslim nation. #ChickenDinnerAtDChowk,religion
4,Minnesota this is what happens maybe you idiots that vote liberal will understand now and you Voted a Muslim radical in the the House of Representatives you get what's coming to you!,religion
5,Fucking moral relativist bullshit. Fucking weak ass nigger culture. Fucking Jew ass ruling class. Fucking dumb sedated docile life style. Fucking demon pawns everywhere. Fuck,ethnicity
6,"Black. White. Hispanic. Muslim. Christian. Woman. Man. ... As long as he/she is capable and is doing right by/for people, race or belief don't matter. It must be 200% better than that villain (+ administration) that is in the WH right now. #VoteBidenToSaveAmerica",religion
7,Miley Cyrus makes date rape joke onstage at GAY http://dlvr.it/5f1HTS,gender
8,"My uncle doesn't rape me fucking good for him, well done for being a basic mildly decent human being, want a medal? #Blameonenotall",other_cyberbullying
9,The idiot knows nothing...in Israel all civil ceremonies are legal as with all other modern countries ...if you want an orthodox Jewish or Muslim wedding with another faith An iman or rabbi won’t marry the two however that doesn’t stop a legal civil one from occurring,religion
