# Data Cleaning: Exploratory Analysis

In [1]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,torch,lightning --conda

# OR
from watermark import watermark


print(watermark(packages="polars,scikit-learn,torch,lightning", python=True))

Python implementation: CPython
Python version       : 3.10.8
IPython version      : 8.23.0

numpy    : 1.26.4
pandas   : 2.2.2
polars   : 0.20.21
torch    : 2.2.2
lightning: 2.2.2

conda environment: n/a

Python implementation: CPython
Python version       : 3.10.8
IPython version      : 8.23.0

polars      : 0.20.21
scikit-learn: 1.4.2
torch       : 2.2.2
lightning   : 2.2.2



In [2]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

# Shared rich console with three named styles: info, warning and error.
custom_theme = Theme(dict(info="#76FF7B", warning="#FBDDFE", error="#FF0000"))
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt


# Pandas display limits: up to 1,000 rows / 1,000 columns are rendered and
# each cell shows at most 600 characters before being truncated.
pd.set_option("display.max_rows", 1_000)
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_colwidth", 600)

# NOTE(review): this silences every warning category for the whole session.
warnings.filterwarnings("ignore")


# Black code formatter (Optional)
# Optional: load the lab_black extension (Black code formatter for notebook
# cells). Requires the extension to be installed in the kernel environment.
%load_ext lab_black

# Load the autoreload extension and enable mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from toxic_classifier.utilities.dataloaders import (
    CyberBullyDataLoader,
    GHCDataLoader,
    ToxicCommentsDataLoader,
)

# Toxic-comment dataset paths. They are not passed to the loader in this cell
# (the matching arguments below are commented out); they stay defined in case
# other cells reference them. The train path has its own name so it is no
# longer silently overwritten by the `path` assignment that follows.
toxic_train_path: str = "../data/toxic_comment_data/train.csv"
labels_path: str = "../data/toxic_comment_data/test_labels.csv"
other_path: str = "../data/toxic_comment_data/test.csv"

# Cyberbullying tweets: the dataset actually loaded in this cell.
path: str = "../data/cyberbully_data/cyberbullying_tweets.csv"

# NOTE(review): the variable is called `ghc_dataloader` but it holds a
# CyberBullyDataLoader. The name is kept unchanged in case later cells use it;
# consider renaming once all usages are known.
ghc_dataloader: CyberBullyDataLoader = CyberBullyDataLoader(
    path=path,
    # labels_path=labels_path,
    # other_path=other_path,
    separator=",",  # presumably the CSV field delimiter; confirm in the loader
    stratify=True,  # semantics defined by the loader; not visible from this cell
)

# Last expression of the cell: whatever prepare_data() returns is displayed.
# In the saved run this is a tuple of frames with `text` and `dataset` columns
# (presumably a train/validation split; confirm against the loader).
ghc_dataloader.prepare_data()

(shape: (38_153, 2)
 ┌───────────────────────────────────┬─────────────┐
 │ text                              ┆ dataset     │
 │ ---                               ┆ ---         │
 │ str                               ┆ str         │
 ╞═══════════════════════════════════╪═════════════╡
 │ RT @DemonG0d9000: Quotes part 2 … ┆ cyber_bully │
 │ I feel like if I were on #MKR an… ┆ cyber_bully │
 │ RT @Davinci_beau: Shits annoying… ┆ cyber_bully │
 │ Cooking in a library or Disneyla… ┆ cyber_bully │
 │ 0ff my complexion off my quadrup… ┆ cyber_bully │
 │ …                                 ┆ …           │
 │ @Spacekatgal I'd say this is acc… ┆ cyber_bully │
 │ Glad I’m not the only one who pi… ┆ cyber_bully │
 │ Foolish Yoruba Muslim Arakunrin.… ┆ cyber_bully │
 │ You know Kat and Andre are awful… ┆ cyber_bully │
 │ Donald Trump says it's illegal t… ┆ cyber_bully │
 └───────────────────────────────────┴─────────────┘,
 shape: (9_539, 2)
 ┌───────────────────────────────────┬─────────────┐
 │ tex