# Initial data exploration and other insights


In [56]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import os

In [36]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to D:\Code\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [57]:
RAW_DATA_PATH = os.path.join("..", "data/raw/filtered.tsv")
PREPROCESSED_DATA_PATH = os.path.join("..", "data/raw/dataset.csv")

## Data exploration

In [58]:
df = pd.read_csv(RAW_DATA_PATH, delimiter="\t")
print(f"{len(df)=}")
df.head()

len(df)=577777


Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


We want only data with `ref_tox` > `trn_tox`

In [15]:
relevant_data = df[df["ref_tox"] > df["trn_tox"]]
print(f"{len(relevant_data)=}")
relevant_data.head()

len(relevant_data)=319142


Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
5,5,I'm not gonna have a child... ...with the same...,I'm not going to breed kids with a genetic dis...,0.703185,0.206522,0.950956,0.035846
6,6,"They're all laughing at us, so we'll kick your...",they're laughing at us. We'll show you.,0.618866,0.230769,0.999492,0.000131
7,7,Maine was very short on black people back then.,there wasn't much black in Maine then.,0.720482,0.1875,0.96368,0.14871
11,11,"So now their spirits are cursed, walking back ...","their souls are cursed, they guard the paths, ...",0.755883,0.013245,0.842509,0.143992
13,13,"Come on, Cal, leave that shit alone.","come on, Cal, put it down.",0.660481,0.27027,0.999637,0.000279


However, when `ref_tox` < `trn_tox`, we can simply swap `reference` and `translation` columns

In [18]:
irrelevant_data = df[df["ref_tox"] < df["trn_tox"]]  # can become relevant!
print(f"{len(irrelevant_data)=}")
irrelevant_data.head()

len(irrelevant_data)=258635


Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


We definitely do not want to have in the final dataset entities, which have almost equal sentences in `reference` and `translation` columns. 

We can detect it using `similarity` and `lenght_diff` columns.

In [25]:
def get_almost_same_data(
    df: pd.DataFrame, similarity_threshold: float, length_diff_threshold: float
):
    almost_same_data = df[
        (df["similarity"] >= similarity_threshold)
        & (df["lenght_diff"] <= length_diff_threshold)
    ]
    print(f"length: {len(almost_same_data)}")
    return almost_same_data

In [26]:
almost_same_data1 = get_almost_same_data(df, 0.9, 0.05)
almost_same_data1.head()

length: 11044


Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
8,8,"Briggs, what the hell's happening?","Briggs, what the hell is going on?",0.920373,0.0,0.159096,0.841071
23,23,I think you are the strangest man I've ever met.,I think you're the weirdest person I've ever met.,0.934353,0.02,0.003785,0.962527
43,43,"I swear to God, the best thing I ever did in m...","I swear to God, the best thing I've ever done ...",0.932305,0.022472,0.999071,0.0009
88,88,What the heck was that all about?,what the fuck was that all about?,0.915779,0.0,0.027907,0.99883
140,140,I'm talking to myself like a nut.,I'm talking to myself like crazy.,0.902628,0.0,0.934245,0.037017


There are still some valid sentences, let us try tighter bounds.

In [32]:
almost_same_data2 = get_almost_same_data(df, 0.94, 0.02)
almost_same_data2.head()

length: 806


Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
1905,1905,Go ahead and pick your poison.,go ahead and take your poison.,0.949265,0.0,0.013163,0.980638
1959,1959,"Oh! So if you'll excuse me, I need to go jump ...","so if you'll excuse me, I have to go jump on m...",0.941725,0.010989,0.960817,0.132105
2358,2358,Don't die!,don't die.,0.944961,0.0,0.022741,0.935058
3107,3107,You shot him!,you shot him.,0.944284,0.0,0.011094,0.947987
4060,4060,"""Arse"", yes.","""butt,"" yes.",0.942148,0.0,0.970369,0.013613


Now it looks better: `reference` and `translation` are almost the same.

## Data preprocessing

In [40]:
from string import punctuation

ENGLISH_STOP_WORDS = set(stopwords.words("english"))
PUNCTUATIONS = set(punctuation)

Let us build new dataset with only useful and cleaned data  

Cleaning includes the following steps:
- Lowercase sentence
- Tokenize sentence
- Remove punctuation
- Remove stop words


In [44]:
def clean_text(text: str) -> list[str]:
    # lowercase
    lowercased = text.lower()

    # tokenize
    tokenized = word_tokenize(lowercased)

    # remove punctuation
    without_punctuation = filter(lambda w: w not in PUNCTUATIONS, tokenized)

    # remove stop words
    cleared = list(filter(lambda w: w not in ENGLISH_STOP_WORDS, without_punctuation))

    return cleared

In [45]:
clean_text("Go ahead and pick your poison.")

['go', 'ahead', 'pick', 'poison']

## Data building

Let us now build new dataset based on the insights from the "Data exploration" section.

New dataset will contain only two columns: `toxic` and `nontoxic`

In [54]:
def remove_almost_same_data(
    df: pd.DataFrame,
    similarity_threshold: float = 0.94,
    length_diff_threshold: float = 0.02,
) -> pd.DataFrame:
    return df[
        (df["similarity"] < similarity_threshold)
        & (df["lenght_diff"] > length_diff_threshold)
    ]


def extract_relevant_data(df: pd.DataFrame) -> pd.DataFrame:
    relevant_data = df[df["ref_tox"] > df["trn_tox"]]
    relevant_data = relevant_data[["reference", "translation"]]
    return relevant_data.rename(columns={"reference": "toxic", "translation": "nontoxic"})


def extract_irrelevant_data(df: pd.DataFrame) -> pd.DataFrame:
    irrelevant_data = df[df["ref_tox"] <= df["trn_tox"]]
    irrelevant_data = irrelevant_data[["reference", "translation"]]
    return irrelevant_data.rename(
        columns={"reference": "nontoxic", "translation": "toxic"}
    )


def build_dataset(df: pd.DataFrame) -> pd.DataFrame:
    clean_df = remove_almost_same_data(df)
    relevant_data = extract_relevant_data(clean_df)
    irrelevant_data = extract_irrelevant_data(clean_df)

    return pd.concat([relevant_data, irrelevant_data])

In [55]:
dataset = build_dataset(df)
print(f"{len(dataset)=}")
dataset.head()

len(dataset)=526410


Unnamed: 0,toxic,nontoxic
5,I'm not gonna have a child... ...with the same...,I'm not going to breed kids with a genetic dis...
6,"They're all laughing at us, so we'll kick your...",they're laughing at us. We'll show you.
7,Maine was very short on black people back then.,there wasn't much black in Maine then.
13,"Come on, Cal, leave that shit alone.","come on, Cal, put it down."
21,"That night, Li'l Dice satisfied his thirst to ...","that night, he satisfied his blood lust, and k..."


Save data

In [59]:
dataset.to_csv(PREPROCESSED_DATA_PATH, index=False)