In [116]:
import pandas as pd

In [117]:
# Load the raw training CSVs of the two Jigsaw datasets (paths are relative to this notebook).
train_1 = pd.read_csv("../dataset/unprocessed/jigsaw-toxic-comment-classification-challenge/train.csv")
train_2 = pd.read_csv("../dataset/unprocessed/jigsaw-unintended-bias-in-toxicity-classification/train.csv")

In [118]:
# Load the matching raw test CSVs of the same two datasets.
test_1 = pd.read_csv("../dataset/unprocessed/jigsaw-toxic-comment-classification-challenge/test.csv")
test_2 = pd.read_csv("../dataset/unprocessed/jigsaw-unintended-bias-in-toxicity-classification/test.csv")

In [119]:
# (rows, columns) of the first training set as loaded
train_1.shape

(159571, 8)

In [120]:
# (rows, columns) of the first test set as loaded
test_1.shape

(153164, 2)

In [121]:
# (rows, columns) of the second test set as loaded
test_2.shape

(97320, 2)

### Add is_toxic feature and remove unnecessary columns from train_1

In [122]:
def label_is_toxic(row):
    """
        Collapse the six per-category labels of one row into a single flag.

        `row` is anything indexable by column name (e.g. a DataFrame row).
        Returns 1 as soon as one of the category columns equals 1, otherwise 0.
    """
    label_columns = ("toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate")
    # any() stops at the first hit, checking the columns in the order listed
    flagged = any(row[column] == 1 for column in label_columns)
    return 1 if flagged else 0

In [123]:
# Derive the binary label in one vectorized pass instead of calling a Python
# function per row: a comment is toxic (1) when any of the six category
# columns equals 1, otherwise 0. The result is cast to int64, the dtype the
# row-wise version produced.
train_1["is_toxic"] = (
    train_1[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]]
    .eq(1)
    .any(axis=1)
    .astype("int64")
)

In [124]:
# Keep only the identifier, the comment text and the derived binary label,
# then preview the first rows of the reduced frame.
train_1 = train_1.loc[:, ["id", "comment_text", "is_toxic"]]
train_1.head()

Unnamed: 0,id,comment_text,is_toxic
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0


### Add is_toxic feature and remove unnecessary columns from train_2

In [125]:
# Binarize the continuous `target` score with one vectorized comparison:
# scores of 0.5 or more become 1, everything else (including missing
# scores, which compare as False) becomes 0.
train_2["is_toxic"] = (train_2["target"] >= 0.5).astype("int64")

In [126]:
# Reduce train_2 to the same three columns kept for train_1 so the two frames
# can be concatenated, then preview the first rows.
train_2 = train_2.loc[:, ["id", "comment_text", "is_toxic"]]
train_2.head()

Unnamed: 0,id,comment_text,is_toxic
0,59848,"This is so cool. It's like, 'would you want yo...",0
1,59849,Thank you!! This would make my life a lot less...,0
2,59852,This is such an urgent design problem; kudos t...,0
3,59855,Is this something I'll be able to install on m...,0
4,59856,haha you guys are a bunch of losers.,1


### Add train_2 to train_1, test_2 to test_1, and ignore duplicate data

In [127]:
def merge(ds1, ds2):
    """
        Combine two frames, skipping ds2 rows whose comment already exists in ds1.

        Comments are compared after stripping leading/trailing whitespace; the
        stored text itself is not modified. Rows of ds2 that duplicate each
        other (but not a ds1 row) are all kept. Neither input is mutated.

        Returns a new DataFrame holding the retained ds2 rows first, followed
        by every row of ds1, with a fresh 0..n-1 index.
    """
    # Stripped copies are used for the membership test only, computed once each
    ds1_comments = ds1["comment_text"].str.strip()
    ds2_comments = ds2["comment_text"].str.strip()

    # True for ds2 rows whose stripped comment appears nowhere in ds1
    is_new = ~ds2_comments.isin(ds1_comments)

    # ds2 rows are placed ahead of ds1, preserving the existing output order
    return pd.concat([ds2[is_new], ds1], ignore_index=True)

In [128]:
# Fold train_2 into train_1 (train_2 rows whose comment is already in train_1
# are skipped) and check the combined size.
train_1 = merge(train_1, train_2)
train_1.shape

(1964294, 3)

In [129]:
# Same merge for the test sets: test_2 rows whose comment is already in test_1
# are skipped. Then check the combined size.
test_1 = merge(test_1, test_2)
test_1.shape

(250441, 2)

### Rename the merged train and test datasets

In [130]:
# Shorter names for the merged frames. This only binds new names: `train` and
# `test` refer to the very same DataFrame objects as `train_1` and `test_1`.
train, test = train_1, test_1

### Make sure comment_text is lowercase

In [131]:
# Lower-case the comment column of both frames, train first and then test.
for dataset in (train, test):
    dataset.loc[:, "comment_text"] = dataset["comment_text"].str.lower()

### Drop any training row whose comment_text is missing

In [132]:
# Remove training rows whose comment_text is missing (NaN/None); rows holding
# an empty string are kept. The column is passed as a list because a bare
# string `subset` is rejected by older pandas releases, while a list works
# everywhere.
# NOTE(review): only `train` is filtered here; `test` keeps any rows with a
# missing comment - confirm that is intended.
train = train.dropna(subset=["comment_text"], how="any", axis=0)

### Replacing contractions

In [133]:
import contractions

In [134]:
# Expand contractions in every comment. Each value is first converted with
# str() and the resulting text is then handed to contractions.fix.
# NOTE(review): str() turns a missing value into the literal text "nan";
# missing comments were dropped from `train` earlier but not from `test`.
train.loc[:, "comment_text"] = train["comment_text"].map(str).map(contractions.fix)
test.loc[:, "comment_text"] = test["comment_text"].map(str).map(contractions.fix)

In [135]:
# Preview the first rows after lower-casing and contraction expansion
train.head()

Unnamed: 0,id,comment_text,is_toxic
0,59848,"this is so cool. it is like, 'would you want y...",0
1,59849,thank you!! this would make my life a lot less...,0
2,59852,this is such an urgent design problem; kudos t...,0
3,59855,is this something i will be able to install on...,0
4,59856,haha you guys are a bunch of losers.,1


### Removing URLs

In [136]:
import re

In [137]:
def remove_URL(text):
    """
        Strip links from a string.

        Anything starting with http://, https:// or www. is deleted up to the
        next whitespace character; the rest of the text is returned unchanged.
    """
    url_pattern = re.compile(r"https?://\S+|www\.\S+")
    return url_pattern.sub("", text)

In [138]:
# Strip links from every comment; the helper is passed to apply directly
# rather than through a wrapping lambda.
train.loc[:, "comment_text"] = train["comment_text"].apply(remove_URL)
test.loc[:, "comment_text"] = test["comment_text"].apply(remove_URL)

### Removing HTML Tags

In [139]:
def remove_html(text):
    """
        Delete HTML markup from a string.

        Two things are removed in a single left-to-right pass: tags (the
        shortest run from '<' to the next '>' on the same line) and character
        entities written in lower case, such as &amp;, &#123; or &#x1f;.
    """
    # One combined pattern so tags and entities are matched in the same scan
    markup_pattern = r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});"
    return re.sub(markup_pattern, "", text)

In [140]:
# Strip HTML tags and entities from every comment in both frames
train.loc[:, "comment_text"] = train["comment_text"].apply(remove_html)
test.loc[:, "comment_text"] = test["comment_text"].apply(remove_html)

### Remove non-ASCII characters (since we are only concerned with English and basic ASCII symbols)

In [141]:
def remove_non_ascii(text):
    """
        Drop every character whose code point lies outside 0x00-0x7F
    """
    non_ascii = re.compile(r'[^\x00-\x7f]')
    return non_ascii.sub(r'', text)

In [142]:
# Drop non-ASCII characters from every comment in both frames
train.loc[:, "comment_text"] = train["comment_text"].apply(remove_non_ascii)
test.loc[:, "comment_text"] = test["comment_text"].apply(remove_non_ascii)

In [144]:
# Write the cleaned frames to the processed-data directory.
# NOTE(review): to_csv is called with its defaults, so the row index is written
# as a leading unnamed column - confirm downstream readers expect it.
train.to_csv(path_or_buf="../dataset/processed/train.csv")
test.to_csv(path_or_buf="../dataset/processed/test.csv")