In [55]:
from math import nan
from string import punctuation

import pandas as pd
from nltk.corpus import stopwords

In [56]:
# Raw input CSV, loaded with pd.read_csv to build original_dataset.
ORIGINAL_PATH = "datasets/ruddit_with_text.csv"
# Destination CSV that the cleaned dataset is written to.
CLEAN_PATH = "datasets/cleaned_dataset.csv"

In [57]:
# Stored as a frozenset so that the per-token `in` check performed while
# extracting keywords is a constant-time hash lookup rather than a scan of
# the whole stop-word list.
english_stopwords = frozenset(stopwords.words("english"))
original_dataset = pd.read_csv(ORIGINAL_PATH)
original_dataset.head()

Unnamed: 0,post_id,comment_id,txt,url,offensiveness_score
0,42g75o,cza1q49,> The difference in average earnings between m...,https://www.reddit.com/r/changemyview/comments...,-0.083
1,42g75o,cza1wdh,"The myth is that the ""gap"" is entirely based o...",https://www.reddit.com/r/changemyview/comments...,-0.022
2,42g75o,cza23qx,[deleted],https://www.reddit.com/r/changemyview/comments...,0.167
3,42g75o,cza2bw8,The assertion is that women get paid less for ...,https://www.reddit.com/r/changemyview/comments...,-0.146
4,42g75o,cza2iji,You said in the OP that's not what they're mea...,https://www.reddit.com/r/changemyview/comments...,-0.083


In [58]:
def remove_punctuations(comment):
    """Return ``comment`` with each ASCII punctuation character replaced by a space.

    Every character listed in ``string.punctuation`` is swapped for exactly
    one space, so the length of the text is preserved.
    """
    punctuation_to_space = str.maketrans({symbol: " " for symbol in punctuation})
    return comment.translate(punctuation_to_space)


def extract_keywords(comment):
    """Return the lower-cased, alphabetic, non-stop-word tokens of a comment.

    Punctuation is first replaced by spaces via ``remove_punctuations``. The
    text is then split on any run of whitespace, tokens that are not purely
    alphabetic (numbers, mixed alphanumerics) are discarded, and the
    remaining tokens are lower-cased and filtered against
    ``english_stopwords``.

    Args:
        comment: Raw comment text. Non-string values (for example the float
            NaN pandas uses for an empty CSV cell) are treated as containing
            no keywords.

    Returns:
        list[str]: The keywords in their original order; may be empty.
    """
    if not isinstance(comment, str):
        return []
    comment = remove_punctuations(comment)
    # split() with no argument breaks on every kind of whitespace. Splitting
    # on " " alone left tokens such as "first\nsecond" glued together; those
    # then failed isalpha() and both words were silently lost.
    comment_words = [word.lower() for word in comment.split() if word.isalpha()]
    return [word for word in comment_words if word not in english_stopwords]


def extract_label(offensiveness_score):
    """Map a numeric offensiveness score onto a binary class label.

    Strictly positive scores are labelled "offensive"; every other score,
    zero included, is labelled "not_offensive".
    """
    if offensiveness_score > 0:
        return "offensive"
    return "not_offensive"

In [59]:
# Turn each comment body in the "txt" column into its list of keywords.
comment_texts = original_dataset["txt"]
comment_keywords = comment_texts.map(extract_keywords)

In [60]:
# Build the label and keyword columns in one step. Both are derived from
# original_dataset, so the rows line up one-to-one.
clean_dataset = pd.DataFrame(
    {
        "classification": original_dataset["offensiveness_score"].map(extract_label),
        "keywords": [" ".join(keywords) for keywords in comment_keywords],
    }
)
# Drop comments that ended up with no keywords at all. A boolean mask is used
# instead of `clean_dataset["keywords"].replace(..., inplace=True)`: that
# chained in-place call operates on a temporary under pandas Copy-on-Write and
# would leave the empty rows in place.
clean_dataset = clean_dataset[clean_dataset["keywords"] != ""]
clean_dataset.head()

Unnamed: 0,classification,keywords
0,not_offensive,difference average earnings men women explaine...
1,not_offensive,myth gap entirely based sex person
2,offensive,deleted
3,not_offensive,assertion women get paid less jobs get paid le...
4,not_offensive,said op measuring measuring aggregate entertai...


In [61]:
# Report how many rows of original_dataset are still present in clean_dataset.
print(f"from {len(original_dataset)} samples, {len(clean_dataset)} were valid")

from 5838 samples, 5837 were valid


In [62]:
# Write the cleaned dataset to CLEAN_PATH; index=False keeps the DataFrame
# index out of the CSV file.
clean_dataset.to_csv(CLEAN_PATH, index=False)