## Data processing

Reads the raw comment data from CSV files.
Filters out deleted comments, cleans and tokenizes the text, and creates datasets for specific time periods.

In [104]:
from typing import List
import pandas as pd
from gensim.models import Phrases
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn
import re

def preprocess(filename=None, start_date=None, end_date=None):
    """Load a comments CSV and add a tokenized, lemmatized ``body_clean`` column.

    Args:
        filename: Path of the CSV to read. The columns ``id``, ``body``,
            ``author``, ``parent_id``, ``retrieved_on`` and ``timestamp``
            are loaded; ``id`` becomes the index and ``timestamp`` is
            parsed as a date.
        start_date: If truthy, keep only rows with ``timestamp >= start_date``.
        end_date: If truthy, keep only rows with ``timestamp < end_date``.

    Returns:
        The filtered DataFrame, without rows whose body is ``"[deleted]"``
        or ``"[removed]"``, plus a ``body_clean`` column holding a list of
        lemmatized tokens (with detected bigram phrases merged) per comment.
    """
    # get data from csv
    columns = ["id", "body", "author", "parent_id", "retrieved_on", "timestamp"]
    df = pd.read_csv(
        filename,
        index_col="id",
        usecols=columns,
        parse_dates=["timestamp"],
    )

    # filter by date
    if start_date:
        df = df[df.timestamp >= start_date]
    if end_date:
        df = df[df.timestamp < end_date]

    # remove rows where comment was deleted; a boolean mask (rather than
    # dropping by index label) cannot accidentally remove a live comment that
    # happens to share an id with a deleted one. The copy makes the later
    # column assignment safe.
    df = df[~df.body.isin(["[deleted]", "[removed]"])].copy()

    # remove links, convert to lowercase, tokenize, and remove short/long tokens
    # Only the URL itself (up to the next whitespace) is removed, so the text
    # following a link on the same line is kept.
    link_pattern = re.compile(r"https?://\S+")

    def remove_links_and_simple_preprocess(sentence: str) -> List[str]:
        return simple_preprocess(link_pattern.sub("", sentence))

    # NOTE(review): astype(str) turns a missing body into the string "nan",
    # which then becomes a token - confirm whether that is intended.
    simple_preprocessed = df["body"].astype(str).apply(
        remove_links_and_simple_preprocess
    )

    # function to lemmatize each token, based on its part of speech
    lemmatizer = WordNetLemmatizer()
    # first two letters of the tagger's POS tag -> WordNet POS constant
    morphy_tag = {"NN": wn.NOUN, "JJ": wn.ADJ, "VB": wn.VERB, "RB": wn.ADV}

    def lemmatize_token(token: str, pos: str) -> str:
        wordnet_pos = morphy_tag.get(pos[:2])
        if wordnet_pos is None:
            # no WordNet equivalent for this tag: use the lemmatizer's default
            return lemmatizer.lemmatize(token)
        return lemmatizer.lemmatize(token, wordnet_pos)

    # tokenize comments, preserving common bigram phrases
    # identify common bigram phrases
    phrases = Phrases(simple_preprocessed, scoring="npmi", threshold=0.7)

    def preprocess_tokens(preprocessed_sentence: List[str]) -> List[str]:
        # combine tokens that make up a phrase and drop associated score
        merged = [item[0] for item in phrases.analyze_sentence(preprocessed_sentence)]
        # lemmatize tokens
        return [lemmatize_token(token, pos) for token, pos in pos_tag(merged)]

    # create body_clean column: a preprocessed version of body
    df["body_clean"] = simple_preprocessed.apply(preprocess_tokens)
    return df

In [105]:
prefix = "data/raw/"
# Maps a dataset name to the keyword arguments passed to preprocess():
# the raw CSV path and the first date to include.
datasets = {
    name: {"filename": prefix + csv_name, "start_date": first_day}
    for name, csv_name, first_day in (
        ("incels", "Incels_comments.csv", "2015-11-07"),
        ("braincels", "Braincels_comments.csv", "2017-09-30"),
        ("trufemcels", "Trufemcels_comments.csv", "2019-01-30"),
        ("mensrights", "MensRights_comments.csv", "2021-01-01"),
        # add TheRedPill here
        ("feminism_full", "Feminism_comments.csv", "2015-11-07"),
        ("fourthwavewomen", "fourthwavewomen_comments.csv", "2021-01-01"),
        # add IntersectionalFems here
        # add Intersectionality here
        # add RadicalFeminism here
    )
}

In [106]:
# Preprocess every configured dataset and store the result as a pickle
# named after the dataset.
for subreddit, preprocess_kwargs in datasets.items():
    df = preprocess(**preprocess_kwargs)
    df.to_pickle(f"data/clean/{subreddit}.pkl")

  df = pd.read_csv(filename, index_col="id", usecols=["id", "body", "author", "parent_id", "retrieved_on", "timestamp"], parse_dates=['timestamp'])


2.762253761291504


In [None]:
# break feminism into time periods
# Each window covers two calendar years: [Jan 1 of start year, Jan 1 two years later).
feminism_full = pd.read_pickle("data/clean/feminism_full.pkl")
# NOTE(review): this list is never populated in this cell.
feminism_chunks = []
for i in range(2015, 2023, 2):
    window_start = f"{i}-01-01"
    window_end = f"{i + 2}-01-01"
    on_or_after_start = feminism_full.timestamp >= window_start
    before_end = feminism_full.timestamp < window_end
    feminism_chunk = feminism_full[on_or_after_start & before_end]
    feminism_chunk.to_pickle(f"data/clean/feminism_{i}_{i + 2}.pkl")

In [None]:
# combine incels into one df
# Loads each cleaned per-subreddit pickle, stacks them in list order, and
# writes the combined frame back out.
incel_subreddits = [
    "incels",
    "braincels",
    "trufemcels",
    "mensrights",
]  # add TheRedPill here
incel_dfs = []
for subreddit in incel_subreddits:
    incel_dfs.append(pd.read_pickle(f"data/clean/{subreddit}.pkl"))
full_df = pd.concat(incel_dfs)
full_df.to_pickle("data/clean/incels_full.pkl")

In [None]:
def load_df(subreddit, prefix="data/clean/"):
    """Read the pickled DataFrame stored at ``prefix + subreddit + ".pkl"``.

    Args:
        subreddit: Dataset name, used as the file stem.
        prefix: Directory prefix prepended as-is to the file name.

    Returns:
        The object returned by ``pd.read_pickle`` for that path.
    """
    pickle_path = prefix + subreddit + ".pkl"
    return pd.read_pickle(pickle_path)
# Sanity check: print the first rows of body_clean for every cleaned dataset.
subreddits = [
    "incels",
    "braincels",
    "trufemcels",
    "mensrights",
    "incels_full",
    "feminism_full",
    "feminism_2015_2017",
    "feminism_2017_2019",
    "feminism_2019_2021",
    "feminism_2021_2023",
]
for subreddit in subreddits:
    body_clean = load_df(subreddit)["body_clean"]
    print(body_clean.head())

id
d38cpyp                        [reply, to, you, on, trucels]
d38dsz9    [seduction, be, disastrous, for, woman, since,...
d38e0dh    [thank, you, it, hearten, to, see, more, and, ...
d3ej70j    [you, describe, yourself, a, non, chad, would,...
d3ek59w    [rate, my, attractiveness, maybe, out, of, my,...
Name: body_clean, dtype: object
id
doo7s2k    [why, do, not, have, fifty, subscriber, alread...
doo85t6    [here, some, previous, essay, story, and, poet...
dopbxoz                     [subscribe, do, get, an, upvote]
dopbyxv    [hate, psychology, it, brainwash, gimmick, to,...
dopcfvs    [damn, literally, study, cognitive, behavioral...
Name: body_clean, dtype: object
id
efb6vdz    [isn, the, girl, from, vampire, diary, bulgari...
efb6zku    [even, for, satire, eww, blehh, doesn, the, av...
efb72bt    [eh, wouldn, say, beautiful, woman, feel, unco...
efb7b2g    [yeah, not, say, she, doesn, face, her, own, i...
efb7c7o                                  [all, the, fresher]
Name: body_c