## Data processing

Reads the raw comment data from CSV files, cleans and tokenizes it,
then filters it and creates datasets for specific time periods.

In [103]:
from typing import List
import pandas as pd
from gensim.models import Phrases
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn
import re
from nltk.stem.util import suffix_replace
from urllib.parse import urlparse

def preprocess(filename=None, start_date=None, end_date=None):
    """Load a comments CSV and add a tokenized, lemmatized ``body_clean`` column.

    Args:
        filename: Path or buffer handed to ``pd.read_csv``. The CSV must
            provide the columns ``id``, ``body``, ``author``, ``parent_id``,
            ``retrieved_on`` and ``timestamp``.
        start_date: If truthy, keep only rows with ``timestamp >= start_date``.
        end_date: If truthy, keep only rows with ``timestamp < end_date``.

    Returns:
        The filtered DataFrame, indexed by ``id``, with an extra
        ``body_clean`` column holding one list of tokens per comment.
    """
    # get data from csv
    df = pd.read_csv(
        filename,
        index_col="id",
        usecols=["id", "body", "author", "parent_id", "retrieved_on", "timestamp"],
        parse_dates=["timestamp"],
    )

    # filter by date (start inclusive, end exclusive)
    if start_date:
        df = df[df.timestamp >= start_date]
    if end_date:
        df = df[df.timestamp < end_date]

    # Remove rows whose comment was deleted or removed. A boolean mask is used
    # instead of dropping by index label so that a duplicated id cannot take
    # unrelated, still-valid rows down with it.
    df = df[~df.body.isin(["[deleted]", "[removed]"])].copy()

    def is_url(word):
        # urlparse raises ValueError on some malformed inputs (for example an
        # unbalanced "[" in the host part); such words are left untouched
        # rather than aborting the whole dataset.
        try:
            parsed_url = urlparse(word)
        except ValueError:
            return False
        return bool(parsed_url.scheme and parsed_url.netloc)

    # replace links with a placeholder, undo html escapes, then let
    # simple_preprocess lowercase, tokenize and drop short/long tokens
    def pre_preprocessing(sentence):
        words = ["removed_url" if is_url(word) else word for word in sentence.split()]
        sentence = ' '.join(words)
        sentence = sentence.replace("&amp;", "and")
        sentence = sentence.replace("&gt;", "")
        return simple_preprocess(sentence)

    simple_preprocessed = df['body'].astype(str).apply(pre_preprocessing)

    # function to lemmatize each token, based on its part of speech
    lemmatizer = WordNetLemmatizer()
    # First two characters of the pos_tag tag -> WordNet part of speech.
    # Built once here rather than once per token.
    morphy_tag = {'NN': wn.NOUN, 'JJ': wn.ADJ,
                  'VB': wn.VERB, 'RB': wn.ADV}

    def lemmatize_token(t: str, pos: str):
        # Any tag outside the table is lemmatized as a noun.
        return lemmatizer.lemmatize(t, morphy_tag.get(pos[:2], wn.NOUN))

    # task specific post-preprocessing: merge variants the lemmatizer leaves
    # apart (men and man, female and females, incel and incels,
    # marginalized and marginalised, libfem and libfems)
    canonical_forms = {
        "men": "man",
        "incels": "incel",
        "libfems": "libfem",
        "females": "female",
        "marginalised": "marginalized",
    }

    def post_preprocessing(t):
        return canonical_forms.get(t, t)

    # tokenize comments, preserving common bigram phrases
    # identify common bigram phrases
    phrases = Phrases(simple_preprocessed, scoring="npmi", threshold=0.7)

    def preprocess_sentence(preprocessed_sentence: List[str]) -> List[str]:
        # combine tokens that make up a phrase and drop associated score
        simple_tokens = [t[0] for t in phrases.analyze_sentence(preprocessed_sentence)]
        # lemmatize tokens according to their part-of-speech tag
        tokens = [lemmatize_token(t, pos) for t, pos in pos_tag(simple_tokens)]
        return [post_preprocessing(t) for t in tokens]

    # create body_clean column: a preprocessed version of body
    df['body_clean'] = simple_preprocessed.apply(preprocess_sentence)
    return df

In [104]:
# Directory holding the raw per-subreddit comment dumps.
prefix = "../data/raw/"

# One row per dataset: (dataset key, CSV file name, first date to keep).
_raw_sources = [
    ("incels", "Incels_comments.csv", "2015-11-07"),
    ("braincels", "Braincels_comments.csv", "2017-09-30"),
    ("trufemcels", "Trufemcels_comments.csv", "2019-01-30"),
    ("mensrights", "MensRights_comments.csv", "2021-01-01"),
    ("theredpill", "TheRedPill_comments.csv", "2021-01-01"),
    ("feminism_full", "Feminism_comments.csv", "2015-11-07"),
    ("fourthwavewomen", "fourthwavewomen_comments.csv", "2021-01-01"),
    ("women", "women_comments.csv", "2021-01-01"),
    ("feminisms", "feminisms_comments.csv", "2021-01-01"),
    ("blackladies", "blackladies_comments.csv", "2021-01-01"),
    ("feminismuncensored", "FeminismUncensored_comments.csv", "2021-01-01"),
    ("fireyfemmes", "FIREyFemmes_comments.csv", "2021-01-01"),
]

# Maps each dataset key to its "filename" and "start_date" settings.
datasets = {
    key: {"filename": prefix + csv_name, "start_date": first_day}
    for key, csv_name, first_day in _raw_sources
}

In [111]:
# Preprocess every configured dataset and pickle the result. A failure in one
# dataset is reported and the loop moves on to the next one.
for subreddit in datasets:
    try:
        df = preprocess(**datasets[subreddit])
        df.to_pickle("../data/clean/"+subreddit+".pkl")
    except Exception as exc:
        # Catch Exception rather than everything so that interrupting the
        # kernel (KeyboardInterrupt) still stops the run, and include the
        # error so the cause of the failure is not lost.
        print("FAILED: " + subreddit + " (" + repr(exc) + ")")

  df = pd.read_csv(filename, index_col="id", usecols=["id", "body", "author", "parent_id", "retrieved_on", "timestamp"], parse_dates=['timestamp'])
  df = pd.read_csv(filename, index_col="id", usecols=["id", "body", "author", "parent_id", "retrieved_on", "timestamp"], parse_dates=['timestamp'])
  df = pd.read_csv(filename, index_col="id", usecols=["id", "body", "author", "parent_id", "retrieved_on", "timestamp"], parse_dates=['timestamp'])
  df = pd.read_csv(filename, index_col="id", usecols=["id", "body", "author", "parent_id", "retrieved_on", "timestamp"], parse_dates=['timestamp'])
  df = pd.read_csv(filename, index_col="id", usecols=["id", "body", "author", "parent_id", "retrieved_on", "timestamp"], parse_dates=['timestamp'])


In [112]:
# Split the cleaned feminism_full dataset into consecutive two-year windows
# (1 January of the first year up to, but excluding, 1 January two years
# later) and pickle each window separately.
feminism_full = pd.read_pickle("../data/clean/feminism_full.pkl")
# NOTE(review): feminism_chunks is created but never appended to in this cell.
feminism_chunks = []
for first_year in range(2015, 2023, 2):
    last_year = first_year + 2
    window_start = str(first_year) + "-01-01"
    window_end = str(last_year) + "-01-01"
    in_window = (feminism_full.timestamp >= window_start) & (feminism_full.timestamp < window_end)
    feminism_chunk = feminism_full[in_window]
    feminism_chunk.to_pickle("../data/clean/feminism_" + str(first_year) + "_" + str(last_year) + ".pkl")

In [113]:
# Stack the cleaned incel-community datasets into a single DataFrame and
# store it as incels_full.
incel_subreddits = ["incels", "braincels", "trufemcels", "mensrights"]  # original note: add the redpill
clean_paths = ["../data/clean/" + name + ".pkl" for name in incel_subreddits]
incel_dfs = list(map(pd.read_pickle, clean_paths))
full_df = pd.concat(incel_dfs)
full_df.to_pickle("../data/clean/incels_full.pkl")

In [7]:
import plotly.express as px
import pandas as pd
# build timeline
def load_df(subreddit, prefix="../data/clean/"):
    """Return the DataFrame unpickled from ``prefix + subreddit + ".pkl"``."""
    pickle_path = prefix + subreddit + ".pkl"
    return pd.read_pickle(pickle_path)
# Datasets shown on the timeline, in plotting order.
subreddits = [
    "incels", "braincels", "trufemcels", "mensrights", "incels_full",
    "feminism_full", "feminism_2015_2017", "feminism_2017_2019",
    "feminism_2019_2021", "feminism_2021_2023", "fourthwavewomen", "women",
    "blackladies", "feminismuncensored", "feminisms", "fireyfemmes",
]
# Datasets labelled "Incel"; every other dataset is labelled "Feminism".
incel_datasets = ["incels", "braincels", "trufemcels", "mensrights", "incels_full"]

# One record per dataset: its name, first and last timestamp, and community.
timeline_info = []
for subreddit in subreddits:
    df = load_df(subreddit)
    timeline_info.append({
        "Dataset": subreddit,
        "Start": df['timestamp'].min(),
        "End": df['timestamp'].max(),
        "Community": "Incel" if subreddit in incel_datasets else "Feminism",
    })

# Bar colour per community (original note: "In line with steering :)").
colors = {
    "Incel": "rgb(242,172,185)",
    "Feminism": "rgb(155,211,221)"
}
timeline_df = pd.DataFrame(timeline_info)
fig = px.timeline(
    timeline_df,
    x_start="Start",
    x_end="End",
    y="Dataset",
    color="Community",
    color_discrete_map=colors,
)

In [8]:
# Display the timeline figure held in `fig`.
fig.show()