## Data processing

Loads raw comment data from CSV files, filters it to a specific time period,
and builds a cleaned, tokenized dataset for that period.

In [71]:
from typing import List
import pandas as pd
from gensim.models import Phrases
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn
import re

def preprocess(filename=None, start_date=None, end_date=None):
    """Load a comments CSV, filter it, and add a tokenized ``body_clean`` column.

    Steps:
      1. Read the CSV indexed by ``id``, parsing ``timestamp`` as datetimes.
      2. Keep rows with ``start_date <= timestamp < end_date`` (each bound is
         optional).
      3. Discard rows whose body is exactly ``"[deleted]"``.
      4. Strip URLs from each body, then tokenize it with gensim's
         ``simple_preprocess``.
      5. Fit a ``Phrases`` model (NPMI scoring, threshold 0.7) on the filtered
         comments and use it to join common bigrams into single tokens.
      6. POS-tag the tokens and lemmatize each one according to its tag.

    Args:
        filename: Path or file-like object accepted by ``pandas.read_csv``.
            The file must provide the columns ``id``, ``body``, ``author``,
            ``parent_id``, ``retrieved_on`` and ``timestamp``.
        start_date: Inclusive lower bound on ``timestamp``; a falsy value
            disables the bound.
        end_date: Exclusive upper bound on ``timestamp``; a falsy value
            disables the bound.

    Returns:
        A DataFrame with the loaded columns plus ``body_clean``, which holds
        one list of lemmatized tokens per comment.

    Raises:
        ValueError: If ``filename`` is None.
    """
    if filename is None:
        raise ValueError("preprocess() requires a filename (path or file-like object)")

    # get data from csv
    df = pd.read_csv(
        filename,
        index_col="id",
        usecols=["id", "body", "author", "parent_id", "retrieved_on", "timestamp"],
        parse_dates=["timestamp"],
    )

    # filter by date
    if start_date:
        df = df[df["timestamp"] >= start_date]
    if end_date:
        df = df[df["timestamp"] < end_date]

    # remove rows where comment was deleted; a boolean mask only removes the
    # matching rows, even if the id index happens to contain duplicates
    df = df[df["body"] != "[deleted]"]

    # Match only the URL itself (up to the next whitespace). A greedy `.*`
    # would also throw away every word that follows a link on the same line.
    url_pattern = re.compile(r"https?://\S+")

    # remove links, convert to lowercase, tokenize, and remove short/long tokens
    def remove_links_and_simple_preprocess(sentence: str) -> List[str]:
        return simple_preprocess(url_pattern.sub("", sentence))

    # Missing bodies become "" (no tokens) instead of the literal string "nan".
    simple_preprocessed = (
        df["body"].fillna("").astype(str).apply(remove_links_and_simple_preprocess)
    )

    # function to lemmatize each token, based on its part of speech
    lemmatizer = WordNetLemmatizer()
    # first two characters of the POS tag -> WordNet part of speech
    morphy_tag = {"NN": wn.NOUN, "JJ": wn.ADJ, "VB": wn.VERB, "RB": wn.ADV}

    def lemmatize_token(t: str, pos: str) -> str:
        wordnet_pos = morphy_tag.get(pos[:2])
        if wordnet_pos is None:
            # tag has no WordNet equivalent: use the lemmatizer's default
            return lemmatizer.lemmatize(t)
        return lemmatizer.lemmatize(t, wordnet_pos)

    # tokenize comments, preserving common bigram phrases
    # identify common bigram phrases
    phrases = Phrases(simple_preprocessed, scoring="npmi", threshold=0.7)

    def preprocess_sentence(preprocessed_sentence: List[str]) -> List[str]:
        # combine tokens that make up a phrase and drop associated score
        simple_tokens = [t[0] for t in phrases.analyze_sentence(preprocessed_sentence)]
        # lemmatize tokens
        return [lemmatize_token(t, pos) for t, pos in pos_tag(simple_tokens)]

    # create body_clean column: a preprocessed version of body.
    # assign() returns a new frame, so no column is written into a filtered slice.
    return df.assign(body_clean=simple_preprocessed.apply(preprocess_sentence))

In [72]:
# Keyword arguments for preprocess(), keyed by dataset name: the comments CSV
# to load and the earliest timestamp (inclusive) to keep.
datasets = {
    "incels": {"filename": "data/Incels_comments.csv", "start_date": "2015-11-07"},
    "braincels": {"filename": "data/Braincels_comments.csv", "start_date": "2017-09-30"},
    "trufemcels": {"filename": "data/Trufemcels_comments.csv", "start_date": "2019-01-30"},
    "mensrights": {"filename": "data/MensRights_comments.csv", "start_date": "2021-01-30"},
    "feminism": {"filename": "data/Feminism_comments.csv", "start_date": "2015-11-07"},
    "fourthwavewomen": {"filename": "data/fourthwavewomen_comments.csv", "start_date": "2021-01-30"},
}

In [73]:
# Smoke test: run the full pipeline on the "feminism" dataset, then keep only
# the first five rows for inspection.
df = preprocess(**datasets["feminism"])
df = df.head()
df

  df = pd.read_csv(filename, index_col="id", usecols=["id", "body", "author", "parent_id", "retrieved_on", "timestamp"], parse_dates=['timestamp'])


Unnamed: 0_level_0,body,author,parent_id,retrieved_on,timestamp,body_clean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
cwrahh1,There's an [article by Wendy Faulkner](http://...,nicksenada,t3_3rq92m,1449646465,2015-11-07 00:14:56,"[there, an, article, by, wendy, faulkner, unfo..."
cwralnk,a lot of people might think it's graphic to ta...,coolfurrcats,t3_3rtshu,1449646520,2015-11-07 00:17:04,"[lot, of, people, might, think, it, graphic, t..."
cwraz6x,people often hate on feminists for taking offe...,coolfurrcats,t3_3rt3qq,1449646698,2015-11-07 00:29:52,"[people, often, hate, on, feminist, for, take,..."
cwrb08a,I've read before that these farms are also res...,nicksenada,t3_3rrlcj,1449646711,2015-11-07 00:29:52,"[ve, read, before, that, these, farm, be, also..."
cwrb4vq,[this one?](http://www.nrcs.usda.gov/Internet/...,nicksenada,t3_3rtye2,1449646772,2015-11-07 00:34:08,"[this, one]"


In [74]:
# Write the five-row sample to a scratch pickle file in the working directory.
df.to_pickle("./pickle_test.pkl")

In [75]:
# Load the pickle written by the previous cell and display it to check the
# round trip. Unpickling can execute arbitrary code, so only read pickle files
# that were produced locally by a trusted process.
new_df = pd.read_pickle("./pickle_test.pkl")
new_df.head()

Unnamed: 0_level_0,body,author,parent_id,retrieved_on,timestamp,body_clean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
cwrahh1,There's an [article by Wendy Faulkner](http://...,nicksenada,t3_3rq92m,1449646465,2015-11-07 00:14:56,"[there, an, article, by, wendy, faulkner, unfo..."
cwralnk,a lot of people might think it's graphic to ta...,coolfurrcats,t3_3rtshu,1449646520,2015-11-07 00:17:04,"[lot, of, people, might, think, it, graphic, t..."
cwraz6x,people often hate on feminists for taking offe...,coolfurrcats,t3_3rt3qq,1449646698,2015-11-07 00:29:52,"[people, often, hate, on, feminist, for, take,..."
cwrb08a,I've read before that these farms are also res...,nicksenada,t3_3rrlcj,1449646711,2015-11-07 00:29:52,"[ve, read, before, that, these, farm, be, also..."
cwrb4vq,[this one?](http://www.nrcs.usda.gov/Internet/...,nicksenada,t3_3rtye2,1449646772,2015-11-07 00:34:08,"[this, one]"
