In [5]:
# PRE-SAGE TOKENIZATION

'''
This code block reads our CSV data and returns a simple tokenized
list of words for SAGE analysis. Several tokenization methods are included for robustness.
'''
    
import operator
import os, math
import string
import requests
import numpy as np
import random

# -------------------------------------------------------------

import pandas as pd

# Path of the CSV file holding the comment sample
CSV_FILE = "comments_sample.csv"

# Load the CSV and pull the two columns used below out as plain Python lists
df = pd.read_csv(CSV_FILE)
subreddit = df["subreddit"].tolist()
body = df["body"].tolist()

# Subreddit names treated as hate subreddits when splitting the corpus
HATERS = [
    'The_Donald',
    '4chan4trump',
    'KotakuInAction',
    'CringeAnarchy',
]

# parse list of hate subs
def parse(unfiltered, bodies=None, hate_subs=None):
    """Build the baseline and hate-subreddit corpora from parallel columns.

    Args:
        unfiltered: Iterable of subreddit names, one per comment, aligned
            index-for-index with the comment bodies.
        bodies: Iterable of comment bodies. Defaults to the module-level
            ``body`` list when omitted.
        hate_subs: Collection of subreddit names to treat as hate
            subreddits. Defaults to the module-level ``HATERS`` list.

    Returns:
        A ``(base_corpus, hate_corpus)`` tuple of strings. ``base_corpus``
        is every comment body joined with single spaces; ``hate_corpus`` is
        the same for comments whose subreddit is exactly one of
        ``hate_subs``.
    """
    if bodies is None:
        bodies = body
    if hate_subs is None:
        hate_subs = HATERS

    # Exact-name membership: a substring test would wrongly flag subreddits
    # such as "The" or "4chan" (or an empty name) as hate subreddits.
    hate_names = set(hate_subs)

    base_corpus = []
    hate_corpus = []

    # Walk the pairs directly rather than through a dict keyed on the body:
    # a dict would collapse repeated comments and keep only the last label,
    # which skews word counts and can mislabel text.
    for sub, text in zip(unfiltered, bodies):
        text = str(text)  # non-string cells (e.g. NaN) are stringified
        base_corpus.append(text)
        if str(sub) in hate_names:
            hate_corpus.append(text)

    return ' '.join(base_corpus), ' '.join(hate_corpus)
        
# Define the base and hate corpora: parse() returns two space-joined strings,
# the first built from all comment bodies and the second from the bodies
# whose subreddit was matched against HATERS
base_corpus, hate_corpus = parse(subreddit)

# -------------------------------------------------------------

'''
This is a base tokenization method
'''

def word_tokenize_naive(s):
    """Split *s* into a list of tokens on runs of whitespace."""
    tokens = s.split()
    return tokens

# -------------------------------------------------------------

'''
More advanced tokenization, with stemming process
'''

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# Module-level Porter stemmer instance; stemmer() below calls ps.stem on each item
ps = PorterStemmer()

#hate_corpus = word_tokenize_naive(hate_corpus)

def stemmer(corpus):
    """Return a new list with ``ps.stem`` applied to each item of *corpus*.

    Items are processed in iteration order; the input is not modified.
    """
    return [ps.stem(token) for token in corpus]

#stemmer(hate_corpus)

['came',
 'to',
 'say',
 '128692176',
 '^canada',
 'anonym',
 '(id:',
 'treynzqd)',
 '&gt;&gt;128692072',
 'yea',
 'but',
 'she',
 'die',
 'in',
 'a',
 'snuff',
 'film',
 'shortli',
 'after',
 'these',
 'were',
 'taken',
 '&gt;',
 'that',
 'sound',
 'noth',
 'like',
 'a',
 'thug.',
 'that',
 'sound',
 'like',
 'a',
 'terrorist.',
 'thug',
 'noun',
 'A',
 'person',
 'who',
 'treat',
 'other',
 'violent',
 'and',
 'roughly,',
 'thug',
 'n.',
 '1.',
 'a',
 'viciou',
 'crimin',
 'or',
 'ruffian.',
 'sound',
 'about',
 'right',
 'to',
 'me.',
 'bought',
 'a',
 'pair',
 'just',
 'a',
 'few',
 'day',
 'ago.',
 'By',
 'coincid',
 'they',
 'are',
 'exactli',
 'same',
 'model',
 'and',
 'color',
 'as',
 "baron's.",
 'meme',
 'magic',
 'is',
 'real.',
 'and',
 "that'",
 'about',
 'all',
 'she',
 'got',
 'right.',
 'dank',
 'pinochet',
 'meme',
 'but',
 'they',
 "don't",
 'believ',
 'in',
 'logic.',
 'they',
 'believ',
 "it'",
 'a',
 'construct',
 'within',
 'relativism,',
 'rather',
 'than',
 'ex