In [11]:
# PRE-SAGE TOKENIZATION

'''
This code block takes in our CSV data and returns a simple tokenized
list of words for SAGE analysis. Several methods are included for robustness.
'''
    
import operator
import os, math
import string
import requests
import numpy as np
import random

# -------------------------------------------------------------

import pandas as pd

# Path of the comment sample; it is a relative path, so it is resolved
# against the current working directory.
CSV_FILE = "comments_sample.csv"

# Load the sample and copy the two columns used below into plain lists.
# Both lists come from the same frame, so position i in `subreddit` and
# position i in `body` belong to the same row.
df = pd.read_csv(CSV_FILE)
subreddit = df.subreddit.tolist()
body = df.body.tolist()

# create list of hate subreddits for parsing
HATERS = ['The_Donald', '4chan4trump', 'KotakuInAction', 'CringeAnarchy']

# parse list of hate subs
def parse(unfiltered, bodies=None, haters=None):
    """Build the baseline and hate corpora from parallel subreddit/comment lists.

    Args:
        unfiltered: Subreddit name of each comment, in the same order as
            ``bodies``.
        bodies: Comment texts aligned with ``unfiltered``. Defaults to the
            module-level ``body`` list.
        haters: Names of the subreddits treated as hate subreddits. Defaults
            to the module-level ``HATERS`` list.

    Returns:
        A ``(base_corpus, hate_corpus)`` tuple of space-joined strings:
        ``base_corpus`` holds every comment, ``hate_corpus`` only the
        comments whose subreddit is listed in ``haters``.
    """
    # Fall back to the notebook globals so ``parse(subreddit)`` keeps working.
    if bodies is None:
        bodies = body
    if haters is None:
        haters = HATERS

    # Exact-name membership. A substring test against the names would also
    # flag unrelated subreddits such as '' or 'Donald'.
    hate_names = set(haters)

    base_docs = []
    hate_docs = []

    # Walk the two columns in lockstep rather than through a dict keyed by the
    # comment text: a dict would drop repeated comments and let a later row
    # overwrite the label of an identical earlier comment.
    for sub_name, text in zip(unfiltered, bodies):
        text = str(text)  # non-string cells (e.g. NaN) become their str() form
        base_docs.append(text)
        if str(sub_name) in hate_names:
            hate_docs.append(text)

    return ' '.join(base_docs), ' '.join(hate_docs)
        
# define hate and base corpus: parse() returns (base_corpus, hate_corpus),
# each a single space-joined string of comment bodies
base_corpus, hate_corpus = parse(subreddit)

# -------------------------------------------------------------

'''
This is a base tokenization method
'''

def word_tokenize_naive(s):
    """Split ``s`` on runs of whitespace and return the resulting tokens.

    Punctuation is left attached to the neighbouring word; an empty or
    whitespace-only input yields an empty list.
    """
    tokens = s.split()
    return tokens

# -------------------------------------------------------------

'''
More advanced tokenization, with stemming process
'''

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# Module-level stemmer instance; stemmer() below calls ps.stem on each token.
ps = PorterStemmer()

def stemmer(corpus):
    """Return the stem of every item in ``corpus``, preserving order.

    Each item is passed to the module-level ``ps`` stemmer. ``corpus`` is
    iterated item by item, so it should be an iterable of word strings
    (a single long string would be processed character by character).
    """
    return [ps.stem(token) for token in corpus]

# -------------------------------------------------------------

'''
Even more advanced tokenization, with lemmatization process
'''

from nltk.stem import WordNetLemmatizer

# Module-level lemmatizer instance; lemmatizer() below calls lemma.lemmatize
# on each token.
lemma = WordNetLemmatizer()

def lemmatizer(corpus):
    """Return the lemma of every item in ``corpus``, preserving order.

    Each item is passed to the module-level ``lemma`` lemmatizer with its
    default arguments. ``corpus`` is iterated item by item, so it should be
    an iterable of word strings rather than one long string.
    """
    return [lemma.lemmatize(token) for token in corpus]

#hate_corpus = word_tokenize_naive(hate_corpus)
#lemmatizer(hate_corpus)

['Came',
 'to',
 'say',
 '128692176',
 '^Canada',
 'Anonymous',
 '(ID:',
 'treYNZqD)',
 '&gt;&gt;128692072',
 'yea',
 'but',
 'she',
 'died',
 'in',
 'a',
 'snuff',
 'film',
 'shortly',
 'after',
 'these',
 'were',
 'taken',
 '&gt;',
 'That',
 'sound',
 'nothing',
 'like',
 'a',
 'thug.',
 'That',
 'sound',
 'like',
 'a',
 'terrorist.',
 'thug',
 'noun',
 'A',
 'person',
 'who',
 'treat',
 'others',
 'violently',
 'and',
 'roughly,',
 'thug',
 'n.',
 '1.',
 'a',
 'vicious',
 'criminal',
 'or',
 'ruffian.',
 'Sounds',
 'about',
 'right',
 'to',
 'me.',
 'Bought',
 'a',
 'pair',
 'just',
 'a',
 'few',
 'day',
 'ago.',
 'By',
 'coincidence',
 'they',
 'are',
 'exactly',
 'same',
 'model',
 'and',
 'color',
 'a',
 "Baron's.",
 'Meme',
 'magic',
 'is',
 'real.',
 'And',
 "that's",
 'about',
 'all',
 'she',
 'got',
 'right.',
 'Dank',
 'pinochet',
 'meme',
 'But',
 'they',
 "don't",
 'believe',
 'in',
 'logic.',
 'They',
 'believe',
 "it's",
 'a',
 'construction',
 'within',
 'relativism,',
