In [1]:
# PRE-SAGE TOKENIZATION

'''
This code block takes in our csv data and returns a simple tokenized
list of words for SAGE analysis. Several methods are included for robustness
'''
    
import operator
import os, math
import string
import requests
import numpy as np
import random

# -------------------------------------------------------------

import pandas as pd

CSV_FILE = "comments_sample.csv"

df = pd.read_csv(CSV_FILE)
subreddit = df.subreddit.tolist()
body = df.body.tolist()

# create list of hate subreddits for parsing
HATERS = ['The_Donald', '4chan4trump', 'KotakuInAction', 'CringeAnarchy']

# parse list of hate subs
def parse(unfiltered):
    
    hate_sub = []
    hate_corpus = []
    base_corpus = []
    
    for row in unfiltered:
        key = str(row)
        if any(key in h for h in HATERS):
            hate_sub.extend([1])
        else:
            hate_sub.extend([0])
    
    final = dict(zip(body, hate_sub))
    
    for key, value in final.items():
        key = str(key)
        base_corpus.extend([key])
        if value == 1:
            hate_corpus.extend([key])
            
    base_corpus = ' '.join(base_corpus)
    hate_corpus = ' '.join(hate_corpus)
    
    return base_corpus, hate_corpus
        
# define hate and base corpus
base_corpus, hate_corpus = parse(subreddit)

# -------------------------------------------------------------

'''
This is a base tokenization method
'''

def word_tokenize_naive(s):
    return s.split()

hate_tkn = word_tokenize_naive(hate_corpus)
base_tkn = word_tokenize_naive(base_corpus)

# -------------------------------------------------------------

'''
More more advanced tokenization, with lemmatization process
'''

from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()

def lemmatizer(corpus):
    final = []
    for w in corpus:
        this = lemma.lemmatize(w)
        final.extend([this])
    return final

hate = lemmatizer(hate_tkn)
base = lemmatizer(base_tkn)

In [10]:
#SAGE IMPLEMENTATION FOR BASELINE

'''
This code block takes in our tokenized data and preps for SAGE scoring
 
Citation:
 
Eisenstein, Jacob, Amr Ahmed, and Eric P. Xing. "Sparse Additive Generative Models of Text."
Proceedings of the 28th International Conference on Machine Learning (ICML-11). 2011.
'''

# -------------------------------------------------------------

from collections import Counter
import numpy as np
    
# counts for hate subreddit
hate_counts = Counter(hate)

# counts for all subreddits in corpus
base_counts = Counter(base)

# -------------------------------------------------------------

# build vocab of most common terms
vocab = [word for word,count in hate_counts.most_common(10000)]

# convert into numpy arrays
x_hate = np.array([hate_counts[word] for word in vocab])
x_base = np.array([base_counts[word] for word in vocab]) + 1.

# Compute the base log-probabilities of each word
mu = np.log(x_base) - np.log(x_base.sum())


[ -3.12785436  -2.91486607  -3.24386842 ... -11.47922906  -9.46432604
  -9.97515167]


In [11]:
'''
This code block determines list of k most common words in hate subreddits
 
Citation:
 
Eisenstein, Jacob, Amr Ahmed, and Eric P. Xing. "Sparse Additive Generative Models of Text."
Proceedings of the 28th International Conference on Machine Learning (ICML-11). 2011.
'''

import sage

# Run SAGE
eta = sage.estimate(x_hate, mu)

# -------------------------------------------------------------

# Print words especially frequent in hate subreddits compared to the baseline
hate_words = sage.topK(eta, vocab)
print(hate_words)

SyntaxError: invalid syntax (<ipython-input-11-d697d1ceed18>, line 18)