In [2]:
# PRE-RNN TOKENIZATION

'''
This code block takes in our csv data and returns a simple tokenized
tensor for use in an RNN model
'''
    
import operator
import os, math
import string
import requests
import numpy as np
import random
import torch
import torch.utils.data as tud
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import Counter, defaultdict
import copy

# -------------------------------------------------------------

# Pin every random number generator used in this notebook to one seed so that
# repeated runs produce identical results.
seed = 30255
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(seed)
del _seed_fn  # loop helper only; keep the notebook namespace clean

# Seed the CUDA generator as well, but only when a GPU is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

# Class labels shared by every cell that tags comments
HATE_LABEL, BASE_LABEL = "hate", "nonhate"

In [6]:
import pandas as pd

# Path of the comment sample, relative to the notebook's working directory
CSV_FILE = "comments_sample.csv"

# Raw comments; later cells read the `body` and `subreddit` columns
df = pd.read_csv(CSV_FILE)

# Subreddits whose comments are labelled as hate when parsing
HATERS = [
    'The_Donald',
    '4chan4trump',
    'KotakuInAction',
    'CringeAnarchy',
]

# takes in csv and spits out strings for the list
def stringer(unfiltered):
    """Turn the comment frame into a list of ``[body, subreddit]`` pairs.

    Parameters
    ----------
    unfiltered : object exposing ``subreddit`` and ``body`` attributes that
        each support ``.tolist()`` (the DataFrame loaded from the csv).

    Returns
    -------
    list of two-element lists ``[str(body), subreddit]``, in row order.
    Every body is passed through ``str`` so non-string cells become text.
    """
    subs = unfiltered.subreddit.tolist()
    texts = unfiltered.body.tolist()

    pairs = []
    for text, sub in zip(texts, subs):
        pairs.append([str(text), sub])

    return pairs

# takes in the [body, subreddit] pairs from stringer and spits out classified bodies
def findHaters(unfiltered, haters=None, hate_label=None, base_label=None):
    """Tokenize comment bodies and label each one by its subreddit.

    Parameters
    ----------
    unfiltered : iterable of ``[body, subreddit]`` pairs, where ``body`` is a
        string (the output of ``stringer``).
    haters : optional iterable of subreddit names to treat as hate sources.
        Defaults to the notebook-level ``HATERS`` list.
    hate_label, base_label : optional labels for hate / non-hate comments.
        Default to the notebook-level ``HATE_LABEL`` / ``BASE_LABEL``.

    Returns
    -------
    (tot_final, hate_final)
        ``tot_final`` holds ``[tokens, label]`` for every comment, in input
        order; ``hate_final`` holds the same pairs for hate comments only.
        ``tokens`` is ``body.split()`` (whitespace tokenization).
    """
    # Fall back to the notebook-level configuration when not overridden.
    hate_subs = set(HATERS if haters is None else haters)
    hate_tag = HATE_LABEL if hate_label is None else hate_label
    base_tag = BASE_LABEL if base_label is None else base_label

    tot_final = []
    hate_final = []

    for body, key in unfiltered:
        tokens = body.split()
        # Exact match on the subreddit name. The previous substring test
        # (`keys in h`) also flagged any subreddit whose name happened to be
        # contained in a hate subreddit's name, e.g. "4chan" or "a".
        if str(key) in hate_subs:
            tot_final.append([tokens, hate_tag])
            # Separate token list so the two outputs never alias each other.
            hate_final.append([list(tokens), hate_tag])
        else:
            tot_final.append([tokens, base_tag])

    return tot_final, hate_final
        
# f: [body, subreddit] pairs built from the loaded csv frame
f = stringer(df)
# g: every comment as [tokens, label]; h: the hate-labelled comments only
g,h = findHaters(f)

In [8]:
'''
More advanced tokenization, with a lemmatization step
'''

import nltk

from nltk.stem import WordNetLemmatizer

# Single lemmatizer instance reused for every token by lemmatizer()
# NOTE(review): presumably needs the NLTK WordNet corpus available locally -- confirm
lemma = WordNetLemmatizer()

def lemmatizer(corpus):
    """Lemmatize every token of a labelled corpus.

    Parameters
    ----------
    corpus : sequence of entries whose first element is a list of tokens and
        whose second element is the class label.

    Returns
    -------
    list of ``[lemmatized_tokens, label]`` pairs in the same order as
    ``corpus``. Each token is passed through ``lemma.lemmatize`` with its
    default arguments.
    """
    token_lists = [entry[0] for entry in corpus]
    labels = [entry[1] for entry in corpus]

    lemmatized = [
        [lemma.lemmatize(token) for token in tokens]
        for tokens in token_lists
    ]

    return [[tokens, label] for tokens, label in zip(lemmatized, labels)]

# Lemmatized version of the full corpus (g holds every comment)
fin_base = lemmatizer(g)
# Lemmatized version of the hate-labelled comments only (h)
fin_hate = lemmatizer(h)

In [None]:
# SAGE IMPLEMENTATION FOR BASELINE

'''
This code block takes in our tokenized data and returns a dictionary pairing
words from the tokenized data and their SAGE score for further use
 
Citation:
 
Eisenstein, Jacob, Amr Ahmed, and Eric P. Xing. "Sparse Additive Generative Models of Text."
Proceedings of the 28th International Conference on Machine Learning (ICML-11). 2011.
'''

import sage
from collections import Counter

# define counter helper function
def getCountDict(filename):
    """Read a whitespace-separated ``word count`` file into a dict.

    Parameters
    ----------
    filename : path of a text file with one ``word count`` pair per line.

    Returns
    -------
    dict mapping each word to its integer count. If a word appears on several
    lines, the last line wins.

    Raises
    ------
    ValueError
        If a non-blank line does not hold exactly two fields, or the count is
        not an integer.
    """
    counts = {}
    with open(filename) as fin:
        # Stream the file line by line instead of loading it with readlines().
        for line in fin:
            fields = line.split()
            # Skip blank / whitespace-only lines (e.g. a trailing newline),
            # which previously crashed the two-name unpacking.
            if not fields:
                continue
            word, count = fields
            counts[word] = int(count)
    return counts
    
# fin_hate / fin_base are in-memory corpora of [tokens, label] pairs produced by
# lemmatizer(), not paths to count files, so tally their tokens directly
# (getCountDict opens its argument as a file and cannot take a list).

# counts for hate subreddit
hate_counts = Counter(token for tokens, _label in fin_hate for token in tokens)

# counts for all subreddits in corpus
base_counts = Counter(token for tokens, _label in fin_base for token in tokens)

# -------------------------------------------------------------

# Vocabulary: the 5000 most frequent words in the hate corpus
vocab = [word for word, count in hate_counts.most_common(5000)]

x_hate = np.array([hate_counts[word] for word in vocab])
# Add-one smoothing keeps log() finite; a Counter yields 0 (not KeyError)
# for a vocabulary word that is missing from the baseline.
x_base = np.array([base_counts[word] for word in vocab]) + 1.

# Compute the base log-probabilities of each word
mu = np.log(x_base) - np.log(x_base.sum())

# Run SAGE
eta = sage.estimate(x_hate,mu)

# -------------------------------------------------------------

# Print words especially frequent in subreddit compared to the baseline
# (call form of print: valid on Python 3, same output on Python 2)
print(sage.topK(eta,vocab))

In [None]:
'''
This code block subsets our hate sample based on a list of hate words from SAGE
 
'''

# assumes list from SAGE of top 10000
# assumes lemmatized data in [[["this", "format"], 'hate'], [["right", "here"], 'nonhate']]
# NOTE(review): neither hate_list nor hate_lemma is assigned in the visible cells,
# so on a fresh kernel these two lines raise NameError unless another cell defines them.
# Presumably hate_lemma corresponds to fin_hate (the lemmatized hate corpus) and
# hate_list to the word list returned by sage.topK -- TODO confirm and wire up.
SAGE = hate_list
HATE = hate_lemma

# ----------------------------------------------------------

def subsetter(hate_lemma, hate_list):
    """Keep the corpus entries that contain at least one listed word.

    Parameters
    ----------
    hate_lemma : sequence of ``[tokens, label]`` entries, where ``tokens`` is
        a list of (lemmatized) word strings.
    hate_list : iterable of words to look for.

    Returns
    -------
    list of the original ``[tokens, label]`` entries, in input order, whose
    tokens include at least one word from ``hate_list``. Each input entry
    appears at most once, however many of its tokens match.
    """
    # Set lookup makes each membership test O(1) instead of scanning the list.
    hate_words = set(hate_list)

    # The earlier version unpacked `body, key` from *inside* a single entry
    # and used extend(), which flattened entries into the result; test each
    # entry's token list and keep the entry itself intact instead.
    return [
        entry
        for entry in hate_lemma
        if any(token in hate_words for token in entry[0])
    ]
            
# subsetter needs both the corpus and the word list; the word list (SAGE)
# was previously omitted, which raises TypeError at call time.
hate_subset = subsetter(HATE, SAGE)