In [1]:
# PRE-RNN TOKENIZATION

'''
This code block takes in our csv data and returns a simple tokenized
tensor for use in an RNN model
'''
    
import operator
import os, math
import string
import requests
import numpy as np
import random
import torch
import torch.utils.data as tud
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import Counter, defaultdict
import copy

# -------------------------------------------------------------

# Seed every random number generator used in this notebook so that
# experiments can be replicated exactly.
seed = 30255
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(seed)
del _seed_rng  # keep the notebook namespace free of the loop helper

# Also seed the CUDA generator, but only when a GPU is available.
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

# Global class labels
HATE_LABEL = "hate"
BASE_LABEL = "nonhate"

In [27]:
import pandas as pd

# Sampled comments export; the path is relative to the working directory.
CSV_FILE = "comments_sample.csv"

# Raw comments table; stringer() reads its `body` and `subreddit` columns.
df = pd.read_csv(CSV_FILE)

# Subreddits whose comments are labelled as hate when parsing.
HATERS = [
    'The_Donald',
    '4chan4trump',
    'KotakuInAction',
    'CringeAnarchy',
]

def stringer(unfiltered):
    """Pair every comment body with the subreddit it was posted in.

    Parameters
    ----------
    unfiltered : object exposing ``subreddit`` and ``body`` attributes, each
        with a ``tolist()`` method (in this notebook, the comments DataFrame).

    Returns
    -------
    list of ``[body, subreddit]`` lists, in row order. Every body is passed
    through ``str`` so that non-string values can be tokenized later.
    """
    subreddits = unfiltered.subreddit.tolist()
    bodies = unfiltered.body.tolist()

    return [[str(text), sub] for text, sub in zip(bodies, subreddits)]

def findHaters(unfiltered):
    """Tokenize each comment body and label it by the subreddit it came from.

    Parameters
    ----------
    unfiltered : iterable of ``[body, subreddit]`` pairs, as produced by
        ``stringer``. ``body`` must be a string; ``subreddit`` is converted
        with ``str`` before it is compared.

    Returns
    -------
    list of ``[tokens, label]`` lists, where ``tokens`` is the body split on
    whitespace and ``label`` is ``HATE_LABEL`` when the subreddit name is one
    of ``HATERS`` and ``BASE_LABEL`` otherwise.
    """
    # Compare whole names. A substring test (`name in hate_name`) would also
    # flag unrelated subreddits such as "Anarchy", "4chan" or an empty name.
    hate_subreddits = set(HATERS)

    final = []
    for item in unfiltered:
        label = HATE_LABEL if str(item[1]) in hate_subreddits else BASE_LABEL
        final.append([item[0].split(), label])

    return final
        
# Build [body, subreddit] pairs from the raw comments table.
f = stringer(df)
# Tokenize and label the pairs; `g` holds [tokens, label] lists and is read
# again by the lemmatization cell further down.
g = findHaters(f)



In [40]:
'''
More advanced tokenization, with a lemmatization step
'''

import nltk

from nltk.stem import WordNetLemmatizer

# Shared lemmatizer instance used by lemmatizer() below.
# NOTE(review): WordNetLemmatizer presumably needs the NLTK "wordnet" corpus to be
# downloaded beforehand (nltk.download("wordnet")) — confirm on a fresh environment.
lemma = WordNetLemmatizer()

def lemmatizer(corpus):
    """Lemmatize every token of a labelled corpus.

    Parameters
    ----------
    corpus : list of ``[tokens, label]`` entries; ``tokens`` is iterated
        element by element and each element is passed to the module-level
        ``lemma.lemmatize``.

    Returns
    -------
    list of ``[lemmatized_tokens, label]`` lists in the same order as
    ``corpus``; labels are passed through untouched.
    """
    lemmatized_docs = [
        [lemma.lemmatize(token) for token in entry[0]]
        for entry in corpus
    ]
    labels = [entry[1] for entry in corpus]

    return [[tokens, label] for tokens, label in zip(lemmatized_docs, labels)]

# Lemmatize the tokenized, labelled comments produced by findHaters().
fin = lemmatizer(g)
# Show only a short preview; printing the whole corpus floods the cell output.
print(fin[:5])

