In [20]:
import operator
import csv
import os, math, sys
import string
import re
import ast
import numpy as np
import pandas as pd
import random
import time
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.nn import functional as F
from torchtext import data
from torchtext import datasets
from torchtext import vocab
from itertools import product
from multiprocessing import Pool, cpu_count
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.dicts.emoticons import emoticons

# Import model and model helper functions
sys.path.append("..")
import src.fasttext as ft
import src.fasttext_utils as ftu
from src.vaderSentiment import SentimentIntensityAnalyzer

# Root folder for the saved model, the split chunks and the analysis output
data_dir = '../data'

# Use the first CUDA device when one is available, otherwise fall back to CPU
_device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

In [21]:
# Load the saved model and the pickled TEXT object (its vocab.stoi mapping is
# used by the prediction helpers to turn tokens into indices).
# map_location places the weights on `device`, the same device the prediction
# helpers move their input tensors to, so a checkpoint saved on another device
# cannot cause a CPU/GPU mismatch at inference time.
# NOTE: torch.load and pickle.load both unpickle arbitrary objects and can
# execute code while doing so - only point them at trusted files.
model = torch.load(
    os.path.join(data_dir, 'model/NN_fasttext_model.pt'),
    map_location=device,
)
# Switch to evaluation mode before running predictions
model.eval()
# `field_file` rather than `input`, so the builtin is not shadowed
with open(os.path.join(data_dir, 'model/NN_fasttext_data.pkl'), 'rb') as field_file:
    TEXT = pickle.load(field_file)

In [22]:
# Terms that the preprocessor normalizes
_normalized_terms = ['url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'date', 'number']

# Keyword options handed to ekphrasis' TextPreProcessor
_preprocessor_options = dict(
    normalize=_normalized_terms,
    fix_html=True,               # fix HTML tokens
    segmenter="english",         # corpus for word segmentation
    corrector="english",         # corpus for spell correction
    unpack_hashtags=True,        # perform word segmentation on hashtags
    unpack_contractions=True,    # unpack contractions
    spell_correct_elong=False,   # no spell correction for elongated words
    tokenizer=ftu.reg_tokenize,  # project tokenizer from src.fasttext_utils
    dicts=[emoticons],           # emoticon dictionary used for replacements
)

ekphrasis_processor = TextPreProcessor(**_preprocessor_options)


def _predict_from_tokens(tokenized):
    """Classify an already tokenized comment with the loaded model.

    Args:
        tokenized: sequence of tokens; each one is looked up in
            ``TEXT.vocab.stoi``.

    Returns:
        float: the rounded sigmoid of the model output, or ``0.0`` when
        ``tokenized`` is empty (the model is not called in that case).
    """
    if len(tokenized) == 0:
        return 0.0
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # unsqueeze(1) turns the 1-D index vector into a (len(tokenized), 1) tensor
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)
    # Inference only: skip building the autograd graph
    with torch.no_grad():
        prediction = torch.round(torch.sigmoid(model(tensor)))
    return prediction.item()


def predict_from_preprocessed(sentence):
    """Classify a comment stored as the string form of a Python token list.

    Args:
        sentence: text such as ``"['a', 'b']"`` that is parsed with
            ``ast.literal_eval``. ``None`` and float NaN (what pandas yields
            for a missing cell) are treated like an empty token list.

    Returns:
        float: ``0.0`` for missing or empty input, otherwise the rounded
        sigmoid of the model output.

    Raises:
        ValueError, SyntaxError: propagated from ``ast.literal_eval`` when
            the text is not a valid Python literal.
    """
    # Missing values cannot be parsed; handle them like an empty comment
    if sentence is None or (isinstance(sentence, float) and math.isnan(sentence)):
        return 0.0
    return _predict_from_tokens(ast.literal_eval(sentence))


def predict_from_sentence(sentence):
    """Classify a raw sentence.

    The text is run through ``ekphrasis_processor.pre_process_doc``, the
    resulting tokens are lower-cased and passed to ``ftu.generate_bigrams``
    before being scored by the model.

    Args:
        sentence: raw text to classify.

    Returns:
        float: ``0.0`` when preprocessing yields no tokens, otherwise the
        rounded sigmoid of the model output.
    """
    lowered = [tok.lower() for tok in ekphrasis_processor.pre_process_doc(sentence)]
    return _predict_from_tokens(ftu.generate_bigrams(lowered))


Reading english - 1grams ...
Reading english - 2grams ...
Reading english - 1grams ...


In [23]:
# Sample input used to smoke-test the classifier on a raw sentence
sent = """I hate jews and hilary clinton"""

# The classifier returns 1.0 for the positive (hate speech) class
is_hate_speech = predict_from_sentence(sent) == 1.0
print("This is hate speech!" if is_hate_speech else "This is not hate speech!")

This is hate speech!


In [52]:
def concat_chunks(filelist, sentiment_threshold=-0.05):
    """Merge CSV chunks, classify every comment and write the result to disk.

    The chunks are read without a header row, concatenated, and each ``body``
    is scored with ``predict_from_preprocessed``. A row is flagged in
    ``is_hate`` when the classifier returned ``1.0`` and its ``sentiment`` is
    below ``sentiment_threshold``. The ``body`` and ``id`` columns are dropped
    before the frame is saved as
    ``<data_dir>/analysis/concat_applied_data_<tag>.csv``.

    Args:
        filelist: non-empty list of chunk paths. Characters ``[-11:-4]`` of
            the first path are used as the tag in log messages and in the
            output file name (e.g. ``2019_02``).
        sentiment_threshold: rows need a sentiment strictly below this value
            to be flagged. Defaults to ``-0.05``.

    Returns:
        None. The result is written to disk.
    """
    file_date = filelist[0][-11:-4]
    column_names = ["id", "date", "author", "subreddit", "body", "sentiment"]
    column_types = {"id": str, "date": str, "author": str, "subreddit": str, "body": str, "sentiment": float}
    df = pd.concat(
        pd.read_csv(file, names=column_names, dtype=column_types)
        for file in filelist
    )
    print("Finished concatenating", file_date)
    df["classification"] = df.body.map(predict_from_preprocessed)
    print("Finished classifying", file_date)
    # Combine both boolean masks first and only then cast: `.astype(int)` binds
    # tighter than `&`, so without the outer parentheses only the sentiment
    # mask was converted and the column was not the intended 0/1 integer.
    flagged = (df.classification == 1.0) & (df.sentiment < sentiment_threshold)
    df["is_hate"] = flagged.astype(int)
    df = df.drop(columns=["body", "id"])
    df.to_csv(
        os.path.join(data_dir, "analysis/concat_applied_data_" + file_date + ".csv"),
        quoting=csv.QUOTE_NONNUMERIC,
        header=True, index=False
    )


In [53]:
# Names of every chunk file currently present in the split folder
completed_jobs = os.listdir(os.path.join(data_dir, 'split/'))

# One list of chunk paths per year/month period, in chronological order.
# Periods for which no chunk file exists are left out.
batch = []
for year, month in product(range(2015, 2020), range(1, 13)):
    period = '{}_{:02d}'.format(year, month)
    chunk_paths = [
        os.path.join(data_dir, 'split/' + name)
        for name in completed_jobs
        if name[-11:-4] == period
    ]
    if chunk_paths:
        batch.append(chunk_paths)

In [54]:
# Process only the last entry of `batch`, i.e. the latest year/month period
# for which chunk files were found in the split folder.
concat_chunks(batch[-1])

Finished concatenating 2019_02
Finished classifying 2019_02
