In [2]:
import os
import csv
import sage
import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

# Directory that holds every CSV this notebook reads or writes
data_path = "/home/dfsnow/rbans/data/"

# Hook tqdm progress bars into pandas
# NOTE(review): newer tqdm releases may ignore the positional class argument;
# confirm this call form against the installed tqdm version
tqdm.pandas(tqdm_notebook)

In [3]:
# Configure the ekphrasis text cleaner that is shared by the rest of the notebook
# NOTE(review): no tokenizer is passed here; confirm whether ekphrasis still applies
# `dicts` and what type pre_process_doc returns in that configuration
ekphrasis_processor = TextPreProcessor(
    # Term types that ekphrasis normalizes
    normalize=[
        'url',
        'email',
        'percent',
        'money',
        'phone',
        'user',
        'time',
        'date',
        'number',
    ],
    dicts=[emoticons],  # emoticon lookup used to replace emojis with words
    fix_html=True,  # repair HTML tokens
    unpack_hashtags=True,  # word-segment hashtags
    unpack_contractions=True,  # expand contractions
    spell_correct_elong=False,  # leave elongated words uncorrected
    segmenter="english",  # corpus used for word segmentation
    corrector="english",  # corpus used for spell correction
)

def flatten(l):
    """Collapse one level of nesting: return a list with every item of every sublist, in order."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

Reading english - 1grams ...
Reading english - 2grams ...
Reading english - 1grams ...


## Shuffling

This section is dedicated to ensuring that the sample drawn from Postgres is sufficiently shuffled.

In [None]:
# Read the main training data sample into memory (it has not been shuffled yet)
sample_csv = os.path.join(data_path, "main_data_sample.csv")
train = pd.read_csv(sample_csv)

In [None]:
# Shuffle all the training data to ensure a random distribution.
# The fixed seed makes the resulting row order reproducible across kernel restarts
# (2 matches the random_state used for the train/test split in this notebook).
train = shuffle(train, random_state=2)

In [None]:
# Persist the shuffled frame to disk, quoting every non-numeric field
shuffled_csv = os.path.join(data_path, "main_data_shuffled.csv")
train.to_csv(shuffled_csv, quoting=csv.QUOTE_NONNUMERIC)

## Preprocessing

This section is dedicated to preprocessing all of the data using ekphrasis. Doing this in chunks is more efficient, since ekphrasis takes quite a while to run. After processing all chunks, we concatenate them back together on the command line and load the cleaned data from here on.

In [None]:
# Work out where to resume: chunks already written to split/ are skipped
chunksize = 100000
shuffled_data = os.path.join(data_path, "main_data_shuffled.csv")
split_dir = os.path.join(data_path, "split")
os.makedirs(split_dir, exist_ok=True)  # on a first run the directory may not exist yet

# Only files named <number>_preprocessed_chunk.csv count as finished chunks, so a
# stray file in the directory cannot break the int() parse
chunk_suffix = "_preprocessed_chunk.csv"
finished_chunks = [
    int(name[:-len(chunk_suffix)])
    for name in os.listdir(split_dir)
    if name.endswith(chunk_suffix) and name[:-len(chunk_suffix)].isdigit()
]

# default=-1 lets a fresh run start at chunk 0 instead of raising on an empty list
start_iter = max(finished_chunks, default=-1) + 1
start_row = start_iter * chunksize  # data rows covered by the finished chunks

# The shuffled CSV begins with a header line. Because explicit `names` are supplied,
# that line must be skipped too (+1) or it would be parsed as a data row.
# NOTE(review): resuming assumes every existing chunk holds exactly `chunksize` data
# rows counted after the header; chunks written with a different offset will not line up.
reader = pd.read_csv(
    shuffled_data, index_col=0, header=None, skiprows=start_row + 1, chunksize=chunksize,
    names=["index", "id", "score", "body", "label"]
)

# Clean each chunk's body text with ekphrasis, then write the chunk to its own
# numbered CSV; numbering continues from start_iter
for chunk_id, chunk in enumerate(reader, start=start_iter):
    chunk["body"] = chunk["body"].map(ekphrasis_processor.pre_process_doc)
    chunk_file = "split/" + "{:04d}".format(chunk_id) + "_preprocessed_chunk.csv"
    chunk.to_csv(
        os.path.join(data_path, chunk_file),
        header=False,
        index=False,
        quoting=csv.QUOTE_NONNUMERIC,
    )

In [None]:
# Build the tokenizer in this cell so it does not depend on a name left over in the kernel
# NOTE(review): lowercase=True merges case variants into one token; confirm that is wanted
ekphrasis_tokenizer = SocialTokenizer(lowercase=True).tokenize

# The 500 most common tokens over rows 1-499 of the training data
# NOTE(review): the slice starts at 1, so row 0 is left out; confirm that is intended
vocab = Counter(flatten([ekphrasis_tokenizer(body) for body in train[1:500].body])).most_common(500)

In [None]:
# Take a random sample of up to 10 mil non-hate and 1 mil hate posts for SAGE
base_pool = train.loc[train.label == 0].body
hate_pool = train.loc[train.label == 1].body

# Capping n at the pool size avoids the ValueError that sample() raises when asked
# for more rows than exist; the fixed seed makes the draw reproducible
base_posts = base_pool.sample(n=min(10000000, len(base_pool)), random_state=2)
hate_posts = hate_pool.sample(n=min(1000000, len(hate_pool)), random_state=2)

In [None]:
# Get word counts for the hate and non-hate samples, cleaning with ekphrasis
# NOTE(review): lowercase=True merges case variants into one count; confirm that is wanted
_sage_tokenize = SocialTokenizer(lowercase=True).tokenize


def count_tokens(posts):
    """Clean and tokenize every post, returning a plain dict of token -> occurrence count.

    posts: iterable of raw post bodies (strings).
    """
    counts = Counter()
    for body in posts:
        cleaned = ekphrasis_processor.pre_process_doc(body)
        # Without a tokenizer the processor hands back a string; iterating it directly
        # would count single characters, so split it into tokens first
        tokens = cleaned if isinstance(cleaned, list) else _sage_tokenize(cleaned)
        # Updating per post avoids materialising one giant flattened token list
        counts.update(tokens)
    return dict(counts)


base_counts = count_tokens(base_posts)
# Must be a dict as well: the array-building cell looks words up with .get()
hate_counts = count_tokens(hate_posts)

In [None]:
# Keep the 100,000 most frequent hate tokens, ordered from most to least common
top_hate_tokens = Counter(hate_counts).most_common(100000)
hate_vocab = [token for token, _ in top_hate_tokens]

In [None]:
# Line both count tables up on the hate vocabulary so they can be compared element-wise
hate_freqs = [hate_counts.get(token, 0) for token in hate_vocab]
base_freqs = [base_counts.get(token, 0) for token in hate_vocab]

hate_array = np.array(hate_freqs)
# Add one to every base count so no entry is zero when the log is taken
base_array = np.array(base_freqs) + 1.

In [None]:
# Run SAGE against the base-sample log-frequencies and keep the top 10,000 words
base_total = base_array.sum()
mu = np.log(base_array) - np.log(base_total)  # log share of each word in the base counts
beta = sage.estimate(hate_array, mu)
hate_words = sage.topK(beta, hate_vocab, 10000)
print(hate_words)

In [None]:
# Save the hate words to a one-column CSV, one word per row
# newline="" leaves line endings to the csv module, as its documentation requires;
# without it, platforms that translate newlines write a blank line after every row
with open(os.path.join(data_path, "hate_words.csv"), "w", newline="") as f:
    csv.writer(f).writerows([word] for word in hate_words)

In [None]:
# Hold out 20% of the rows for test, then 20% of what remains for validation
train_val, test = train_test_split(train, test_size=0.2, random_state=2)
train, validate = train_test_split(train_val, test_size=0.2, random_state=2)