In [1]:
import pandas as pd
from tqdm import tqdm, tqdm_notebook
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

# Register tqdm's pandas integration so that progress_* methods
# (e.g. Series.progress_map, used further down) are available.
# NOTE(review): tqdm_notebook is passed here as a positional argument to
# tqdm.pandas — confirm that the installed tqdm version accepts this call
# form and that it actually produces the intended notebook-style bar.
tqdm.pandas(tqdm_notebook)

In [2]:
# Build the ekphrasis text preprocessor instance
ekphrasis_processor = TextPreProcessor(
    # terms to normalize
    normalize=[
        'url',
        'email',
        'percent',
        'money',
        'phone',
        'user',
        'time',
        'date',
        'number',
    ],
    fix_html=True,              # fix HTML tokens
    segmenter="english",        # corpus for word segmentation
    corrector="english",        # corpus for spell correction
    unpack_hashtags=True,       # perform word segmentation on hashtags
    unpack_contractions=True,   # unpack contractions
    spell_correct_elong=False,  # spell correction for elongated words is off
    dicts=[emoticons],          # replacement dictionaries: ekphrasis' emoticons dict
)

# Tokenizer constructed with lowercase=False; only its bound `tokenize`
# method is kept under the public name.
social_tokenizer = SocialTokenizer(lowercase=False)
ekphrasis_tokenizer = social_tokenizer.tokenize
def flatten(l):
    """Concatenate the items of every sub-iterable in ``l`` into one list.

    Exactly one level of nesting is removed: elements of the inner
    iterables are copied as-is, even if they are themselves lists.

    Parameters
    ----------
    l : iterable of iterables
        The nested collection to flatten.

    Returns
    -------
    list
        All inner items, in their original order.
    """
    return [item for sublist in l for item in sublist]

Reading english - 1grams ...
Reading english - 2grams ...
Reading english - 1grams ...


In [3]:
# Read the sampled main dataset from CSV into a DataFrame
train = pd.read_csv(
    filepath_or_buffer="/home/dfsnow/rbans/data/main_data_sample.csv"
)

In [None]:
# Shuffle the order of the training data.
# A fixed random_state makes the row order (and any file written from this
# frame) reproducible across kernel restarts; 2 is the same seed that is
# passed to train_test_split at the end of the notebook.
train = shuffle(train, random_state=2)

In [None]:
# Persist the shuffled frame to disk as a pickle
train.to_pickle(path="/home/dfsnow/rbans/data/main_data_shuffled.pickle")

In [None]:
# Preprocess all of the comment bodies and keep the result.
# progress_map returns a new Series rather than modifying `train`, so the
# output has to be assigned back; otherwise the cleaned text is thrown away
# and every later step still sees the raw bodies.
# .assign() builds a new frame instead of writing into a column in place.
train = train.assign(
    body=train["body"].progress_map(ekphrasis_processor.pre_process_doc)
)

In [None]:
# Write the current `train` frame to the preprocessed-data CSV on disk
train.to_csv(
    path_or_buf="/home/dfsnow/rbans/data/main_data_shuffled_preprocessed.csv"
)

In [None]:
# Build a small vocabulary: the 500 most frequent tokens in a sample of bodies.
# NOTE(review): the slice starts at 1, so the first row of `train` is not
# part of the sample — confirm whether [:500] was intended.
sample_bodies = train[1:500].body
tokenized_bodies = [ekphrasis_tokenizer(text) for text in sample_bodies]
token_counts = Counter(flatten(tokenized_bodies))
vocab = token_counts.most_common(500)

In [None]:
# Partition the data into train, test, and validate sets:
# first hold out 20% as the test set, then hold out 20% of what remains
# as the validation set. Both splits use random_state=2.
train_val, test = train_test_split(train, test_size=0.2, random_state=2)
train, validate = train_test_split(train_val, test_size=0.2, random_state=2)