In [1]:
import paths

from tf_custom_models import OneLayerNN
from utility import train_and_eval_auc, HATEBASE_FIELDS
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt

import os
from os.path import join as pjoin

from tqdm import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
import logging
import json
import itertools

In [2]:
# Emit INFO-level log records from this notebook.
logging.basicConfig(level=logging.INFO)

# Notebook configuration is exposed through TensorFlow's flag registry; the
# functions and cells below read these values as attributes of FLAGS.
FLAGS = tf.app.flags.FLAGS

# tf.app.flags.DEFINE_float("learning_rate", 0.001, "Learning rate.")
# tf.app.flags.DEFINE_float("dropout", 0.15, "Fraction of units randomly dropped on non-recurrent connections.")
# tf.app.flags.DEFINE_integer("batch_size", 10, "Batch size to use during training.")
# tf.app.flags.DEFINE_integer("epochs", 0, "Number of epochs to train.")
tf.app.flags.DEFINE_integer("state_size", 50, "Size of hidden layer.")
# embedding_size selects the GloVe file ("...%dd.txt") and is the width of the
# embedding matrix built by load_embeddings, i.e. it is a vector dimensionality,
# not a vocabulary size.
tf.app.flags.DEFINE_integer("embedding_size", 100, "Dimensionality of the pretrained GloVe embeddings. (default 100)")
tf.app.flags.DEFINE_string("data_dir", "../data/hatebase", "Hatebase directory (default ../data/hatebase)")
tf.app.flags.DEFINE_string("vocab_path", "../data/twitter_davidson/vocab.dat", "Path to vocab file (default: ../data/twitter_davidson/vocab.dat)")
tf.app.flags.DEFINE_boolean("force_load_embeddings", False, "Force loading new hatebase embeddings")

In [7]:
# Module-level list of vocabulary indices that had no GloVe vector.
# load_embeddings() may hand this global back as its second return value when
# it serves embeddings from the on-disk cache, so it has to exist before that
# function is called; the main cell rebinds the name to whatever
# load_embeddings() returns.
unfound_i = []

In [19]:
def load_embeddings(embed_path, vocab, force=False):
    """Load, or build and cache, GloVe vectors for every word in `vocab`.

    When `force` is true or `embed_path` does not exist, the GloVe text file
    for FLAGS.embedding_size is scanned once. Every vocab word present in it
    gets its GloVe vector; every remaining vocab index gets the mean of the
    vectors that were found. The matrix is written to `embed_path` as a
    header-less CSV, and the list of unfound indices is written next to it
    (`embed_path + ".unfound.json"`) so the cached path can return it too.

    Otherwise the cached CSV is read back. The unfound indices come from the
    JSON sidecar when it exists; for caches written before the sidecar
    existed, the module-level `unfound_i` is returned if defined, else `[]`.

    Args:
        embed_path: path of the cached embedding CSV.
        vocab: mapping from word to row index in the embedding matrix.
        force: rebuild from GloVe even if `embed_path` already exists.

    Returns:
        (embeddings, unfound): a DataFrame with one row per vocab index, and
        a list of the row indices that had no GloVe vector. Note the freshly
        built frame is float64 (np.zeros default) while the cached frame is
        read back as float32.
    """
    # Passed to tqdm as the progress-bar total; presumably the line count of
    # the GloVe twitter file -- confirm if the file changes.
    GLOVE_SIZE = 1193514
    GLOVE_PATH = "../data/glove/glove.twitter.27B.%dd.txt" % FLAGS.embedding_size
    unfound_path = embed_path + ".unfound.json"

    if force or not os.path.exists(embed_path):
        hb_vecs = np.zeros((len(vocab), FLAGS.embedding_size))
        # A set, so a word repeated in the GloVe file cannot be counted twice
        # when averaging.
        found = set()
        with open(GLOVE_PATH, 'r') as fh:
            for line in tqdm(fh, total=GLOVE_SIZE):
                array = line.strip().split(" ")
                word = array[0]
                if word in vocab:
                    idx = vocab[word]
                    found.add(idx)
                    hb_vecs[idx, :] = list(map(float, array[1:]))

        unfound = sorted(int(i) for i in set(vocab.values()) - found)
        if not found:
            # Averaging an empty selection would fill the matrix with NaN;
            # keep the zero vectors instead and make the problem visible.
            logging.warning("No vocabulary word was found in %s; embeddings left as zeros.", GLOVE_PATH)
        elif unfound:
            # words not found are set to average of other words
            hb_vecs[unfound, :] = hb_vecs[sorted(found), :].mean(axis=0)

        hb_vecs = pd.DataFrame(hb_vecs)
        hb_vecs.to_csv(embed_path, header=False, index=False)
        with open(unfound_path, 'w') as fh:
            json.dump(unfound, fh)
        return hb_vecs, unfound

    data_x = pd.read_csv(embed_path, header=None, quoting=0, dtype=np.float32)
    if os.path.exists(unfound_path):
        with open(unfound_path, 'r') as fh:
            unfound = json.load(fh)
    else:
        # Legacy cache without a sidecar: fall back to the notebook global the
        # original implementation returned, without raising if it is missing.
        unfound = globals().get("unfound_i", [])
    return data_x, unfound

In [4]:
def get_compare_embeddings(original_embeddings, tuned_embeddings, vocab, dimreduce_type="pca", random_state=0):
    """Build a lookup that reports a word's 2-D position before and after tuning.

    Both embedding matrices are reduced to two components with the reducer
    selected by `dimreduce_type` ("pca" or "tsne"). The reducer is fitted
    separately on each matrix (two `fit_transform` calls), so the two
    projections do not share axes.

    Returns a function `compare_embeddings(word)` that yields
    `(original_x, original_y, tuned_x, tuned_y)` for a word in `vocab`, or
    `None` for an unknown word.

    Raises:
        Exception: if `dimreduce_type` is neither "pca" nor "tsne".
    """
    # Imports stay lazy so an invalid type fails before sklearn is touched.
    if dimreduce_type == "pca":
        from sklearn.decomposition import PCA as reducer_cls
    elif dimreduce_type == "tsne":
        from sklearn.manifold import TSNE as reducer_cls
    else:
        raise Exception("Wrong dimreduce_type.")

    projector = reducer_cls(n_components=2, random_state=random_state)
    coords_before = projector.fit_transform(original_embeddings)
    coords_after = projector.fit_transform(tuned_embeddings)

    def compare_embeddings(word):
        if word in vocab:
            row = vocab[word]
            before_x, before_y = coords_before[row, :]
            after_x, after_y = coords_after[row, :]
            return before_x, before_y, after_x, after_y
        return None

    return compare_embeddings

In [14]:
def print_embeddings(embeddings_list, vocab):
    '''Scatter-plot several embedding sets after reducing each to 2-D with PCA.

    Takes a list of embeddings that have the same indices. Each set of
    embeddings is plotted in a different color (cycling red, blue, green) on
    one shared figure, and every point is annotated with a label.

    Args:
        embeddings_list: iterable of 2-D array-likes, one row per word.
        vocab: iterable of labels, paired with the rows in iteration order.
            NOTE(review): if a dict is passed, its key order must match the
            row order -- confirm at the call site.

    Side effects: sets numpy's global print option `suppress=True` and shows
    the figure with `plt.show()`.
    '''
    # PCA is re-fitted for each embedding set, so the sets do not share axes.
    pca = PCA(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    colors = itertools.cycle(["r", "b", "g"])

    for wv in embeddings_list:
        Y = pca.fit_transform(wv)

        plt.scatter(Y[:, 0], Y[:, 1], color=next(colors))
        for label, x, y in zip(vocab, Y[:, 0], Y[:, 1]):
            plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    plt.show()

In [None]:
# main function
# Format the file name before joining, so a '%' in data_dir cannot break
# (or be consumed by) the string formatting.
embed_path = pjoin(FLAGS.data_dir, "embeddings.%dd.vec" % FLAGS.embedding_size)
hb_path = pjoin(FLAGS.data_dir, "lexicon.csv")

hatebase_data = pd.read_csv( hb_path, header = 0, index_col = 0, quoting = 0, 
                                dtype = HATEBASE_FIELDS, usecols = range(9) )
# Map each lexicon term (the frame's index) to its row position.
vocab = {term: row for row, term in enumerate(hatebase_data.index)}
# NOTE(review): force is hard-coded to True, so FLAGS.force_load_embeddings is
# never consulted and the GloVe file is rescanned on every run.
hatebase_embeddings, unfound_i = load_embeddings(embed_path, vocab, True)

train_i, test_i = train_test_split( np.arange( len( hatebase_embeddings )), train_size = 0.9, random_state = 44 )
# train_i / test_i are row positions, so select positionally with .iloc
# (the deprecated .ix only worked here by falling back to positions).
train_x = hatebase_embeddings.iloc[train_i]
test_x = hatebase_embeddings.iloc[test_i]
train_y = hatebase_data.iloc[train_i]
test_y = hatebase_data.iloc[test_i]

nn = OneLayerNN()
#train_and_eval_auc( train_x, train_y, test_x, test_y, model=nn )
# The model is fitted on the full data set; the split above is only used by
# the commented-out evaluation call.
nn.fit( hatebase_embeddings, hatebase_data )
hidden_states = nn.return_hidden_states( hatebase_embeddings )

 51%|█████▏    | 613937/1193514 [00:06<00:06, 93985.27it/s]

In [25]:
# graph stuff
#cmpr_fn = get_compare_embeddings(hatebase_embeddings, hidden_states, vocab)
#print_embeddings( [hatebase_embeddings.values, hidden_states], vocab, 50 )
# Plot only the words that had no GloVe vector: their (averaged) input
# embeddings next to the hidden states the network produced for them.
unfound_vocab = [hatebase_data.index[i] for i in unfound_i]
# unfound_i holds row positions, so index positionally with .iloc instead of
# the deprecated .ix.
# NOTE(review): the recorded output of this cell is an IndexError; its cause
# is not visible here (it depends on what nn.return_hidden_states returns) --
# confirm that hidden_states is a 2-D numpy array before relying on this cell.
print_embeddings( [hatebase_embeddings.iloc[unfound_i], hidden_states[unfound_i, :]], unfound_vocab )

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [24]:
# List the lexicon terms that had no GloVe vector. Written as a call so the
# cell parses on Python 3 as well; on Python 2 a single parenthesised argument
# prints exactly what the print statement did.
print([hatebase_data.index[i] for i in unfound_i])

['bluegum', 'neechee', 'jockie', 'russellite ', 'mud shark', 'zionazi', 'kotiya', 'papist', 'paleface', 'pineapple nigger', 'dune nigger', 'stump jumper', 'shyster', 'fuzzy wuzzy', 'darkey', 'towel head', 'honkie', 'bounty bar', 'kushite', 'whore from fife', 'mackerel snapper ', 'nitchee', 'conspiracy theorist', 'black dago', 'sawney', 'moss eater', 'heinie', 'slopehead', 'zippohead', 'gooky', 'rico suave', 'lugan', 'island nigger', 'carpet pilot', 'hymie', 'cocoa puff', 'eight ball', 'timber nigger', 'buddhahead', 'sideways pussy', 'spice nigger', 'smoke jumper', 'hayseed', 'banjo lips', 'eh hole', 'aunt jemima', 'carrot snapper', 'powderburn', 'mangia cake', 'gurrier', 'gator bait', 'dhimmi', 'bitter clinger', 'muzzie', 'shanty irish', 'aunt sally', 'moulie', 'jungle bunny', 'gippo', 'chonky', 'diaper head', 'cowboy killer', 'americoon', 'tynkere', 'scag', 'nig nog', 'proddy dog', 'octroon', 'wexican', 'ching chong', 'charver', 'lowlander', 'clamhead', 'bamboo coon', 'camel cowboy', 