This file tests the one layer NN vs a direct linear classifier using GloVe vectors to predict the 8 hatebase features. We define a new cost function that is similar to AUC but deals with the 8 features separately.

In [1]:
from tf_custom_models import OneLayerNN, SoftmaxClassifier
from utility import train_and_eval_auc, HATEBASE_FIELDS
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import roc_auc_score as AUC

import matplotlib.pyplot as plt

import os
from os.path import join as pjoin

from tqdm import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
import logging
import json
import itertools

In [26]:
DATA_DIR = "data/hatebase"
EMBEDDING_SIZE = 100
STATE_SIZE = 100
GLOVE_SIZE = 1193514
GLOVE_PATH = "data/glove/glove.twitter.27B.%dd.txt" % EMBEDDING_SIZE

EMBED_PATH = "data/hatebase/embeddings.%dd.dat" % EMBEDDING_SIZE
HIDDEN_EMBED_PATH = "data/hatebase/embeddings.hidden.%dd.dat" % EMBEDDING_SIZE
HB_PATH = "data/hatebase/lexicon.csv"
VOCAB_PATH = "data/hatebase/vocab.dat"

In [27]:
def load_embeddings(embed_path, vocab, force=False):
    if not os.path.exists(embed_path):
        hb_vecs = np.zeros((len(vocab), EMBEDDING_SIZE))
        with open(GLOVE_PATH, 'r') as fh:
            found = []
            for line in tqdm(fh, total=GLOVE_SIZE):
                array = line.strip().split(" ")
                word = array[0]
                if word in vocab:
                    idx = vocab[word]
                    found.append(idx)
                    vector = list(map(float, array[1:]))
                    hb_vecs[idx, :] = vector
            # words not found are set to random values
            unfound = list(set(vocab.values()) - set(found))
            for i in unfound:
                hb_vecs[i, :] = np.random.randn(EMBEDDING_SIZE)
                
        hb_vecs = pd.DataFrame(hb_vecs)
        hb_vecs.to_csv(embed_path, header = False, index = False)
        return hb_vecs

    with open(embed_path, 'rb') as embed_path:
        data_x = pd.read_csv( embed_path, header = None, quoting = 0, dtype = np.float32 )
        return data_x

In [4]:
# grab the data
hatebase_data = pd.read_csv( HB_PATH, header = 0, index_col = 0, quoting = 0, 
                                dtype = HATEBASE_FIELDS, usecols = range(9) )
vocab = dict([(x, y) for (y, x) in enumerate(hatebase_data.index)])
hatebase_embeddings = load_embeddings(EMBED_PATH, vocab, True)

train_i, test_i = train_test_split( np.arange( len( hatebase_embeddings )), train_size = 0.8, random_state = 44 )
train_x = hatebase_embeddings.iloc[train_i]
test_x = hatebase_embeddings.iloc[test_i]
train_y = hatebase_data.iloc[train_i]
test_y = hatebase_data.iloc[test_i]



In [11]:
def total_jaccard( train_x, train_y, test_x, test_y, model ):
    model.fit( train_x, train_y )
    p = model.predict( test_x )
    #print p
    p = (p >= 0.5).astype(float)
    total = sum([jaccard_similarity_score(y_true, y_pred) for y_true, y_pred in zip(test_y, p)])
    print "Total Jaccard similarity:", total/len(test_x)

In [6]:
def train_and_eval_auc( train_x, train_y, test_x, test_y, model ):
    model.fit( train_x, train_y )
    p = model.predict_proba( test_x )
    p = p[:,1] if p.shape[1] > 1 else p[:,0]

    auc = AUC( test_y, p )
    print "AUC:", auc

In [13]:
tf.reset_default_graph()
nn = OneLayerNN()
nn.fit( hatebase_embeddings, hatebase_data )
hidden_states = nn.return_hidden_states( hatebase_embeddings )

Iteration 1000: loss: 0.161815926433 

In [23]:
# write hidden states
hidden_states = pd.DataFrame(hidden_states)
hidden_states.to_csv(HIDDEN_EMBED_PATH, header = False, index = False)

           0         1         2         3         4         5         6   \
0    0.456425  0.414027  0.604445  0.572907  0.432509  0.439992  0.494656   
1    0.575999  0.455576  0.908471  0.449966  0.684535  0.407860  0.488815   
2    0.493667  0.476726  0.282094  0.735881  0.467950  0.133803  0.946229   
3    0.311967  0.619841  0.234967  0.085983  0.134568  0.138706  0.719238   
4    0.529106  0.495026  0.551225  0.625327  0.552845  0.330481  0.562362   
5    0.286733  0.408641  0.223853  0.677007  0.429127  0.423491  0.435312   
6    0.662635  0.467512  0.368308  0.603700  0.416271  0.715217  0.264777   
7    0.527670  0.726104  0.595309  0.514192  0.426509  0.479006  0.489968   
8    0.589656  0.618188  0.280485  0.339683  0.666604  0.244354  0.482003   
9    0.374517  0.616019  0.730836  0.519205  0.365687  0.601393  0.548884   
10   0.552217  0.590203  0.656959  0.602282  0.625810  0.339995  0.580408   
11   0.180886  0.732642  0.722418  0.650592  0.513026  0.448571  0.597469   

In [28]:
with open(VOCAB_PATH, mode="wb") as vocab_file:
    for w in hatebase_data.index.values:
        vocab_file.write(w + b"\n")

In [12]:
tf.reset_default_graph()
nn = OneLayerNN()
total_jaccard( train_x, train_y.iloc[:,:7], test_x, test_y.iloc[:,:7].values, nn )

Iteration 1000: loss: 0.0911179706454 

Total Jaccard similarity: 0.793650793651


In [23]:
for i, field in enumerate(HATEBASE_FIELDS):
    print field
    tf.reset_default_graph()
    train_and_eval_auc( train_x, train_y.iloc[:,i], test_x, test_y.iloc[:,i], OneLayerNN() )

about_class


Iteration 1000: loss: 0.184165582061 

AUC: 0.472934472934
about_ethnicity


Iteration 1000: loss: 0.0597166158259 

AUC: 0.564257028112
about_sexual_orientation


Iteration 1000: loss: 0.0233292710036 

AUC: 0.474358974359
about_religion


Iteration 1000: loss: 0.160992875695 

AUC: 0.525017618041
about_disability


Iteration 1000: loss: 0.168781414628 

AUC: 0.554943373125
about_gender


Iteration 1000: loss: 0.0206541772932 

AUC: 0.585034013605
about_nationality


Iteration 1000: loss: 0.0504829958081 

AUC: 0.578034682081
offensiveness


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [60]:
lr = SoftmaxClassifier()

In [59]:
total_jaccard( train_x, train_y.iloc[:,:7], test_x, test_y.iloc[:,:7].values, lr )

Iteration 1000: loss: 1.18320953846 

Total Jaccard similarity: 0.869352869353


In [22]:
for i, field in enumerate(HATEBASE_FIELDS):
    print field
    train_and_eval_auc( train_x, train_y.iloc[:,i], test_x, test_y.iloc[:,i], SoftmaxClassifier() )

about_class


Iteration 1000: loss: 0.245341107249 

AUC: 0.459164292498
about_ethnicity


Iteration 1000: loss: 0.192264601588 

AUC: 0.70749665328
about_sexual_orientation


Iteration 1000: loss: 0.0931176915765 

AUC: 0.65483234714
about_religion


Iteration 1000: loss: 0.448324710131 

AUC: 0.570472163495
about_disability


Iteration 1000: loss: 0.430169701576 

AUC: 0.555096418733
about_gender


Iteration 1000: loss: 0.085390098393 

AUC: 0.296768707483
about_nationality


Iteration 1000: loss: 0.0770960450172 

AUC: 0.335260115607
offensiveness


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

Note: boundary for vectors passed to jaccard sim is 0.5

One Layer NN:

`
On 80-20 split
Iteration 1000: loss: 0.0963332206011 
Total Jaccard similarity: 0.862026862027
`


`about_class
Iteration 1000: loss: 0.184165582061 
AUC: 0.472934472934
about_ethnicity
Iteration 1000: loss: 0.0597166158259 
AUC: 0.564257028112
about_sexual_orientation
Iteration 1000: loss: 0.0233292710036 
AUC: 0.474358974359
about_religion
Iteration 1000: loss: 0.160992875695 
AUC: 0.525017618041
about_disability
Iteration 1000: loss: 0.168781414628 
AUC: 0.554943373125
about_gender
Iteration 1000: loss: 0.0206541772932 
AUC: 0.585034013605
about_nationality
Iteration 1000: loss: 0.0504829958081 
AUC: 0.578034682081
`

Softmax:

`
On 80-20 split
Iteration 1000: loss: 1.18320953846 
Total Jaccard similarity: 0.869352869353
`


`about_class
Iteration 1000: loss: 0.245341107249 
AUC: 0.459164292498
about_ethnicity
Iteration 1000: loss: 0.192264601588 
AUC: 0.70749665328
about_sexual_orientation
Iteration 1000: loss: 0.0931176915765 
AUC: 0.65483234714
about_religion
Iteration 1000: loss: 0.448324710131 
AUC: 0.570472163495
about_disability
Iteration 1000: loss: 0.430169701576 
AUC: 0.555096418733
about_gender
Iteration 1000: loss: 0.085390098393 
AUC: 0.296768707483
about_nationality
Iteration 1000: loss: 0.0770960450172 
AUC: 0.335260115607
`