In this notebook we'll use an RNN to identify domain-specific tokens that allow the model to cheat, and
remove them.

In [1]:
from __future__ import unicode_literals, print_function

import os
import pickle
from collections import defaultdict

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, log_loss
from sklearn.preprocessing import LabelBinarizer
import numpy as np
import pandas as pd

from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint

import utils
import rnn

%load_ext autoreload

Using TensorFlow backend.


In [2]:
np.random.seed(42)

### Read in data

In [3]:
%%time

filepath = '../data/title-1-True.pkl'

df, keep_stops, min_sents, vocab, w2i, i2w = utils.read_dataset(filepath)

CPU times: user 1.52 s, sys: 28 ms, total: 1.54 s
Wall time: 1.54 s


### Limit vocab size, pad sequences, and split data

In [8]:
maxlen = 20        # Max number of tokens in input sequence
topn = 6747        # Keep only the top n words in vocabulary

In [5]:
%%time

# Filter out uncommon words.

df['encoded_text'] = utils.filter_top_words(df['encoded_text'].tolist(), topn)

CPU times: user 152 ms, sys: 8 ms, total: 160 ms
Wall time: 153 ms


In [6]:
X = pad_sequences(df['encoded_text'], maxlen=maxlen, value=topn,  
                  padding='post', truncating='post')

In [None]:
lb = LabelBinarizer()
y = lb.fit_transform(df['encoded_domain'])

y.shape

In [None]:
# Delete the dataframe, we are done with it for now!
del df

### Define model

This is an important part, so I'll be explicit here rather than hiding things in `utils :)`

In [10]:
embed_dim = 256          # embedding dimension for word vecs
num_gru = 1              # number of GRUs to use in serial
gru_dim = 256            # dimension of GRU layers
gru_activation = 'sigmoid'  # activation function for GRU layer
bidirectional = False    # whether to use bidirectional
dense_dim = 256          # dimensionality of dense layer
dropout = 0.5            # dropout ratio
batch_size = 64          # batch size
validation_split = 0.1   # Fraction of samples to keep out for validation
max_epochs = 50          # maximum number of epochs to run for

In [11]:
# Classes are pretty imbalanced, so let's balance them out

weights = []
for i in lb.classes_:
    weights.append(len(np.where(np.argmax(y, axis=1) == i)[0]))
    
max_weight = max(weights)
weights = [1.*x/max_weight for x in weights]
class_weight = {k: w for k, w in zip(lb.classes_, weights)}

class_weight

{0: 0.0975417378730117,
 1: 0.11055606678059682,
 2: 0.49152096752990665,
 3: 0.014328907585118969,
 4: 0.6015512028394899,
 5: 0.41869330879453137,
 6: 1.0,
 7: 0.2020507427369528,
 8: 0.32023136584724593,
 9: 0.0984619429472854,
 10: 0.5774944130406204,
 11: 0.15958985145260943,
 12: 0.1125279347968976,
 13: 0.06441435519915867,
 14: 0.10148547390561326,
 15: 0.33258840541606416,
 16: 0.15038780070987248,
 17: 0.4878401472328119,
 18: 0.26317865124227685,
 19: 0.19232286052320233,
 20: 0.30971473642697517,
 21: 0.08847114499802813,
 22: 0.16550545550151177,
 23: 0.10674378861574865,
 24: 0.10135401603785986,
 25: 0.6331010911003023,
 26: 0.2681740502169055,
 27: 0.5297752070461417,
 28: 0.2250558695937952,
 29: 0.19666097015906403,
 30: 0.2576574207966347,
 31: 0.08373866175890628}

In [13]:
# Let's save our best current checkpoints, and stop if we haven't improved in 3 iterations w.r.t. val_acc.

model_dir = 'models'
basename = '{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}'.format(gru_dim, num_gru, embed_dim, dense_dim,  
                dropout, bidirectional, maxlen, topn, batch_size, len(lb.classes_), gru_activation) 
filepath = os.path.join(model_dir, basename + '_{epoch:03d}_{val_loss:.5f}_{val_acc:.5f}_titles.h5')

print('saving checkpoints to: {}'.format(filepath))

model_checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True, 
                                   save_weights_only=False, mode='auto', period=1)

early_stopping = EarlyStopping(monitor='val_loss', patience=3)

saving checkpoints to: models/256_1_256_256_0.5_False_20_6747_64_32_sigmoid_{epoch:03d}_{val_loss:.5f}_{val_acc:.5f}_titles.h5


In [14]:
model = rnn.get_training_model(topn, embed_dim, dense_dim, gru_dim, num_gru, maxlen, dropout,
                               bidirectional, len(lb.classes_), gru_activation)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 256)           1727488   
_________________________________________________________________
gru_1 (GRU)                  (None, 256)               393984    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                8224      
Total params: 2,195,488
Trainable params: 2,195,488
Non-trainable params: 0
_________________________________________________________________


### Train!

In [15]:
hist = model.fit(X, y, epochs=max_epochs, validation_split=validation_split,
                 callbacks=[model_checkpoint, early_stopping],
                 class_weight=class_weight)

Train on 59992 samples, validate on 6666 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50


### Use model to remove "hints"

#### Start by reloading the data

In [6]:
%%time

# Load the source data frame

filepath = '../data/title-1-True.pkl'

df, keep_stops, min_sents, vocab, w2i, i2w = utils.read_dataset(filepath)

CPU times: user 1.55 s, sys: 20 ms, total: 1.57 s
Wall time: 1.58 s


In [9]:
%%time

# Filter out uncommon words using parameters parsed from model name.

df['encoded_text'] = utils.filter_top_words(df['encoded_text'].tolist(), topn)

CPU times: user 128 ms, sys: 12 ms, total: 140 ms
Wall time: 121 ms


In [10]:
# Create input/output data

X = pad_sequences(df['encoded_text'], maxlen=maxlen, value=topn, 
                  padding='post', truncating='post')

Y = df['encoded_domain'].tolist()

In [11]:
%%time

model_name = 'models/256_1_256_256_0.5_False_20_6747_64_32_sigmoid_003_0.72958_0.40549_titles.h5'

# Get model params from name
gru_dim, num_gru, embed_dim, dense_dim, dropout, bidirectional, maxlen, topn, batch_size, output_size, gru_activation = \
  utils.parse_model_name(model_name)

# Load the keras model
model = load_model(model_name)


CPU times: user 1.24 s, sys: 612 ms, total: 1.85 s
Wall time: 1.3 s


In [12]:
# Split the model so you can predict token by token

in_model, out_model = rnn.split_model_layers(model, topn, embed_dim, dense_dim, gru_dim, num_gru, maxlen, 
  output_size, bidirectional, gru_activation)


#### Identify domains with lots of "suspicious", ie. very high confidence, predictions

In [13]:
# Get output probabilities for every observation.

Ypred = model.predict(X)

In [14]:
# Create a column with the highest prediction

df['predicted_prob'] = np.max(Ypred, axis=1)

In [15]:
# Create a column with the predicted domain

df['predicted_domain'] = model.predict_classes(X)



In [16]:
# Pull out high-probability samples

hi = pd.DataFrame(df[df['predicted_prob'] > 0.8])
hi.shape

(16204, 10)

In [17]:
# Look at domains that have the highest probabilities

conf_thresh = 0.10  # threshold for how many high-confidence samples a domain has

counts_hi = pd.DataFrame(hi['domain'].value_counts(), dtype=float) 
counts_all = pd.DataFrame(df['domain'].value_counts())

# Compute the ratio of hi-conf
for row in counts_hi.iterrows():
    counts_hi.ix[row[0]] /= counts_all.ix[row[0]]

# Sort by percent of hi-conf and truncate at threshold
counts_hi = counts_hi.sort_values('domain', ascending=False)
counts_hi = counts_hi[counts_hi['domain'] > conf_thresh]
counts_hi

Unnamed: 0,domain
ap.org,1.0
americanthinker.com,0.998652
realclearpolitics.com,0.997003
rightwingnews.com,0.990662
westernjournalism.com,0.619388
breitbart.com,0.605777
go.com,0.428636
cnn.com,0.318785
washingtonpost.com,0.266253
nytimes.com,0.12665


#### Use the model to identify tokens used to "cheat" and create blacklist

In [63]:
def get_hival_tokens(domain, conf_thresh=0.9, token_thresh=0.9):
    ''' Returns a dictionary of tokens that occur almost always in a specific domain.
    These are considered hints that the model is using to solve the bias classification
    problem, and should be removed.
    
    I'm being a little sloppy here and pulling X df, *_model from global scope :P
    
    Args:
        domain_thresh: threshold above which a domain prediction is considered high-confidence
        token_thresh: threshold for considering a token to have a high confidence
    '''

    # Extract the specified domain
    hi = pd.DataFrame(df[df['domain'] == domain])

    # Only consider rows where the prediction was correct, i.e., cheating may have happened
    hi = pd.DataFrame(hi[hi['encoded_domain'] == hi['predicted_domain']])
    
    # Select only high-confidence samples
    hi = pd.DataFrame(hi[hi['predicted_prob'] > conf_thresh])
  
    # For each observation
    tokens = defaultdict(float)
    for row in hi.iterrows():
        idx = np.where(df.index == row[0])[0][0]
        P = rnn.sequential_pred_for_class(X, df, idx, in_model, out_model)
        T = df['tokenized'].iloc[idx].split()
        hival_idx = np.where(P > token_thresh)[0]
        for i in hival_idx:
            if i >= len(T):
                break
            tokens[T[i]] += 1
  
    # Normalize the token counts as probabilities
    for k in tokens:
        tokens[k] = 1. * tokens[k] / hi.shape[0]
  
    return hi, tokens

def create_blacklist(conf_thresh=0.2, domain_prob=0.9, token_thresh=0.9, count_thresh=0.1):
    '''This function returns a blacklist dictionary indicating which tokens are allowing
    the model to cheat for each domain.
    
    Args:
        conf_thresh: threshold for considering a prediction high-confidence
        domain_prob: threshold for considering a domain to have a high number of hi-conf predictions
        token_thresh: threshold for fraction of articles containing token
        count_thresh: threshold for number of times a token must 
    '''
    
    # Get rows corresponding to high predicted probability
    hi = pd.DataFrame(df[df['predicted_prob'] > conf_thresh])

    # Get fraction of high-confidence samples per domain
    counts_hi = pd.DataFrame(hi['domain'].value_counts(), dtype=float)
    counts_all = pd.DataFrame(df['domain'].value_counts())
    for row in counts_hi.iterrows():
        counts_hi.ix[row[0]] /= counts_all.ix[row[0]]

    # Sort the count values by domain
    counts_hi = counts_hi.sort_values('domain', ascending=False)
    counts_hi = counts_hi[counts_hi['domain'] > domain_prob]

    # For each domain, identify tokens that give the model high confidence
    blacklist = {}
    for domain in counts_hi.index:
        print(domain)
        hi, tokens = get_hival_tokens(domain, conf_thresh, token_thresh)
        counts = [(v, k) for k, v in tokens.iteritems()]
        blacklist[domain] = [x[1] for x in counts if x[0] >= count_thresh]
        print('  {}'.format(blacklist[domain]))

    return blacklist

In [65]:
blacklist = create_blacklist(count_thresh=0.05, token_thresh=0.85)

americanthinker.com
  [u'and_CCONJ', u'in_ADP', u'-PRON-_PRON', u'article_NOUN', u'the_DET', u'a_DET', u'of_ADP', u'for_ADP', u'trump_PROPN', u'on_ADP', u'to_ADP', u'be_VERB', u'to_PART', u'obama_PROPN']
ap.org
  [u'press_PROPN', u'associated_PROPN']
realclearpolitics.com
  [u'realclearpolitics_PROPN', u'realclearpolitic_NOUN']
rightwingnews.com
  [u'news_PROPN', u'right_PROPN', u'hawkins_PROPN', u'wing_PROPN', u'john_PROPN']
cnn.com
  [u'video_PROPN', u'cnn_PROPN']
breitbart.com
  [u'breitbart_PROPN']
nytimes.com
  []
westernjournalism.com
  [u'with_ADP', u'in_ADP', u'-PRON-_PRON', u'do_VERB', u'news_PROPN', u'the_DET', u'instantly_ADV', u'announcement_NOUN', u'trump_PROPN', u'to_ADP', u'-PRON-_ADJ', u'be_VERB', u'just_ADV']
wsj.com
  []
theatlantic.com
  []
washingtonpost.com
  [u'and_CCONJ', u'in_ADP', u'-PRON-_PRON', u'opinion_NOUN', u'the_DET', u'a_DET', u'of_ADP', u'for_ADP', u'perspective_NOUN', u'trump_PROPN', u'-PRON-_ADJ', u'be_VERB', u'to_PART']
usatoday.com
  []
newsmax.com

In [66]:
blacklist

{u'americanthinker.com': [u'and_CCONJ',
  u'in_ADP',
  u'-PRON-_PRON',
  u'article_NOUN',
  u'the_DET',
  u'a_DET',
  u'of_ADP',
  u'for_ADP',
  u'trump_PROPN',
  u'on_ADP',
  u'to_ADP',
  u'be_VERB',
  u'to_PART',
  u'obama_PROPN'],
 u'ap.org': [u'press_PROPN', u'associated_PROPN'],
 u'breitbart.com': [u'breitbart_PROPN'],
 u'cnbc.com': [],
 u'cnn.com': [u'video_PROPN', u'cnn_PROPN'],
 u'go.com': [u'and_CCONJ',
  u'with_ADP',
  u'in_ADP',
  u'the_DET',
  u'after_ADP',
  u'a_DET',
  u'of_ADP',
  u'video_NOUN',
  u'at_ADP',
  u'for_ADP',
  u'trump_PROPN',
  u'on_ADP',
  u'to_ADP',
  u'-PRON-_ADJ',
  u'be_VERB',
  u'to_PART',
  u'president_PROPN'],
 u'newsmax.com': [],
 u'nytimes.com': [],
 u'realclearpolitics.com': [u'realclearpolitics_PROPN',
  u'realclearpolitic_NOUN'],
 u'rightwingnews.com': [u'news_PROPN',
  u'right_PROPN',
  u'hawkins_PROPN',
  u'wing_PROPN',
  u'john_PROPN'],
 u'theatlantic.com': [],
 u'usatoday.com': [],
 u'washingtonpost.com': [u'and_CCONJ',
  u'in_ADP',
  u'-PR

#### Filter out cheat words and re-encode text

In [1]:
dg = df.copy()
#df = dg.copy()

In [68]:
for domain in blacklist.keys():
    domain_index = df[df['domain'] == domain].index
    data = df.ix[domain_index, 'tokenized'].tolist()
    for ix, d in enumerate(data):
        data[ix] = ' '.join([x for x in d.split() if x not in blacklist[domain]])
    df.loc[domain_index, 'tokenized'] = data

In [69]:
df['encoded_text'] = df['tokenized'].map(lambda x: [w2i[y] for y in x.split()])

#### Write the data

In [70]:
%%time

OUTPUT_FILE = '../data/title-{}-{}-clean.pkl'.format(min_sents, keep_stops)

_ = utils.write_dataset(OUTPUT_FILE, df, keep_stops, min_sents, vocab, w2i, i2w)

wrote to ../data/title-1-True-clean.pkl
CPU times: user 1.8 s, sys: 36 ms, total: 1.84 s
Wall time: 1.83 s
