In this notebook we'll do further processing of the input text (in the spirit of hyperparameter tuning, rather than cleaning etc).  We'll then build and train a simple RNN classifier.

In [90]:
from __future__ import unicode_literals, print_function

import os
import pickle
from collections import defaultdict

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, log_loss
from sklearn.preprocessing import LabelBinarizer
import numpy as np
import pandas as pd

from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint

import utils
import rnn

%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
np.random.seed(42)

### Read in data

In [4]:
%%time

filepath = '../data/title-1-True.pkl'

df, keep_stops, min_sents, vocab, w2i, i2w = utils.read_dataset(filepath)

CPU times: user 1.52 s, sys: 40 ms, total: 1.56 s
Wall time: 1.56 s


### Limit vocab size, pad sequences, and split data

In [5]:
maxlen = 20        # Max number of tokens in input sequence
topn = 6747        # Keep only the top n words in vocabulary

In [6]:
%%time

# Filter out uncommon words.

df['encoded_text'] = utils.filter_top_words(df['encoded_text'].tolist(), topn)

CPU times: user 160 ms, sys: 44 ms, total: 204 ms
Wall time: 162 ms


In [7]:
X = pad_sequences(df['encoded_text'], maxlen=maxlen, value=topn,  
                  padding='post', truncating='post')

In [8]:
lb = LabelBinarizer()
y = lb.fit_transform(df['encoded_domain'])

y.shape

(66658, 32)

In [9]:
# Delete the dataframe, we are done with it for now!
del df

### Define model

This is an important part, so I'll be explicit here rather than hiding things in `utils :)`

In [10]:
embed_dim = 256          # embedding dimension for word vecs
num_gru = 1              # number of GRUs to use in serial
gru_dim = 256            # dimension of GRU layers
gru_activation = 'sigmoid'  # activation function for GRU layer
bidirectional = False    # whether to use bidirectional
dense_dim = 256          # dimensionality of dense layer
dropout = 0.5            # dropout ratio
batch_size = 64          # batch size
validation_split = 0.1   # Fraction of samples to keep out for validation
max_epochs = 50          # maximum number of epochs to run for

In [11]:
# Classes are pretty imbalanced, so let's balance them out

weights = []
for i in lb.classes_:
    weights.append(len(np.where(np.argmax(y, axis=1) == i)[0]))
    
max_weight = max(weights)
weights = [1.*x/max_weight for x in weights]
class_weight = {k: w for k, w in zip(lb.classes_, weights)}

class_weight

{0: 0.0975417378730117,
 1: 0.11055606678059682,
 2: 0.49152096752990665,
 3: 0.014328907585118969,
 4: 0.6015512028394899,
 5: 0.41869330879453137,
 6: 1.0,
 7: 0.2020507427369528,
 8: 0.32023136584724593,
 9: 0.0984619429472854,
 10: 0.5774944130406204,
 11: 0.15958985145260943,
 12: 0.1125279347968976,
 13: 0.06441435519915867,
 14: 0.10148547390561326,
 15: 0.33258840541606416,
 16: 0.15038780070987248,
 17: 0.4878401472328119,
 18: 0.26317865124227685,
 19: 0.19232286052320233,
 20: 0.30971473642697517,
 21: 0.08847114499802813,
 22: 0.16550545550151177,
 23: 0.10674378861574865,
 24: 0.10135401603785986,
 25: 0.6331010911003023,
 26: 0.2681740502169055,
 27: 0.5297752070461417,
 28: 0.2250558695937952,
 29: 0.19666097015906403,
 30: 0.2576574207966347,
 31: 0.08373866175890628}

In [13]:
# Let's save our best current checkpoints, and stop if we haven't improved in 3 iterations w.r.t. val_acc.

model_dir = 'models'
basename = '{}_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}'.format(gru_dim, num_gru, embed_dim, dense_dim,  
                dropout, bidirectional, maxlen, topn, batch_size, len(lb.classes_), gru_activation) 
filepath = os.path.join(model_dir, basename + '_{epoch:03d}_{val_loss:.5f}_{val_acc:.5f}_titles.h5')

print('saving checkpoints to: {}'.format(filepath))

model_checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True, 
                                   save_weights_only=False, mode='auto', period=1)

early_stopping = EarlyStopping(monitor='val_loss', patience=3)

saving checkpoints to: models/256_1_256_256_0.5_False_20_6747_64_32_sigmoid_{epoch:03d}_{val_loss:.5f}_{val_acc:.5f}_titles.h5


In [14]:
model = rnn.get_training_model(topn, embed_dim, dense_dim, gru_dim, num_gru, maxlen, dropout,
                               bidirectional, len(lb.classes_), gru_activation)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 256)           1727488   
_________________________________________________________________
gru_1 (GRU)                  (None, 256)               393984    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                8224      
Total params: 2,195,488
Trainable params: 2,195,488
Non-trainable params: 0
_________________________________________________________________


### Train!

In [15]:
hist = model.fit(X, y, epochs=max_epochs, validation_split=validation_split,
                 callbacks=[model_checkpoint, early_stopping],
                 class_weight=class_weight)

Train on 59992 samples, validate on 6666 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50


### Use model to remove "hints"

#### Start by reloading the data

In [33]:
%%time

# Load the source data frame

filepath = '../data/title-1-True.pkl'

df, keep_stops, min_sents, vocab, w2i, i2w = utils.read_dataset(filepath)

CPU times: user 1.6 s, sys: 32 ms, total: 1.63 s
Wall time: 1.62 s


In [34]:
%%time

# Filter out uncommon words using parameters parsed from model name.

df['encoded_text'] = utils.filter_top_words(df['encoded_text'].tolist(), topn)

CPU times: user 60 ms, sys: 4 ms, total: 64 ms
Wall time: 64.5 ms


In [35]:
# Create input/output data

X = pad_sequences(df['encoded_text'], maxlen=maxlen, value=topn, 
                  padding='post', truncating='post')

Y = df['encoded_domain'].tolist()

#### Identify domains with lots of "suspicious", ie. very high confidence, predictions

In [78]:
%%time

# Get output probabilities for every observation.

Ypred = model.predict(X)

CPU times: user 8.32 s, sys: 384 ms, total: 8.7 s
Wall time: 7.19 s


In [79]:
# Create a column with the highest prediction

df['predicted_prob'] = np.max(Ypred, axis=1)

In [91]:
# Create a column with the predicted domain

df['predicted_domain'] = model.predict_classes(X)



In [92]:
# Pull out high-probability samples

hi = pd.DataFrame(df[df['predicted_prob'] > 0.9])
hi.shape

(14074, 10)

In [93]:
# Look at domains that have the highest probabilities

conf_thresh = 0.10  # threshold for how many high-confidence samples a domain has

counts_hi = pd.DataFrame(hi['domain'].value_counts(), dtype=float) 
counts_all = pd.DataFrame(df['domain'].value_counts())

# Compute the ratio of hi-conf
for row in counts_hi.iterrows():
    counts_hi.ix[row[0]] /= counts_all.ix[row[0]]

# Sort by percent of hi-conf and truncate at threshold
counts_hi = counts_hi.sort_values('domain', ascending=False)
counts_hi = counts_hi[counts_hi['domain'] > conf_thresh]
counts_hi

Unnamed: 0,domain
ap.org,1.0
americanthinker.com,0.998652
realclearpolitics.com,0.997003
rightwingnews.com,0.990662
breitbart.com,0.591067
westernjournalism.com,0.521429
go.com,0.42158
washingtonpost.com,0.243424
cnn.com,0.20376


#### Load the checkpoint you want to use and split it to allow sequential predictions

In [87]:
%%time

model_name = 'models/256_1_256_256_0.5_False_20_6747_64_32_sigmoid_003_0.72958_0.40549_titles.h5'

# Get model params from name
gru_dim, num_gru, embed_dim, dense_dim, dropout, bidirectional, maxlen, topn, batch_size, output_size, gru_activation = \
  utils.parse_model_name(model_name)

# Load the keras model
model = load_model(model_name)

CPU times: user 1.48 s, sys: 820 ms, total: 2.3 s
Wall time: 1.44 s


In [88]:
# Split the model so you can predict token by token

in_model, out_model = rnn.split_model_layers(model, topn, embed_dim, dense_dim, gru_dim, num_gru, maxlen, 
  output_size, bidirectional, gru_activation)

#### Use the model to identify tokens used to "cheat" and create blacklist

In [105]:
def get_hival_tokens(domain, conf_thresh=0.9, token_thresh=0.9):
    ''' Returns a dictionary of tokens that occur almost always in a specific domain.
    These are considered hints that the model is using to solve the bias classification
    problem, and should be removed.
    
    I'm being a little sloppy here and pulling X df, *_model from global scope :P
    
    Args:
        domain_thresh: threshold above which a domain prediction is considered high-confidence
        token_thresh: threshold for considering a token to have a high confidence
    '''

    # Extract the specified domain
    hi = pd.DataFrame(df[df['domain'] == domain])

    # Only consider rows where the prediction was correct, i.e., cheating may have happened
    hi = pd.DataFrame(hi[hi['encoded_domain'] == hi['predicted_domain']])
    
    # Select only high-confidence samples
    hi = pd.DataFrame(hi[hi['predicted_prob'] > conf_thresh])
  
    # For each observation
    tokens = defaultdict(float)
    for row in hi.iterrows():
        idx = np.where(df.index == row[0])[0][0]
        P = sequential_pred_for_class(X, df, idx, in_model, out_model)
        T = df['tokenized'].iloc[idx].split()
        hival_idx = np.where(P > token_thresh)[0]
        for i in hival_idx:
            if i >= len(T):
                break
            tokens[T[i]] += 1
  
    # Normalize the token counts as probabilities
    for k in tokens:
        tokens[k] = 1. * tokens[k] / hi.shape[0]
  
    return hi, tokens

def create_blacklist(conf_thresh=0.2, domain_prob=0.9, token_thresh=0.9, count_thresh=0.2):
    '''This function returns a blacklist dictionary indicating which tokens are allowing
    the model to cheat for each domain.
    
    Args:
        conf_thresh: threshold for considering a prediction high-confidence
        domain_prob: threshold for considering a domain to have a high number of hi-conf predictions
        token_thresh: threshold for fraction of articles containing token
        count_thresh: threshold for number of times a token must 
    '''
    
    # Get rows corresponding to high predicted probability
    hi = pd.DataFrame(df[df['predicted_prob'] > conf_thresh])

    # Get fraction of high-confidence samples per domain
    counts_hi = pd.DataFrame(hi['domain'].value_counts(), dtype=float)
    counts_all = pd.DataFrame(df['domain'].value_counts())
    for row in counts_hi.iterrows():
        counts_hi.ix[row[0]] /= counts_all.ix[row[0]]

    # Sort the count values by domain
    counts_hi = counts_hi.sort_values('domain', ascending=False)
    counts_hi = counts_hi[counts_hi['domain'] > domain_prob]

    # For each domain, identify tokens that give the model high confidence
    blacklist = {}
    for domain in counts_hi.index:
        print(domain)
        hi, tokens = get_hival_tokens(domain, conf_thresh, token_thresh)
        counts = [(v, k) for k,v in tokens.iteritems()]
        blacklist[domain] = [x[1] for x in counts if x[0] >= count_thresh]
        print('  {}'.format(blacklist[domain]))

    return blacklist

In [107]:
blacklist = create_blacklist()

americanthinker.com
  [u'article_NOUN', u'the_DET', u'of_ADP']
ap.org
  [u'press_PROPN', u'associated_PROPN']
realclearpolitics.com
  [u'realclearpolitic_NOUN']
rightwingnews.com
  [u'news_PROPN', u'right_PROPN', u'hawkins_PROPN', u'wing_PROPN', u'john_PROPN']
cnn.com
  [u'cnn_PROPN', u'video_PROPN']
breitbart.com
  [u'breitbart_PROPN']
nytimes.com
  []
westernjournalism.com
  [u'-PRON-_PRON']
wsj.com
  []
theatlantic.com
  []
washingtonpost.com
  [u'-PRON-_PRON', u'opinion_NOUN', u'the_DET', u'a_DET', u'trump_PROPN', u'be_VERB']
usatoday.com
  []
newsmax.com
  []
go.com
  [u'in_ADP', u'trump_PROPN']
cnbc.com
  [u'be_VERB']
weeklystandard.com
  []


In [108]:
blacklist

{u'americanthinker.com': [u'article_NOUN', u'the_DET', u'of_ADP'],
 u'ap.org': [u'press_PROPN', u'associated_PROPN'],
 u'breitbart.com': [u'breitbart_PROPN'],
 u'cnbc.com': [u'be_VERB'],
 u'cnn.com': [u'cnn_PROPN', u'video_PROPN'],
 u'go.com': [u'in_ADP', u'trump_PROPN'],
 u'newsmax.com': [],
 u'nytimes.com': [],
 u'realclearpolitics.com': [u'realclearpolitic_NOUN'],
 u'rightwingnews.com': [u'news_PROPN',
  u'right_PROPN',
  u'hawkins_PROPN',
  u'wing_PROPN',
  u'john_PROPN'],
 u'theatlantic.com': [],
 u'usatoday.com': [],
 u'washingtonpost.com': [u'-PRON-_PRON',
  u'opinion_NOUN',
  u'the_DET',
  u'a_DET',
  u'trump_PROPN',
  u'be_VERB'],
 u'weeklystandard.com': [],
 u'westernjournalism.com': [u'-PRON-_PRON'],
 u'wsj.com': []}

In [115]:
df.head()

Unnamed: 0,title,label,url,domain,tokenized,encoded_text,encoded_domain,encoded_label,predicted_prob,predicted_domain
0,CNN/ORC Poll: Most Americans Want Washington C...,conservative,http://www.newsmax.com/Politics/Americans-comp...,newsmax.com,poll_NOUN most_ADJ americans_PROPN want_VERB w...,"[458, 672, 198, 69, 130]",15,0,0.35401,6
1,Marines' Nude Photo Scandal Goes Beyond That O...,liberal,http://www.huffingtonpost.com/entry/marines-nu...,huffingtonpost.com,marines_PROPN nude_PROPN photo_PROPN scandal_N...,"[1738, 1936, 415, 56, 2032, 116, 87, 507, 791,...",11,1,0.165722,6
2,"Man Survives 1,500-Foot Fall off Mountain - Br...",conservative,http://www.breitbart.com/big-government/2017/0...,breitbart.com,man_NOUN survive_VERB fall_PROPN off_ADP mount...,"[51, 1218, 2506, 374, 5544, 29]",2,0,0.999998,2
3,GOP health-care bill would drop addiction trea...,liberal,https://www.washingtonpost.com/news/wonk/wp/20...,washingtonpost.com,gop_PROPN health_NOUN care_NOUN bill_NOUN woul...,"[47, 72, 89, 82, 131, 305, 3865, 2176, 4881, 1...",27,1,0.221614,6
4,Mansfield Timberview tops Corpus Christi Memor...,conservative,http://www.washingtontimes.com/news/2017/mar/9...,washingtontimes.com,mansfield_PROPN timberview_PROPN top_VERB corp...,"[2808, 5153, 4995, 4183, 2]",28,0,0.426671,4


#### Filter out cheat words and re-encode text

In [207]:
#dg = df.copy()
df = dg.copy()

In [210]:
for domain in blacklist.keys():
    domain_index = df[df['domain'] == domain].index
    data = df.ix[domain_index, 'tokenized'].tolist()
    for ix, d in enumerate(data):
        data[ix] = ' '.join([x for x in d.split() if x not in blacklist[domain]])
    df.loc[domain_index, 'tokenized'] = data

In [212]:
df['encoded_text'] = df['tokenized'].map(lambda x: [w2i[y] for y in x.split()])

#### Write the data

In [218]:
%%time

OUTPUT_FILE = '../data/title-{}-{}-clean.pkl'.format(min_sents, keep_stops)

_ = utils.write_dataset(OUTPUT_FILE, df, keep_stops, min_sents, vocab, w2i, i2w)

wrote to ../data/title-1-True-clean.pkl
CPU times: user 1.69 s, sys: 16 ms, total: 1.71 s
Wall time: 1.7 s
