In [1]:
import os
import sqlite3

import lime.lime_text
import numpy
import pandas

import input_vector_util
import harness_util

Using TensorFlow backend.


In [2]:
# Toggle for the persistence cell further down: when True, the assembled
# predictions frame is written to a SQLite table; when False that step is skipped.
PERSIST_PREDICTIONS = False

# Train

In [3]:
# Factory whose build(config, db_loc=...) call below produces the harness that is run.
harness_factory = harness_util.TemplateHarnessFactory()

In [4]:
# Settings handed to both harness_factory.build(...) and harness.run(...).
# NOTE(review): the per-key meanings below are inferred from the key names and
# from how the matching columns are used later in this notebook — confirm
# against harness_util before relying on them.
config = {
    # Column names (target_frame later exposes 'description', 'source',
    # 'sourceId', 'sourceIdVector', 'tokens' and 'tokenVector').
    "corpusCol": "description",
    "sourceCol": "source",
    "sourceIdCol": "sourceId",
    "sourceIdVectorCol": "sourceIdVector",
    "tokensCol": "tokens",
    "tokenVectorCol": "tokenVector",
    # Vectorization settings.
    "method": "occurrence",
    "numWords": 10000,
    # Model hyper-parameters.
    "denseSize1": 32,
    "denseSize2": 16,
    "dropoutRate": 0,
    "kernelRegPenalty": 0.01,
}

In [5]:
# Build the harness from the config above; db_loc is the relative path
# data/articles_publish.db (presumably the SQLite article database — confirm).
harness = harness_factory.build(config, db_loc=os.path.join('data', 'articles_publish.db'))

In [6]:
# Run the harness with the same config. The returned object is queried below for
# the data frame, model, source ids, tokenizer and data loader.
# NOTE(review): the two string arguments look like a W&B project name and run
# name (the logged run URL contains 'who-wrote-this') — confirm in harness_util.
results = harness.run('who-wrote-this', 'accept-descr-occ-manual', config)

Instructions for updating:
Colocations handled automatically by placer.
W&B Run: https://app.wandb.ai/sampottinger/who-wrote-this/runs/pk2aypxh
Call `%%wandb` in the cell containing your training loop to display live results.


wandb: Wandb version 0.8.1 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


Instructions for updating:
Use tf.cast instead.
Train on 32653 samples, validate on 4008 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


# Generate predictions

In [7]:
# Pull the article data frame and the model out of the harness results.
target_frame = results.get_data_frame()
model = results.get_model()

In [8]:
# Stack every row's 'tokenVector' into one array and score it in a single call.
# The result is indexed below as predictions[:, source_index], i.e. one column per source.
predictions = model.predict(numpy.array(target_frame['tokenVector'].tolist()))

In [9]:
# Mapping used below as {source name: column index into `predictions`}.
# NOTE(review): this reads the name-mangled private attribute `__mapping` of
# NumericalSourceIdSet, which will break silently if that class is refactored;
# prefer a public accessor if one exists.
source_mapping = results.get_source_ids()._NumericalSourceIdSet__mapping

In [10]:
# Reverse lookup: prediction column index -> source name.
source_mapping_invert = {index: name for name, index in source_mapping.items()}

# Add one '<source>_prediction' column per source, holding that source's score
# for every row of target_frame.
for source, source_index in source_mapping.items():
    target_frame[source + '_prediction'] = predictions[:, source_index]

In [11]:
# Name of the highest-scoring source for each row.
target_frame['prediction'] = [
    source_mapping_invert[best_index]
    for best_index in numpy.argmax(predictions, axis=1)
]

In [12]:
# Start from an empty frame; its columns are filled in from target_frame in a later cell.
output_frame = pandas.DataFrame()

In [13]:
# Show the columns now on target_frame, including the '<source>_prediction'
# and 'prediction' columns added above.
target_frame.keys()

Index(['source', 'title', 'description', 'set_assignment', 'sourceId',
       'sourceIdVector', 'tokens', 'tokenVector', 'Breitbart_prediction',
       'CNN_prediction', 'Drudge Report_prediction', 'BBC_prediction',
       'Daily Mail_prediction', 'NPR_prediction', 'New York Times_prediction',
       'Vox_prediction', 'Fox_prediction', 'Wall Street Journal_prediction',
       'prediction'],
      dtype='object')

In [14]:
# (output column, target_frame column) pairs, in the order the output columns
# are created.
output_columns = (
    ('title', 'title'),
    ('description', 'description'),
    ('actualSource', 'source'),
    ('setAssignment', 'set_assignment'),
    ('cnnScore', 'CNN_prediction'),
    ('foxScore', 'Fox_prediction'),
    ('dailyMailScore', 'Daily Mail_prediction'),
    ('drudgeReportScore', 'Drudge Report_prediction'),
    ('newYorkTimesScore', 'New York Times_prediction'),
    ('bbcScore', 'BBC_prediction'),
    ('breitbartScore', 'Breitbart_prediction'),
    ('wallStreetJournalScore', 'Wall Street Journal_prediction'),
    ('voxScore', 'Vox_prediction'),
    ('nprScore', 'NPR_prediction'),
    ('prediction', 'prediction'),
)

# Copy each selected column into output_frame under its export-friendly name.
for output_column, target_column in output_columns:
    output_frame[output_column] = target_frame[target_column]

In [15]:
# Optionally write the predictions to the 'predictions' table of ./articles.db.
# NOTE(review): this is a different file from data/articles_publish.db used when
# building the harness — confirm that is intended.
if PERSIST_PREDICTIONS:
    conn = sqlite3.connect('./articles.db')
    try:
        # With pandas' default if_exists='fail' this raises if the table already
        # exists, so a second run needs the table dropped first.
        output_frame.to_sql('predictions', conn)
        conn.commit()
    finally:
        # Always release the database handle, even if the write fails; the
        # original left the connection open for the life of the kernel.
        conn.close()

# Look at Word Importance

In [16]:
# Source names ordered by their prediction column index, so that
# class_names[i] labels column i of the model output.
class_names = [
    source_mapping_invert[class_index]
    for class_index in sorted(source_mapping_invert)
]

In [17]:
# (word, index) pairs from the inner tokenizer's vocabulary.
word_index = results.get_tokenizer().get_inner_tokenizer().word_index.items()
# Flip each pair so the dictionary maps index -> word.
feature_names_dict = {index: word for word, index in word_index}

In [18]:
# Largest token index present in the vocabulary.
max_count = max(feature_names_dict.keys())

# feature_names[i] is the word with token index i, or '' when no word has that
# index. The list needs max_count + 1 slots: indices run from 0 through
# max_count inclusive, and sizing it to max_count (as before) silently dropped
# the word with the largest index.
feature_names = [feature_names_dict.get(i, '') for i in range(max_count + 1)]

In [19]:
# Same stacked 'tokenVector' array that was passed to model.predict earlier.
# NOTE(review): input_vals is not referenced in the visible remainder of the
# notebook — possibly leftover from an earlier experiment.
input_vals = numpy.array(target_frame['tokenVector'].tolist())

In [20]:
# LIME text explainer; class_names (source names ordered by class index) is
# passed so explanations can be labelled by source.
explainer = lime.lime_text.LimeTextExplainer(
    class_names=class_names
)

In [21]:
# Module-level objects read by the helper functions defined in this cell.
tokenizer = results.get_tokenizer().get_inner_tokenizer()
# NOTE(review): presumably the vectorizer that corresponds to
# config['method'] == 'occurrence' — confirm in input_vector_util.
vectorizer = input_vector_util.OccurenceInputVectorizer()
data_loader = results.get_data_loader()

def predict_from_input_text(input_text, agency_name):
    """Score a collection of raw text strings with the model.

    Each text is cleaned with data_loader.clean_input_text, converted to a token
    sequence by the tokenizer, and vectorized with the vocabulary size from
    config['numWords'] (previously a hard-coded 10000 that had to be kept in
    sync with the config by hand). All vectors are then scored with a single
    model.predict call, as is done for the full data set earlier in the
    notebook, instead of one call per text.

    Args:
        input_text: Iterable of raw text strings. This function is used as the
            classifier callback for LIME, which is expected to pass a list of
            perturbed variants of one description.
        agency_name: Source name forwarded to data_loader.clean_input_text.

    Returns:
        numpy array with one row of model outputs per input text (an empty
        array when no texts are given).
    """
    cleaned_texts = [
        data_loader.clean_input_text(text, agency_name) for text in input_text
    ]
    if not cleaned_texts:
        # Nothing to score; avoid calling the model on an empty batch.
        return numpy.array([])

    sequences = tokenizer.texts_to_sequences(cleaned_texts)
    num_words = config['numWords']
    vectors = numpy.array(
        [vectorizer.prepare(num_words, sequence) for sequence in sequences]
    )
    return model.predict(vectors)

def get_explanation(input_description, agency_name, agency_id):
    """Explain the model's score for one source on one description.

    Args:
        input_description: Text to explain.
        agency_name: Source name, forwarded to predict_from_input_text.
        agency_id: Class index of that source; the only label explained.

    Returns:
        The explanation for agency_id as returned by as_list(label=agency_id).
    """
    def classify(texts):
        # Bind agency_name so the explainer can call this with texts alone.
        return predict_from_input_text(texts, agency_name)

    explanation = explainer.explain_instance(
        input_description,
        classify,
        labels=[agency_id]
    )
    return explanation.as_list(label=agency_id)

def get_explanation_for_description(title, agency_name):
    """Explain the description of the article with the given title.

    The title is cleaned with data_loader.clean_input_text and compared, after
    stripping surrounding whitespace on both sides, against the 'title' column
    of target_frame. The description of the first matching row is explained
    for the class of agency_name.

    Args:
        title: Article title to look up.
        agency_name: Source name; must be a key of source_mapping.

    Returns:
        The explanation from get_explanation for the matched description.

    Raises:
        IndexError: If no row of target_frame has a matching title. The same
            exception type was raised before, but from `.values[0]` with a
            message that did not say which title failed.
        KeyError: If agency_name is not in source_mapping.
    """
    cleaned_title = data_loader.clean_input_text(title, agency_name).strip()
    title_matches = target_frame['title'].apply(lambda x: x.strip()) == cleaned_title
    descriptions = target_frame[title_matches]['description'].values
    if len(descriptions) == 0:
        raise IndexError(
            'No article with title {!r} found for source {!r}'.format(
                cleaned_title, agency_name
            )
        )

    agency_id = source_mapping[agency_name]
    return get_explanation(descriptions[0], agency_name, agency_id)

In [22]:
# Spot check: explain the model's 'Fox' score for one article, looked up by its title.
print(get_explanation_for_description('The Latest: 2 children killed after tree falls on car', 'Fox'))
                                 

[('authorities', 0.16115823139228405), ('say', 0.14557758698128415), ('texas', 0.09899131851340202), ('m', 0.0916018362601917), ('local', 0.0803815942926757), ('car', 0.07240246057977347), ('killed', 0.06725801839135034), ('two', 0.056468456092729796), ('all', 0.05437457508050377), ('states', -0.03510780725131933)]


In [23]:
# Show the source names in class-index order.
class_names

['Breitbart',
 'CNN',
 'Drudge Report',
 'BBC',
 'Daily Mail',
 'NPR',
 'New York Times',
 'Vox',
 'Fox',
 'Wall Street Journal']

In [24]:
# (article title, source name) pairs to explain, one per source.
# Each title is looked up in target_frame['title'] after cleaning and stripping,
# so it must match a stored title exactly; an unmatched title raises an error.
articles_to_consider = (
    ('chelsea 0-2 man utd: ander herrera and paul pogba give visitors victory', 'BBC'),
    ('mark levin: \'hate-america democrats passed a resolution telling you that america sucks\'', 'Breitbart'),
    ('white house attacks cohen before testimony about trump', 'CNN'),
    ('petition to unseat fiona onasanya opened to peterborough voters after disgraced mp refuses to quit', 'Daily Mail'),
    ('congress closes in on border deal... developing...', 'Drudge Report'),
    (': police list actions of dad held in son\'s death', 'Fox'),
    ('the united states\' strategy for venezuela', 'NPR'),
    ('jared kushner and ivanka trump use private accounts for official business, their lawyer says', 'New York Times'),
    ('andrew yang, the 2020 long-shot candidate running on a universal basic income, explained', 'Vox'),
    ('how bad is the china slowdown? u.s. companies offer some answers', 'Wall Street Journal')
)

In [25]:
def get_explanation_for_pairing(pairing, verbose=True):
    """Explain the article identified by a (title, source name) pairing.

    Args:
        pairing: Sequence of strings whose first element is the article title
            and whose second element is the source name.
        verbose: When true, print a progress line naming the pairing first.

    Returns:
        Whatever get_explanation_for_description returns for that pairing.
    """
    if verbose:
        progress_line = 'Running ' + ', '.join(pairing)
        print(progress_line)

    title = pairing[0]
    agency_name = pairing[1]
    return get_explanation_for_description(title, agency_name)

# One explanation (a list of (word, weight) pairs) per entry of
# articles_to_consider, in the same order.
# NOTE(review): this rebinds `results`, which until this point held the harness
# run object used by earlier cells (get_data_frame, get_model, get_tokenizer,
# ...). Re-running any of those cells after this one fails; a separate name
# would avoid that, but later cells read `results` as this list.
results = [get_explanation_for_pairing(pairing) for pairing in articles_to_consider]

Running chelsea 0-2 man utd: ander herrera and paul pogba give visitors victory, BBC
Running mark levin: 'hate-america democrats passed a resolution telling you that america sucks', Breitbart
Running white house attacks cohen before testimony about trump, CNN
Running petition to unseat fiona onasanya opened to peterborough voters after disgraced mp refuses to quit, Daily Mail
Running congress closes in on border deal... developing..., Drudge Report
Running : police list actions of dad held in son's death, Fox
Running the united states' strategy for venezuela, NPR
Running jared kushner and ivanka trump use private accounts for official business, their lawyer says, New York Times
Running andrew yang, the 2020 long-shot candidate running on a universal basic income, explained, Vox
Running how bad is the china slowdown? u.s. companies offer some answers, Wall Street Journal


In [26]:
# Convert every (word, score) pair of every explanation into a
# {'word': ..., 'score': ...} record, keeping the per-article grouping.
top_amounts = [
    [{'word': pair[0], 'score': pair[1]} for pair in explanation]
    for explanation in results
]

In [27]:
# Flatten the per-article lists into one list of records; which article each
# record came from is not retained.
top_amounts_flat = [score for scores_sub in top_amounts for score in scores_sub]

In [28]:
# One row per record, with 'word' and 'score' columns.
top_amounts_frame = pandas.DataFrame(top_amounts_flat)

In [29]:
# Write the records to top_amounts.csv in the current working directory,
# overwriting any existing file. With default arguments the row index is
# written as an extra first column.
top_amounts_frame.to_csv('./top_amounts.csv')

In [31]:
# Display the raw explanations: one list of (word, weight) pairs per article.
# At this point `results` is the list built from articles_to_consider, not the harness run object.
results

[[('the', 0.21360788792455387),
  ('his', 0.15145320645199525),
  ('says', 0.13193405162724636),
  ('as', 0.0762133736227907),
  ('cup', 0.07425146768943328),
  ('to', 0.07258654650825991),
  ('manchester', 0.06393324035637822),
  ('finals', 0.05591211360443453),
  ('played', 0.0313020143761076),
  ('reach', 0.0218894731614337)],
 [('democrat', 0.008342656823702502),
  ('d', 0.008014944954810836),
  ('conservative', 0.007767560048520718),
  ('i', 0.007455549708196098),
  ('god', 0.007442760283685246),
  ('ocasio', 0.007325452335076072),
  ('thursday', 0.0072724503411199645),
  ('united', 0.007059380641807633),
  ('including', 0.006722234658894481),
  ('however', 0.006107696348716699)],
 [('s', 0.4024806557674963),
  ('donald', 0.10081866838389537),
  ('reports', 0.08602222648523632),
  ('is', 0.061846991543617925),
  ('watch', 0.06071939860475143),
  ('president', 0.0548132121590327),
  ('trump', 0.04960206740671895),
  ('hearing', 0.04766865629498838),
  ('to', -0.03925459007861488),
