In [1]:
import os
import sqlite3

import lime.lime_text
import numpy
import pandas

import input_vector_util
import harness_util

Using TensorFlow backend.


In [2]:
PERSIST_PREDICTIONS = True

# Train

In [3]:
harness_factory = harness_util.TemplateHarnessFactory()

In [4]:
config = {
                "corpusCol": "description",
                "denseSize1": 32,
                "denseSize2": 16,
                "dropoutRate": 0,
                "kernelRegPenalty": 0.01,
                "method": "occurrence",
                "numWords": 10000,
                "sourceCol": "source",
                "sourceIdCol": "sourceId",
                "sourceIdVectorCol": "sourceIdVector",
                "tokenVectorCol": "tokenVector",
                "tokensCol": "tokens"
            }

In [5]:
harness = harness_factory.build(config, db_loc=os.path.join('data', 'articles_publish.db'))

In [6]:
results = harness.run('who-wrote-this', 'accept-descr-occ-manual', config)

Instructions for updating:
Colocations handled automatically by placer.
W&B Run: https://app.wandb.ai/sampottinger/who-wrote-this/runs/tq94co0v
Call `%%wandb` in the cell containing your training loop to display live results.


wandb: Wandb version 0.8.1 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


Instructions for updating:
Use tf.cast instead.
Train on 40281 samples, validate on 5002 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


# Generate predictions

In [10]:
target_frame = results.get_data_frame()
model = results.get_model()

In [11]:
predictions = model.predict(numpy.array(target_frame['tokenVector'].tolist()))

In [12]:
source_mapping = results.get_source_ids()._NumericalSourceIdSet__mapping

In [13]:
source_mapping_invert = {}
for source in source_mapping:
    source_index = source_mapping[source]
    target_frame[source + '_prediction'] = predictions[:,source_index]
    source_mapping_invert[source_index] = source

In [14]:
target_frame['prediction'] = list(map(lambda x: source_mapping_invert[x], numpy.argmax(predictions, axis=1)))

In [15]:
output_frame = pandas.DataFrame()

In [16]:
target_frame.keys()

Index(['source', 'title', 'description', 'set_assignment', 'sourceId',
       'sourceIdVector', 'tokens', 'tokenVector', 'New York Times_prediction',
       'Drudge Report_prediction', 'Fox_prediction', 'CNN_prediction',
       'BBC_prediction', 'Daily Mail_prediction', 'NPR_prediction',
       'Breitbart_prediction', 'Vox_prediction',
       'Wall Street Journal_prediction', 'prediction'],
      dtype='object')

In [17]:
output_frame['title'] = target_frame['title']
output_frame['description'] = target_frame['description']
output_frame['actualSource'] = target_frame['source']
output_frame['setAssignment'] = target_frame['set_assignment']
output_frame['cnnScore'] = target_frame['CNN_prediction']
output_frame['foxScore'] = target_frame['Fox_prediction']
output_frame['dailyMailScore'] = target_frame['Daily Mail_prediction']
output_frame['drudgeReportScore'] = target_frame['Drudge Report_prediction']
output_frame['newYorkTimesScore'] = target_frame['New York Times_prediction']
output_frame['bbcScore'] = target_frame['BBC_prediction']
output_frame['breitbartScore'] = target_frame['Breitbart_prediction']
output_frame['wallStreetJournalScore'] = target_frame['Wall Street Journal_prediction']
output_frame['voxScore'] = target_frame['Vox_prediction']
output_frame['nprScore'] = target_frame['NPR_prediction']
output_frame['prediction'] = target_frame['prediction']

In [18]:
if PERSIST_PREDICTIONS:
    conn = sqlite3.connect('./articles.db')
    output_frame.to_sql('predictions', conn)
    conn.commit()

# Look at Word Importance

In [19]:
class_names = list(map(lambda x: source_mapping_invert[x], sorted(source_mapping_invert.keys())))

In [20]:
word_index = results.get_tokenizer().get_inner_tokenizer().word_index.items()
feature_names_dict = dict(map(reversed, word_index))

In [21]:
max_count =  max(feature_names_dict.keys())
feature_names = [''] * max_count
for i in range(0, max_count):
    feature_names[i] = feature_names_dict.get(i, '')

In [22]:
input_vals = numpy.array(target_frame['tokenVector'].tolist())

In [23]:
explainer = lime.lime_text.LimeTextExplainer(
    class_names=class_names
)

In [24]:
tokenizer = results.get_tokenizer().get_inner_tokenizer()
vectorizer = input_vector_util.OccurenceInputVectorizer()
data_loader = results.get_data_loader()

def predict_from_input_text(input_text, agency_name):
    input_text = map(lambda x: data_loader.clean_input_text(x, agency_name), input_text)
    tokens = tokenizer.texts_to_sequences(input_text)
    vectors = map(lambda x: vectorizer.prepare(10000, x), tokens)
    predict_closure = lambda x: model.predict(numpy.array([x,]))[0]
    probs = numpy.array(list(map(predict_closure, vectors)))
    return probs

def get_explanation(input_description, agency_name, agency_id):
    exp = explainer.explain_instance(
        input_description,
        lambda x: predict_from_input_text(x, agency_name),
        labels=[agency_id]
    )
    return exp.as_list(label=agency_id)

def get_explanation_for_description(title, agency_name):
    vector = target_frame[
        target_frame['title'].apply(lambda x: x.strip()) == data_loader.clean_input_text(title, agency_name).strip()
    ]['description'].values[0]
    agency_id = source_mapping[agency_name]
    return get_explanation(vector, agency_name, agency_id)

In [25]:
print(get_explanation_for_description('The Latest: 2 children killed after tree falls on car', 'Fox'))
                                 

[('say', 0.13706295809830146), ('m', 0.13596317084180362), ('local', 0.12900234840320726), ('authorities', 0.1278754147337805), ('texas', 0.09199395935073976), ('times', 0.08200750480224388), ('being', 0.073142184067778), ('a', 0.059606062019950626), ('southern', 0.04438793729975369), ('killed', 0.037774898900539755)]


In [26]:
class_names

['New York Times',
 'Drudge Report',
 'Fox',
 'CNN',
 'BBC',
 'Daily Mail',
 'NPR',
 'Breitbart',
 'Vox',
 'Wall Street Journal']

In [28]:
articles_to_consider = (
    ('jones to bring in expert in bid to fix england\'s mental weakness under pressure', 'BBC'),
    ('mark levin: \'hate-america democrats passed a resolution telling you that america sucks\'', 'Breitbart'),
    ('the move shows a potential growing threat to the president and those in his orbit from probes by the manhattan us attorney\'s office', 'CNN'),
    ('children behind half of london knife crime as machete is sold for just £19', 'Daily Mail'),
    ('pelosi warns dems: stay in center; trump may contest election results...', 'Drudge Report'),
    (': vegas police: wounded robbery suspect has died', 'Fox'),
    ('retired military officers urge caution in proposed diplomatic spending cuts', 'NPR'),
    ('french raise a glass to a health warning about too much wine', 'New York Times'),
    ('google employees walked out for the right to sue their bosses. now they’re taking the fight to congress.', 'Vox'),
    ('how bad is the china slowdown? u.s. companies offer some answers', 'Wall Street Journal')
)

In [30]:
def get_explanation_for_pairing(pairing, verbose=True):
    if verbose:
        print('Running ' + ', '.join(pairing))
    return get_explanation_for_description(pairing[0], pairing[1])

results = list(map(lambda x: get_explanation_for_pairing(x), articles_to_consider))

Running jones to bring in expert in bid to fix england's mental weakness under pressure, BBC
Running mark levin: 'hate-america democrats passed a resolution telling you that america sucks', Breitbart
Running the move shows a potential growing threat to the president and those in his orbit from probes by the manhattan us attorney's office, CNN
Running children behind half of london knife crime as machete is sold for just £19, Daily Mail
Running pelosi warns dems: stay in center; trump may contest election results..., Drudge Report
Running : vegas police: wounded robbery suspect has died, Fox
Running retired military officers urge caution in proposed diplomatic spending cuts, NPR
Running google employees walked out for the right to sue their bosses. now they’re taking the fight to congress., Vox
Running how bad is the china slowdown? u.s. companies offer some answers, Wall Street Journal


In [31]:
top_amounts = list(map(lambda x: list(map(lambda y: {'word': y[0], 'score': y[1]}, x)), results))

In [32]:
top_amounts_flat = [score for scores_sub in top_amounts for score in scores_sub]

In [33]:
top_amounts_frame = pandas.DataFrame(top_amounts_flat)

In [34]:
top_amounts_frame.to_csv('./top_amounts.csv')

In [37]:
articles_to_consider = (
    ('the papers: climate protests and trump probe', 'BBC'),
    ('warren: climate change, gun violence, student loan debt constitute for national emergency declaration', 'Breitbart'),
    ('john avlon speaks the cold truth about climate change', 'CNN'),
    ('dramatic moment police drag two climate change protesters along the street', 'Daily Mail'),
    ('climate-first...', 'Drudge Report'),
    ('trump pokes fun at klobuchar\'s climate-change stance as she announces candidacy in snow', 'Fox'),
    ('the role climate change plays in weather extremes', 'NPR'),
    ('nonfiction: striking a balance between fear and hope on climate change', 'New York Times'),
    ('amazon says it’s a leader on fighting climate change. 5,000 employees disagree.', 'Vox'),
    ('glencore, the king of coal, bows to investor pressure over climate', 'Wall Street Journal')
)

In [41]:
results = list(map(lambda x: get_explanation_for_pairing(x), articles_to_consider))

Running the papers: climate protests and trump probe, BBC
Running warren: climate change, gun violence, student loan debt constitute for national emergency declaration, Breitbart
Running john avlon speaks the cold truth about climate change, CNN
Running dramatic moment police drag two climate change protesters along the street, Daily Mail
Running climate-first..., Drudge Report
Running trump pokes fun at klobuchar's climate-change stance as she announces candidacy in snow, Fox
Running the role climate change plays in weather extremes, NPR
Running nonfiction: striking a balance between fear and hope on climate change, New York Times
Running amazon says it’s a leader on fighting climate change. 5,000 employees disagree., Vox
Running glencore, the king of coal, bows to investor pressure over climate, Wall Street Journal


In [42]:
top_amounts = list(map(lambda x: list(map(lambda y: {'word': y[0], 'score': y[1]}, x)), results))

In [43]:
top_amounts_flat = [score for scores_sub in top_amounts for score in scores_sub]

In [44]:
top_amounts_frame = pandas.DataFrame(top_amounts_flat)

In [45]:
top_amounts_frame.to_csv('./top_amounts_climate.csv')

In [48]:
results

[[('the', 0.3265776211505303),
  ('us', 0.24944025371607373),
  ('and', 0.19270066579393708),
  ('react', 0.10901990652313094),
  ('say', 0.09117311466037219),
  ('papers', 0.07795451441078037),
  ('to', 0.07557353934402937),
  ('on', 0.04416164457409756),
  ('activists', -0.03988923000636548),
  ('target', -0.036111089308746444)],
 [('d', 0.03315604140077073),
  ('host', 0.026210406337261752),
  ('if', 0.02216439110771031),
  ('tuesday', 0.018306737917309916),
  ('gun', 0.018233226780838344),
  ('which', 0.01693026186902366),
  ('with', -0.01647727892667663),
  ('we', 0.014800896037960792),
  ('twitter', -0.011222291530080829),
  ('president', -0.007855393371712921)],
 [('s', 0.571472792866239),
  ('donald', 0.2578055049386025),
  ('john', 0.09360219347686867),
  ('as', -0.0563349288738391),
  ('to', -0.045675853260542125),
  ('president', 0.043325460597711654),
  ('continues', 0.03800389366173404),
  ('explains', 0.03064153578253951),
  ('between', 0.029809118188025136),
  ('two', 0.