In [None]:
import sqlite3

import numpy
import pandas

import harness_util

# Train

In [None]:
# Create the factory that the later cell uses to build a harness from `config`.
harness_factory = harness_util.TemplateHarnessFactory()

In [None]:
# Settings passed to both `harness_factory.build` and `harness.run` below.
# What each key means is defined by harness_util, which is not visible here.
config = {
    "corpusCol": "title",
    "lstmSize": 32,
    "dropoutRate": 0,
    "kernelRegPenalty": 0.01,
    "method": "sequence",
    "numWords": 3000,
    "sourceCol": "source",
    "sourceIdCol": "sourceId",
    "sourceIdVectorCol": "sourceIdVector",
    "tokenVectorCol": "tokenVector",
    "tokensCol": "tokens",
    "maxSeqLen": 50,
}

In [None]:
# Build a harness from the settings in `config`.
harness = harness_factory.build(config)

In [None]:
# Run the harness with the same `config`. The two string arguments are
# interpreted by harness_util (presumably project / run identifiers — TODO confirm).
# The returned object is queried below for the data frame, model, source ids
# and tokenizer.
results = harness.run('who-wrote-this', 'accept-descr-lstm-manual', config)

# Generate predictions

In [None]:
# Pull the data frame and the model out of the run results.
target_frame = results.get_data_frame()
model = results.get_model()

In [None]:
# Collect the per-row 'tokenVector' values into one array, then score it.
token_vectors = numpy.array(target_frame['tokenVector'].tolist())
predictions = model.predict(token_vectors)

In [None]:
# NOTE(review): this reads the name-mangled private attribute `__mapping` of
# NumericalSourceIdSet, so it breaks if that class renames the attribute.
# Prefer a public accessor if harness_util provides one — TODO confirm.
# The mapping is used below as source name -> column index into `predictions`.
source_mapping = results.get_source_ids()._NumericalSourceIdSet__mapping

In [None]:
# Add one '<source>_prediction' column per source, taken from that source's
# column of `predictions`.
for source, source_index in source_mapping.items():
    target_frame[source + '_prediction'] = predictions[:, source_index]

# Reverse lookup: column index -> source name.
source_mapping_invert = {index: name for name, index in source_mapping.items()}

In [None]:
# Label each row with the source whose prediction column holds the largest value.
best_indices = numpy.argmax(predictions, axis=1)
target_frame['prediction'] = [source_mapping_invert[index] for index in best_indices]

In [None]:
# Start from an empty frame; a later cell fills in its columns.
output_frame = pandas.DataFrame()

In [None]:
# Display the column labels currently available on target_frame.
target_frame.keys()

In [None]:
# (output_frame column, target_frame column it is copied from).
# The order here is the column order of output_frame.
output_columns = (
    ('title', 'title'),
    ('description', 'description'),
    ('actualSource', 'source'),
    ('setAssignment', 'set_assignment'),
    ('cnnScore', 'CNN_prediction'),
    ('foxScore', 'Fox_prediction'),
    ('dailyMailScore', 'Daily Mail_prediction'),
    ('drudgeReportScore', 'Drudge Report_prediction'),
    ('newYorkTimesScore', 'New York Times_prediction'),
    ('bbcScore', 'BBC_prediction'),
    ('breitbartScore', 'Breitbart_prediction'),
    ('wallStreetJournalScore', 'Wall Street Journal_prediction'),
    ('voxScore', 'Vox_prediction'),
    ('nprScore', 'NPR_prediction'),
    ('prediction', 'prediction'),
)

# Copy column by column; a missing source column raises KeyError.
for output_name, input_name in output_columns:
    output_frame[output_name] = target_frame[input_name]

In [None]:
# Persist the predictions to the 'predictions' table of ./articles.db.
# With pandas defaults, to_sql also writes the frame's index as a column and
# raises ValueError if the table already exists.
conn = sqlite3.connect('./articles.db')
try:
    output_frame.to_sql('predictions', conn)
    conn.commit()
finally:
    # Always release the database handle, even when the write fails;
    # previously the connection stayed open for the life of the kernel.
    conn.close()

# Look at Words

In [None]:
# Length of each row's 'tokens' value.
target_frame['numTokens'] = target_frame['tokens'].apply(len)

In [None]:
# word_counts[source][token] -> occurrences of token in rows from that source.
# word_totals[token]         -> occurrences of token across all rows.
word_counts = {}
word_totals = {}

for source, tokens in zip(target_frame['source'], target_frame['tokens']):
    # Register the source even when this row carries no tokens.
    source_counts = word_counts.setdefault(source, {})

    for token in tokens:
        source_counts[token] = source_counts.get(token, 0) + 1
        word_totals[token] = word_totals.get(token, 0) + 1

In [None]:
# Convert each per-source count into the share of that token's total
# occurrences contributed by the source. word_counts is rewritten in place,
# so running this cell twice divides twice.
for source, source_counts in word_counts.items():
    for token in source_counts:
        source_counts[token] /= word_totals[token]

In [None]:
# For every source, record the token with the largest value in word_counts.
max_tokens = {}

for source, source_shares in word_counts.items():
    # max() keeps the first of several equal maxima, which matches a scan
    # that only replaces the current best on a strictly greater value.
    top_token, top_share = max(
        source_shares.items(),
        key=lambda item: item[1],
        default=(-1, 0),
    )

    # Nothing strictly above zero: fall back to the -1 / 0 placeholder.
    if not top_share > 0:
        top_token, top_share = -1, 0

    max_tokens[source] = {'tokenId': top_token, 'percent': top_share}

In [None]:
# Display the per-source top token computed in the previous cell.
max_tokens

In [None]:
# Fetch the tokenizer from the run results (presumably to translate the
# tokens above back into words — TODO confirm).
tokenizer = results.get_tokenizer()