In [1]:
import codecs
import json
from xml.sax.saxutils import escape as xml_escape

import gensim
from gensim import corpora, models, similarities
import nltk

In [2]:
from collections import OrderedDict
from semanticparser import *
from tools import *

In [3]:
# Corpus identifier: selects the input file data/<username>.txt and is the
# file-name stem of every artifact this notebook writes under data/.
username = 'hillaryclinton'

In [17]:
# Read the whole corpus for this user, flatten newlines to spaces and decode
# the bytes to unicode (Python 2 style: open() yields a byte string here).
with open('data/%s.txt' % username) as docs_file:
    documents = docs_file.read()
documents = documents.replace('\n', ' ').decode('utf-8')

# Split the text into sentences with NLTK's pre-trained English Punkt model.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
sents_all = sent_detector.tokenize(documents.strip())

# Join every sentence with its successor: each entry of `sents` is an
# overlapping two-sentence snippet.
sent_generator = nltk.bigrams(sents_all)
sents = list(map(" ".join, sent_generator))


In [18]:
# Preprocess the sentence pairs with the project helper remove_stopwords.
# NOTE(review): the meaning of the True flag is not visible here — confirm.
texts = remove_stopwords(sents, True)
# Build the three artifacts that are saved below and reloaded by load_sents.
# presumably 256 is the number of LSI topics — TODO confirm against texts_to_index
dictionary, lsi, index = texts_to_index(texts, 256, username)

In [19]:
# Archive the sentence pairs as JSON; load_sents() reads this file back as
# the list that similarity hits are indexed into.
with open('data/%s.json' % username, 'w') as f:
    json.dump(sents,f)

In [20]:
# Persist the dictionary, LSI model and similarity index under the exact file
# names that load_sents() expects.
dictionary.save('data/%s.dict' % username) 
lsi.save('data/%s-corpus.lsi' % username)
index.save('data/%s-corpus.index' % username)

In [10]:
### test functions
def load_sents(username, root='data'):
    """Load the saved sentence archive and gensim artifacts for one user.

    Four files are read from ``root``, all named after ``username``:
    ``<username>.dict``, ``<username>.json``, ``<username>-corpus.lsi`` and
    ``<username>-corpus.index``.

    Args:
        username: File-name stem of the saved artifacts.
        root: Directory that holds the files (default ``'data'``).

    Returns:
        A tuple ``(documents, dictionary, lsi, index)``: the decoded JSON
        content, the loaded ``corpora.Dictionary``, the loaded
        ``models.LsiModel`` and the loaded ``similarities.Similarity``.
    """
    location = (root, username)

    vocab = corpora.Dictionary.load('%s/%s.dict' % location)

    with open('%s/%s.json' % location) as archive:
        sentences = json.load(archive)

    lsi_model = models.LsiModel.load('%s/%s-corpus.lsi' % location)
    # Loaded through Similarity; an earlier revision used MatrixSimilarity here.
    sim_index = similarities.Similarity.load('%s/%s-corpus.index' % location)

    return sentences, vocab, lsi_model, sim_index

def test_response(username, t):
    trash = [t]
    documents0, dictionary0, lsi0, index0 = load_sents(username)
    r = gen_response(documents0, dictionary0, lsi0, index0, t, trash, True)
    return r

def gen_response(sents, dictionary, lsi, index, t, trash, limit1=True, top_n=5):
    """Return the corpus sentences most similar to the input text ``t``.

    The input is cleaned with ``clean_str``, lower-cased and split on
    whitespace, then handed to ``query_page``. The first element of each hit
    it returns is used as an index into ``sents``.

    Args:
        sents: Indexable collection of candidate sentences.
        dictionary, lsi, index: Passed through unchanged to ``query_page``.
        t: The input text to respond to.
        trash: Collection of responses to suppress. Entries may be single
            sentences (matching sentences are dropped from the sample) or
            whole previously returned samples (an identical sample is
            rejected outright).
        limit1: When True, return the sample itself, or None if nothing
            usable was found. When False, return a list that contains the
            sample, or an empty list.
        top_n: Maximum number of hits to use (default 5).

    Returns:
        See ``limit1``.
    """
    # tokenize input sentence
    clean_input = clean_str(t).lower().split()

    # get the posts most similar to the input sentence
    sims = query_page(clean_input, dictionary, lsi, index)

    # Slice instead of indexing five fixed positions, so a query with fewer
    # than top_n hits yields a shorter sample rather than an IndexError.
    candidates = [sents[hit[0]] for hit in sims[:top_n]]

    # Apply the trash filter per sentence: comparing the whole list against a
    # trash of strings can never match, which left the filter without effect.
    sample = [candidate for candidate in candidates if candidate not in trash]

    # The whole-sample check is kept for callers that store full samples.
    if not sample or sample in trash:
        return None if limit1 else []
    return sample if limit1 else [sample]


In [65]:
# Smoke test: reload the saved artifacts from disk and query them with a
# sample question.
test_response(username, "what books do you read?")

['what', 'books', 'do', 'you', 'read?']
[u'When I saw it, I had to laugh. Was that really a good way to sell books?', u'Was that really a good way to sell books? In China?', u'A transition team, working with career professionals at State, deluged me with thick briefing books and inperson sessions on every topic imaginable, from the budget for the Building\u2019s cafeteria to the policy concerns of every member of Congress. I\u2019ve seen my fair share of briefing books, and I was impressed with the depth, magnitude, and order of these State Department products.', u'Thanks to Simon & Schuster, especially Chief Executive Officer Carolyn Reidy and my publisher and editor, Jonathan Karp. I\u2019ve now done five books with Carolyn, and it was once again a delight.', u'I had brought a stack of American books that I thought she would enjoy and a chew toy for her dog. She presented me with a silver necklace that she had designed herself, based on a seed pod from an ancient Burmese pattern.']


In [21]:
# Keep the second field of every dictionary item. The sample output of
# topics[0] (u'writings') suggests these are vocabulary tokens; they are used
# later as the trigger words of the generated dialog.
topics = [d[1] for d in dictionary.items()]

In [68]:
# Display the first entry as a quick sanity check.
topics[0]

u'writings'

In [26]:
# Generate dialog files
# Build a Watson Dialog XML document: one <input> node per trigger word, each
# offering the retrieved corpus sentences as RANDOM-selected prompt items.
dump = '<?xml version="1.0" encoding="UTF-8"?><dialog xsi:noNamespaceSchemaLocation="WatsonDialogDocument_1.0.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><flow><folder label="Main">'
# NOTE(review): the artifacts reloaded here are not used below; the loop
# queries the in-memory sents/dictionary/lsi/index from the earlier cells.
documents0, dictionary0, lsi0, index0 = load_sents(username)
for top in topics[:10]:
    # Escape &, < and > so corpus text (e.g. "Simon & Schuster") cannot break
    # the XML markup.
    dump += '<input><grammar><item>*%s*</item></grammar><output><prompt selectionType="RANDOM">' % xml_escape(top)
    # gen_response returns None when it has nothing to offer; emit no items
    # in that case instead of failing on iteration.
    responses = gen_response(sents, dictionary, lsi, index, top, [], True) or []
    for r in responses:
        dump += "<item>%s</item>" % xml_escape(r)
    dump += "</prompt></output></input>"
# Close every element opened in the header so the document is well-formed.
dump += '</folder></flow></dialog>'

In [25]:
# Write the markup as raw UTF-8 text, matching the encoding declared in the
# XML prolog. json.dump would store it as one quoted, backslash-escaped JSON
# string, which an XML parser cannot read.
with codecs.open('data/%s.xml' % username, 'w', encoding='utf-8') as f:
    f.write(dump)