In [1]:
import json
from xml.sax.saxutils import escape as xml_escape

import gensim
import nltk
from gensim import corpora, models, similarities

In [2]:
from collections import OrderedDict
from semanticparser import *
from tools import *

In [77]:
# Account whose corpus is processed; every 'data/<username>...' path in this
# notebook is derived from this value.
username = 'hollymadison'

In [78]:
# Read the user's raw text, flatten newlines to spaces and decode it as UTF-8.
with open('data/%s.txt' % username) as docs_file:
    raw_bytes = docs_file.read()
documents = raw_bytes.replace('\n', ' ').decode('utf-8')

# Split into sentences, then join every sentence with its successor so each
# entry of `sents` is a pair of consecutive sentences separated by a space.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
sents_all = sent_detector.tokenize(documents.strip())
sent_generator = nltk.bigrams(sents_all)
sents = [" ".join(pair) for pair in sent_generator]


In [79]:
# Preprocess the sentence pairs and build the dictionary / LSI model / index
# through project helpers whose definitions are not visible in this notebook.
# NOTE(review): the meaning of the `True` flag and of `256` is not shown here;
# 256 is presumably the number of LSI topics — confirm against texts_to_index.
texts = remove_stopwords(sents, True)
dictionary, lsi, index = texts_to_index(texts, 256, username)

In [80]:
# Archive the sentence pairs as JSON so they can be reloaded next to the index.
serialized_sents = json.dumps(sents)
with open('data/%s.json' % username, 'w') as f:
    f.write(serialized_sents)

In [81]:
# Persist the dictionary, the LSI model and the similarity index, in that order.
for artifact, path_template in (
    (dictionary, 'data/%s.dict'),
    (lsi, 'data/%s-corpus.lsi'),
    (index, 'data/%s-corpus.index'),
):
    artifact.save(path_template % username)

In [39]:
### test functions
def load_sents(username, root='data'):
    """Load the saved sentence archive and gensim artifacts for ``username``.

    Four files are read from ``root``, in this order:
    ``<username>.dict``, ``<username>.json``, ``<username>-corpus.lsi`` and
    ``<username>-corpus.index``.

    Returns a tuple ``(documents, dictionary, lsi, index)`` where
    ``documents`` is the decoded JSON content and the other three are the
    objects returned by the respective gensim ``load`` calls.
    """
    location = (root, username)

    vocab = corpora.Dictionary.load('%s/%s.dict' % location)

    with open('%s/%s.json' % location) as archive:
        sentences = json.load(archive)

    lsi_model = models.LsiModel.load('%s/%s-corpus.lsi' % location)
    sim_index = similarities.Similarity.load('%s/%s-corpus.index' % location)

    return sentences, vocab, lsi_model, sim_index

def test_response(username, t):
    trash = [t]
    documents0, dictionary0, lsi0, index0 = load_sents(username)
    r = gen_response(documents0, dictionary0, lsi0, index0, t, trash, True)
    return r

def gen_response(sents, dictionary, lsi, index, t, trash, limit1=True, max_results=10):
    """Return the stored sentences ranked most similar to the input text ``t``.

    Parameters
    ----------
    sents : sequence
        Sentence archive. It is indexed with the first element of each entry
        returned by ``query_page``.
    dictionary, lsi, index
        Passed through unchanged to ``query_page``.
    t : str
        Input text; cleaned with ``clean_str``, lower-cased and split on
        whitespace before querying.
    trash : container
        Results to reject. The whole result list is tested for membership.
        NOTE(review): when ``trash`` holds plain strings (as in
        ``test_response``) a list never equals any entry, so nothing is
        rejected; confirm whether per-sentence filtering was intended.
    limit1 : bool
        If true, return the result list itself (or ``None`` when there is
        nothing to return); otherwise return a list wrapping it (or ``[]``).
    max_results : int
        Maximum number of sentences to return. Defaults to 10, the number the
        original implementation hard-coded.

    Returns
    -------
    list or None
        See ``limit1``.
    """
    # tokenize input sentence
    clean_input = clean_str(t).lower().split()

    # rank the stored sentences against the input
    sims = query_page(clean_input, dictionary, lsi, index)

    # Slice instead of indexing sims[0]..sims[9] explicitly, so a query with
    # fewer than `max_results` hits no longer raises IndexError.
    sample = [sents[hit[0]] for hit in sims[:max_results]]

    if sample and sample not in trash:
        return sample if limit1 else [sample]
    return None if limit1 else []


In [87]:
# Smoke test: query the saved artifacts with a sample question and display the result.
test_response(username, "gay marriage?")

[u'Don\u2019t you miss the mansion?\u201d squealed a round-faced, wholesomelooking 20-something girl in a high-pitched voice. \u201cUm,\u201d I started, unsure of how to answer her politely.',
 u'\u201cUm,\u201d I started, unsure of how to answer her politely. \u201cNo .',
 u'\u201cNo . .',
 u'. .',
 u'. ?\u201d I said, offering her a halfhearted smile.',
 u'?\u201d I said, offering her a halfhearted smile. Here I was, an independent, successful woman, making millions of dollars a year (all on my own), headlining a hit show on the Las Vegas Strip, coproducing and starring in my own television show, and this woman was asking me if I missed the mansion?',
 u'Here I was, an independent, successful woman, making millions of dollars a year (all on my own), headlining a hit show on the Las Vegas Strip, coproducing and starring in my own television show, and this woman was asking me if I missed the mansion? Clearly the public perception of the life I shared with Hugh Hefner at the Playboy Man

In [83]:
# Keep only the second element of every dictionary item (the token text,
# judging by the output shown below) as the list of candidate topics.
topics = [token for _token_id, token in dictionary.items()]

In [84]:
# Peek at the first ten topics.
topics[:10]

[u'fawn',
 u'\u201cpeeping\u201d',
 u'yellow',
 u'four',
 u'askew',
 u'woods',
 u'hanging',
 u'marching',
 u'looking',
 u'granting']

In [37]:
# Generate dialog files
#
# Builds one <input> element per topic: the grammar matches the topic token
# and the prompt lists the sentences returned by gen_response for it.

# Work from the artifacts saved on disk so this cell does not depend on the
# in-memory sents/dictionary/lsi/index left over from earlier cells.
documents0, dictionary0, lsi0, index0 = load_sents(username)

# Keep the prolog and opening tags: the closing tags appended at the end need
# them for the document to be well-formed (they were previously overwritten
# by an empty string right after being assigned).
dialog_parts = ['<?xml version="1.0" encoding="UTF-8"?><dialog xsi:noNamespaceSchemaLocation="WatsonDialogDocument_1.0.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><flow><folder label="Main">']

for top in topics[:1000]:
    # gen_response returns None when it has nothing to offer with limit1=True.
    responses = gen_response(documents0, dictionary0, lsi0, index0, top, [], True) or []
    # Tokens and sentences are free text; escape &, < and > before embedding.
    dialog_parts.append('<input><grammar><item>*%s*</item></grammar><output><prompt selectionType="RANDOM">' % xml_escape(top))
    dialog_parts.extend("<item>%s</item>" % xml_escape(r) for r in responses)
    dialog_parts.append("</prompt></output></input>")

dialog_parts.append('</folder></flow></dialog>')

# Join once instead of growing the string with += inside the loop.
dump = ''.join(dialog_parts)

In [38]:
# Write the dialog markup verbatim. json.dump would wrap it in quotes and
# backslash-escape it, producing a JSON string rather than an XML file.
# The text is encoded explicitly as UTF-8 and written in binary mode so the
# non-ASCII characters in the sentences are preserved.
with open('data/%s.xml' % username, 'wb') as f:
    f.write(dump.encode('utf-8'))