In [13]:
import os
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from gensim import matutils
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import nltk
from nltk import tokenize
from nltk.corpus import stopwords
import pandas as pd
from gensim import corpora, models, similarities
from bs4 import BeautifulSoup
import re
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# data_path = os.path.expanduser('~') + "romney_2012_rnc.txt"
data_path = "romney_2012_rnc.txt"
with open(data_path, 'rw+') as f:
    data = f.read().strip().decode('utf-8')

data[:380]

u'\ufeffMr. Chairman, delegates. I accept your nomination for President of the United States of America.\r\n\r\nI do so with humility, deeply moved by the trust you have placed in me. It is a great honor. It is an even greater responsibility.\r\n\r\nTonight I am asking you to join me to walk together to a better future. By my side, I have chosen a man with a big heart from a small town. He re'

In [15]:
stops = set(stopwords.words("english")) 

def clean_tokenize(words):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    
    # 1. Remove HTML
    review_text = BeautifulSoup(words, "lxml").get_text() 
    
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    
    # 4. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    
    # 5. Join the words back into one string separated by space, 
    # and return the result.
    return tokenize.word_tokenize(( " ".join( meaningful_words )))

In [18]:
words = clean_tokenize(data)

In [26]:
def compute_top_topic(docs):
    # Function to create a list of integers which represent the top topic for each person. 
    # I could have looked at the top 5 topics too and build a longer topic vector to compare 2 users but I keep it simple for now
    
    # create a list of cleaned and tokenized documents
    parsed = [clean_tokenize(s) for s in docs]
    
    # create a dictionary for each unique word from all documents
    dictionary = corpora.Dictionary(parsed)
    print dictionary
    # ignore words that appear in less than 20 documents or more than 10% documents
    dictionary.filter_extremes(no_below=20, no_above=0.1)
    print dictionary
    
    # create a list of bow for each document (worddicID, count)
    corpus = [dictionary.doc2bow(text) for text in parsed] # bow for each row in documents
    
    # what is the most common word in the that article?
    most_index, most_count = max(corpus[0], key=lambda (word_index, count): count)
    print most_index, most_count
    print dictionary[most_index]
    
    # create a tfidf from bow
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus] # tfidf for each row in documents
    
    lda = LdaModel(corpus_tfidf, id2word=dictionary, num_topics=20, update_every=0, passes=20)
    lda.print_topics(num_topics=20, num_words=20)
    
    get_top_topic = lambda person: sorted(lda.get_document_topics(person), key= lambda (x,y): y, reverse=True)[0][0]
    
    return [get_top_topic(person) for person in corpus_tfidf]