In [1]:
import os
import re
import lxml
import cchardet
import numpy as np
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, remove_stopwords, strip_tags, strip_numeric, strip_non_alphanum
from gensim.models import CoherenceModel, Phrases
from bs4 import BeautifulSoup
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline
import logging
# Only the log line format is set here; no level is passed, so the root
# logger keeps its default threshold.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')

import warnings
# Silence DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

from ranker import load_ranker

In [32]:
# Root directory that is walked recursively for the '.txt' documents to model.
top_directory = "../compiled_bios"
# Path prefix that the filter word-list file names (e.g. 'names.txt') are
# concatenated onto, so it must end with a path separator.
filters_path = "../"
def get_array_from_file(filters_path):
    """Load a whitespace-delimited word list from a text file as a set.

    The whole file is read, lower-cased and split on spaces, newlines and
    tabs. Carriage returns are stripped from every token and tokens that are
    empty afterwards are dropped.

    Args:
        filters_path: Path of the text file to read.

    Returns:
        set: The distinct lower-cased tokens found in the file.

    Side effects:
        Prints the first ten tokens (in file order) as a quick sanity check.
    """
    # `with` closes the handle even if reading raises.
    with open(filters_path, 'r') as array_file:
        contents = array_file.read().lower()

    # Strip stray carriage returns once per token, then drop empty tokens.
    tokens = [tok.replace('\r', '') for tok in re.split(' |\n|\t', contents)]
    tokens = [tok for tok in tokens if tok]

    print(tokens[:10])
    return set(tokens)

def iter_documents(top_directory, filters):
    """Iterate over all documents, yielding one document (a list of tokens) at a time.

    Every file under `top_directory` (searched recursively) whose name ends
    in '.txt' is read, lower-cased, stripped of markup with BeautifulSoup and
    tokenised with gensim's `preprocess_string`.

    Args:
        top_directory: Root directory to walk.
        filters: Container of tokens to discard (membership is tested with
            `in`, so a set is the efficient choice).

    Yields:
        list: The tokens of one document that are not in `filters` and are
        longer than four characters.
    """
    custom_filter = [lambda x: x.lower(), strip_punctuation, remove_stopwords, strip_tags, strip_non_alphanum, strip_numeric]
    for root, _dirs, files in os.walk(top_directory):
        for filename in files:
            if not filename.endswith('.txt'):
                continue
            # Read the entire document as one big string; `with` makes sure
            # the handle is released before the next file is opened.
            with open(os.path.join(root, filename)) as handle:
                raw_text = handle.read().lower()
            soup = BeautifulSoup(raw_text, "lxml")
            text = soup.get_text(separator='\n')
            tokens = preprocess_string(text, custom_filter)
            yield [word for word in tokens
                   if word not in filters and len(word) > 4]

class CorpusContainer(object):
    """Loads a directory of documents and prepares them for topic modelling.

    Construction reads every document eagerly, trains bigram and trigram
    `Phrases` models, merges the detected phrases into the documents, builds
    a pruned gensim `Dictionary`, and converts the documents to bag-of-words
    vectors.

    Attributes (all set in `__init__`):
        top_dir: Root directory the documents were read from.
        filters: Set of tokens removed from every document.
        docs: List of token lists, one per document, after phrase merging.
        bigram: `Phrases` model trained on the raw documents.
        trigram: `Phrases` model trained on the bigram-transformed documents.
        dictionary: `Dictionary` built from `docs`, pruned with
            `filter_extremes`.
        corpus: List of bag-of-words vectors, one per entry of `docs`.
    """

    def __init__(self, top_dir, filters_path):
        """Build the corpus.

        Args:
            top_dir: Root directory passed to `iter_documents`.
            filters_path: Path prefix that the filter file names are
                concatenated onto (see `__construct_filter_set`).
        """
        self.top_dir = top_dir
        # Stored on the instance because __iter__ needs it to re-tokenise.
        self.filters = self.__construct_filter_set(filters_path)
        self.docs = list(iter_documents(top_dir, self.filters))
        # Both phrase models are trained before the documents are rewritten.
        self.bigram = Phrases(self.docs, min_count=2)
        self.trigram = Phrases(self.bigram[self.docs], min_count=2)
        self.__append_bigrams()
        self.__append_trigrams()
        self.dictionary = gensim.corpora.Dictionary(self.docs)
        self.dictionary.filter_extremes(no_below=10, no_above=.8, keep_n=30000) # check API docs for pruning params
        self.corpus = [self.dictionary.doc2bow(doc) for doc in self.docs]

    def __iter__(self):
        """Re-read the documents from disk and yield one bag-of-words vector each.

        The files are tokenised again with `iter_documents`, so the yielded
        vectors do not contain the merged phrases that `self.docs` and
        `self.corpus` hold. Use `self.corpus` for the vectors built in
        `__init__`.
        """
        for tokens in iter_documents(self.top_dir, self.filters):
            yield self.dictionary.doc2bow(tokens)

    def __len__(self):
        """Return the number of documents loaded in `__init__`."""
        return len(self.docs)

    def __construct_filter_set(self, filters_path):
        """Union the word lists of all filter files into one set.

        Args:
            filters_path: Prefix that each file name below is appended to.

        Returns:
            set: Every token found in any of the filter files.
        """
        filter_files = ('names.txt', 'unis', 'urls', 'emails', 'depts',
                        'location', 'unwanted_words.txt')
        filters = set()
        for name in filter_files:
            filters.update(get_array_from_file(filters_path + name))
        return filters

    @staticmethod
    def __merge_phrases(doc, phrased_doc):
        """Return `doc` with detected phrases appended and their parts removed.

        Args:
            doc: Token list of one document.
            phrased_doc: The same document after a phrase model was applied;
                every token containing '_' is treated as a phrase.

        Returns:
            list: Tokens of `doc` that are not a component of any phrase,
            followed by the phrases in the order they occur in `phrased_doc`.
        """
        phrases = [token for token in phrased_doc if '_' in token]
        if not phrases:
            return list(doc)
        components = set()
        for phrase in phrases:
            components.update(phrase.split('_'))
        # Components never contain '_', so phrases already in `doc` survive.
        return [word for word in doc if word not in components] + phrases

    def __append_bigrams(self):
        """Merge the phrases found by `self.bigram` into every document."""
        for idx, doc in enumerate(self.docs):
            self.docs[idx] = self.__merge_phrases(doc, self.bigram[doc])

    def __append_trigrams(self):
        """Merge the phrases found by `self.trigram` into every document.

        NOTE(review): this runs after `__append_bigrams`, so the documents
        already hold '_' tokens. Any of those that the phrase models pass
        through unchanged are appended a second time here, which appears to
        double their counts; confirm whether that is intended.
        """
        for idx, doc in enumerate(self.docs):
            self.docs[idx] = self.__merge_phrases(doc, self.trigram[self.bigram[doc]])
    
        

# Build the corpus: reads every document, merges detected phrases, and creates
# the dictionary and bag-of-words vectors used by the cells below.
corpus_container = CorpusContainer(top_directory, filters_path)
# To inspect the vectors: for vector in corpus_container.corpus: print(vector)

['tarek', 'sarita', 'v.', 'adve', 'richard', 't.', 'cheng', 'donald', 'b.', 'gillies']
['university', 'of', 'illinois', 'at', 'urbana-champaign', 'university', 'of', 'illinois', 'at', 'urbana-champaign']
['http://abdelzaher.cs.illinois.edu', 'http://www.cs.uiuc.edu/~sadve', 'http://vikram.cs.illinois.edu/', 'https://cs.illinois.edu/directory/profile/agha', 'https://cs.illinois.edu/directory/profile/alawini', 'https://parasol.tamu.edu/~amato/', 'https://cs.illinois.edu/directory/profile/angrave', 'https://cs.illinois.edu/directory/profile/bpbailey', 'https://cs.illinois.edu/directory/profile/batesa', 'https://cs.illinois.edu/directory/profile/mattox']
['sadve@illinois.edu', 'agha@illinois.edu', 'alawini@illinois.edu', 'amato@tamu.edu', 'angrave@illinois.edu', 'bpbailey@illinois.edu', 'batesa@illinois.edu', 'mattox@illinois.edu', 'caesar@illinois.edu', 'rhc@illinois.edu']
['computer', 'science', 'computer', 'science', 'computer', 'science', 'computer', 'science', 'computer', 'science']
[

In [33]:
# Train a 10-topic LDA model on the bag-of-words corpus.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus_container.corpus,
    id2word=corpus_container.dictionary,
    num_topics=10,
    random_state=100,  # fixed seed so repeated runs give the same topics
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [34]:
# Show each topic as a weighted list of its top terms.
pprint(lda_model.print_topics())

[(0,
  u'0.015*"material" + 0.012*"applied_physics" + 0.012*"climate_change" + 0.009*"inverse_problems" + 0.007*"chemical_biomolecular" + 0.006*"exchange" + 0.006*"numerical_methods" + 0.006*"halluniversity_californiaberkeley" + 0.005*"energy_conversion" + 0.005*"ordinary_differential"'),
 (1,
  u'0.069*"machine_learning" + 0.025*"large_scale" + 0.020*"artificial_intelligence" + 0.013*"decision_processes" + 0.008*"gaussian_processes" + 0.008*"business_analytics" + 0.008*"arxiv" + 0.008*"natural_language" + 0.008*"computational_complexity" + 0.007*"neural_networks"'),
 (2,
  u'0.014*"models" + 0.009*"power" + 0.009*"efficient" + 0.008*"based" + 0.007*"static_analysis" + 0.006*"programming_languages" + 0.006*"syntax_semantics" + 0.005*"approach" + 0.005*"graph_theory" + 0.005*"volume"'),
 (3,
  u'0.014*"electromagnetics_electronic" + 0.012*"operations_management" + 0.009*"staffdirectory_photo" + 0.008*"protection_noticeaccessibilityaccountabilityaccreditationemployment" + 0.008*"coronavi

In [35]:
# Score the trained topics with the 'c_v' coherence measure, computed against
# the tokenised documents; the bare last expression displays the score.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=corpus_container.docs,
    dictionary=corpus_container.dictionary,
    coherence='c_v',
)
coherence_model_lda.get_coherence()

0.46273893936848876

In [36]:
# Switch pyLDAvis to notebook display mode before preparing the visualisation.
pyLDAvis.enable_notebook()
# Prepare the visualisation data from the model, corpus and dictionary.
vis = pyLDAvis.gensim.prepare(lda_model, corpus_container.corpus, corpus_container.dictionary)
# Bare last expression so the notebook displays the result.
vis