In [3]:
import re
import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from gensim import matutils
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
# For very large datasets, LdaMulticore offers a parallelised (multi-core) implementation;
# note that alpha and eta then need to be provided explicitly.
from gensim.models.ldamulticore import LdaMulticore 
from gensim.models.hdpmodel import HdpModel
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pyLDAvis
import pyLDAvis.gensim

import warnings
warnings.filterwarnings('ignore')

In [35]:
class topic_model(object):
    '''
    Builds a gensim bag-of-words corpus from the ``text`` column of a DataFrame
    and fits / inspects topic models (LDA or HDP) on it.

    Attributes set by ``__init__``:
        alpha, eta            -- priors handed to LdaModel ('auto' unless given)
        num_topics            -- number of LDA topics
        dtm                   -- scikit-learn CountVectorizer document-term matrix
        vocab                 -- gensim Dictionary built from the tokenised documents
        corpus                -- bag-of-words documents (``vocab.doc2bow`` per document)
        num_docs, vocab_size  -- corpus dimensions

    ``fit_lda`` and ``fit_hdp`` store the fitted model in ``self.model``.
    '''

    # Punctuation tokens that are filtered out together with the stop words.
    _PUNCTUATION_TOKENS = ('.', ',', '"', "'", '?', '!', ':', ';',
                           '(', ')', '[', ']', '{', '}', '..', '...')

    # Upper bound on the number of topics the HDP model may use.
    _HDP_MAX_TOPICS = 50

    def __init__(self, df, num_topics, lemmatise=True,
        stem=False, alpha=None, eta=None, custom_stop_words=None,
        **kwargs):
        '''
        Tokenise ``df['text']`` and build the dictionary and corpus.

        Parameters:
            df                -- DataFrame with a ``text`` column
                                 (entries are assumed to be strings -- TODO confirm)
            num_topics        -- number of topics used by ``fit_lda``
            lemmatise         -- apply WordNet lemmatisation to every token
            stem              -- apply Porter stemming after stop-word removal
            alpha, eta        -- LDA priors; ``None`` means 'auto'
            custom_stop_words -- extra stop words: a single string or any
                                 iterable of strings (matched case-insensitively)
            **kwargs          -- forwarded to scikit-learn's CountVectorizer;
                                 they only affect ``self.dtm``
        '''

        #### 1. HYPERPARAMETERS ####
        # 'auto' is passed to LdaModel when no prior is supplied.
        self.alpha = alpha if alpha is not None else 'auto'
        self.eta = eta if eta is not None else 'auto'

        self.num_topics = num_topics

        #### 2. Document-term matrix ####

        text_data = df['text']

        # Scikit-learn representation, kept as a public attribute.
        vectorizer = CountVectorizer(stop_words='english', **kwargs)
        self.dtm = vectorizer.fit_transform(text_data)

        # Gensim representation - used by every model fitted by this class.
        stop_words = set(stopwords.words('english'))
        stop_words.update(self._PUNCTUATION_TOKENS)
        # The previous check compared type(...) with the string 'list', which is
        # never true, so custom stop words were silently dropped.
        stop_words.update(self._as_stop_word_set(custom_stop_words))

        lemmatiser = WordNetLemmatizer()
        p_stemmer = PorterStemmer()
        docs = []
        for doc in text_data:
            tokens = word_tokenize(doc.lower())
            if lemmatise:
                tokens = [lemmatiser.lemmatize(t) for t in tokens]
            # Stop words are removed before stemming, so they are matched
            # against the unstemmed tokens.
            filtered_tokens = [t for t in tokens if t not in stop_words]
            if stem:
                filtered_tokens = [p_stemmer.stem(t) for t in filtered_tokens]
            docs.append(filtered_tokens)
        self.vocab = Dictionary(docs)
        self.corpus = [self.vocab.doc2bow(text) for text in docs]

        self.num_docs, self.vocab_size = (len(docs), len(self.vocab))
        print('Length of corpus:\t %d documents' %self.num_docs)
        print('Size of vocabulary:\t %d terms' %self.vocab_size)

    @staticmethod
    def _as_stop_word_set(custom_stop_words):
        '''
        Normalise user-supplied stop words to a set of lower-cased strings.

        ``None`` gives an empty set, a single string is treated as one word,
        and any other iterable contributes each of its elements. Words are
        lower-cased because documents are lower-cased before tokenisation.
        '''
        if custom_stop_words is None:
            return set()
        if isinstance(custom_stop_words, str):
            # Guard against set('word') exploding a string into characters.
            custom_stop_words = [custom_stop_words]
        return {str(word).lower() for word in custom_stop_words}

    def fit_lda(self, maxiter=1000):
        '''
        Fit an LdaModel on ``self.corpus`` and store it in ``self.model``.

        Parameters:
            maxiter -- passed to LdaModel as ``iterations``
        '''
        print('Fitting LDA')
        self.model = LdaModel(self.corpus,
            num_topics=self.num_topics,
            id2word=self.vocab,
            alpha=self.alpha,
            eta=self.eta,
            iterations=maxiter)
        print('Learning completed successfully!')

    def summarise(self, show_prob = True, **kwargs):
        '''
        Print the topics of the fitted model.

        Parameters:
            show_prob -- if True, print ``show_topic`` output for every topic;
                         otherwise print the formatted strings returned by
                         ``print_topics``
            **kwargs  -- forwarded to ``self.model.show_topic`` (show_prob only)
        '''
        if show_prob:
            for i in range(self.num_topics):
                print('Topic {}'.format(i))
                print(self.model.show_topic(i, **kwargs))
        else:
            # print_topics returns (topic_id, formatted string) pairs; its
            # return value used to be discarded, so nothing was shown.
            for topic_id, topic_repr in self.model.print_topics(-1):
                print('Topic {}'.format(topic_id))
                print(topic_repr)

    def visualise(self, **kwargs):
        '''
        Prepare a pyLDAvis visualisation of the fitted model and return the
        displayable object, so that ``tm.visualise()`` as the last expression
        of a notebook cell renders it inline.

        ``**kwargs`` are forwarded to ``pyLDAvis.gensim.prepare``.

        API documentation: http://pyldavis.readthedocs.io/en/latest/modules/API.html
        '''

        pyLDAvis.enable_notebook()
        vis_data = pyLDAvis.gensim.prepare(self.model, self.corpus, self.vocab, sort_topics=False, **kwargs)
        # The display object used to be discarded inside the method, followed
        # by pyLDAvis.show(), which starts a local web server and blocks the
        # cell. Returning the object lets the notebook render it instead.
        return pyLDAvis.display(vis_data)

    def fit_hdp(self, iterations=100):
        '''
        The Hierarchical Dirichlet Process: learns the number of topics
        automatically, up to the maximum T (``_HDP_MAX_TOPICS``).
        The fitted model is stored in ``self.model``.

        Parameters:
            iterations -- currently unused; kept so existing calls keep working
        '''
        # Fit on the gensim corpus so token ids agree with self.vocab. The
        # scikit-learn matrix has its own vocabulary indices and stores
        # documents in rows, whereas Sparse2Corpus expects columns by default.
        self.model = HdpModel(self.corpus, self.vocab, T=self._HDP_MAX_TOPICS)

In [16]:
# Load the tab-separated minutes dataset; the topic model reads its 'text' column.
data = pd.read_csv(
    'minutes_data.txt',
    delimiter='\t',
)

In [36]:
# Assigning the instance to the class's own name rebinds `topic_model` from the
# class to the instance, so a plain `topic_model(...)` call fails with
# "object is not callable" when this cell is run a second time. Resolve the
# class first so the cell can be re-executed, while later cells keep using
# the `topic_model` instance.
topic_model_cls = topic_model if isinstance(topic_model, type) else type(topic_model)
topic_model = topic_model_cls(data, 10, lemmatise=False)

Length of corpus:	 7659 documents
Size of vocabulary:	 7881 terms


In [37]:
# Fit LDA directly on the prepared corpus, with more passes/iterations than
# topic_model.fit_lda uses.
# NOTE: no random_state is passed, so results can differ between runs.
lda = LdaModel(
    corpus=topic_model.corpus,
    id2word=topic_model.vocab,
    num_topics=topic_model.num_topics,
    alpha=topic_model.alpha,
    eta=topic_model.eta,
    passes=5,
    iterations=10000,
)

In [39]:
# Build the pyLDAvis payload for the fitted LDA model;
# sort_topics=False asks pyLDAvis not to reorder the topics.
vis_data = pyLDAvis.gensim.prepare(
    lda,
    topic_model.corpus,
    topic_model.vocab,
    sort_topics=False,
)

In [40]:
# Render the prepared pyLDAvis data inline as this cell's output.
pyLDAvis.display(vis_data)