### Topic modelling

In [24]:
import os, re
import time
from pprint import pprint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gensim.corpora as corpora
from gensim.models import LdaMulticore
# Project root. Defaults to the original author's checkout; set the
# APP_REVIEW_CLASSIFIER_DIR environment variable to point at a different location.
sub_dir = os.environ.get('APP_REVIEW_CLASSIFIER_DIR',
                         '/Users/dominicbates/Documents/GitHub/app-review-classifier/')
# Work from the project root so the local `app_review_classifier` package can be imported.
os.chdir(sub_dir)

from app_review_classifier.text_processing import TextCleaner, Embedder

In [2]:
# Load the sample of raw reviews from the project's data folder.
# The path is derived from `sub_dir` (the project root set in the first cell) so the
# location is defined in a single place instead of being repeated as an absolute path.
reviews_df = pd.read_parquet(os.path.join(sub_dir, 'data', 'reviews_sample.parquet'))


In [3]:
# Configure the cleaner with stop-word removal, n-grams and lemmatization all switched on
cleaner = TextCleaner(
    config={
        'stop_words': True,
        'ngrams': True,
        'lemmatization': True,
    }
)

# Run the raw review column through the cleaner, training the n-gram model in the same pass
cleaned_reviews = cleaner.process_raw_text(
    reviews_df['review'],
    train_ngrams=True,
)

# Keep only reviews whose cleaned form has length >= 10, reporting the count before and after
print('\nOriginal N. reviews:', len(cleaned_reviews))
cleaned_reviews = list(filter(lambda review: len(review) >= 10, cleaned_reviews))
print('After removing short reviews:', len(cleaned_reviews))

Cleaning up text and removing stopwords...
- Text cleaned in: 0.24 seconds

Training ngrams...
- Training done in: 1.29 seconds

Loading spacy model...
- Model loaded in: 0.6 seconds

Lemmasizing...
- Lemmatizing done in: 35.36 seconds

Original N. reviews: 14986
After removing short reviews: 5846


In [4]:
# Instantiate the project's Embedder, selecting the 'word2vec' method
test = Embedder(method='word2vec')

In [5]:
# Fit the embedder on the cleaned, length-filtered reviews
test.fit(cleaned_reviews)

Fitting word2vec model
- Setting up model...
- Setup done!
- Building Vocab...
- Vocab built!
- Training Model...
- Model training finished!


In [35]:
# Default hyper-parameters per model type; TopicModel falls back to the 'lda' entry
# when it is constructed without an explicit config.
default_configs = {
    'lda': dict(
        num_topics=10,
        passes=2,           # library default is 1
        iterations=500,     # library default is 50
        random_state=1234,  # fixed seed
    ),
}

class TopicModel:
    """Wrapper around gensim's ``LdaMulticore`` for pre-tokenised documents.

    Typical use: ``fit(documents)`` learns a dictionary and an LDA model, after
    which ``apply(documents)`` returns the model's output for each document.

    Attributes:
        config: dict with the keys 'num_topics', 'passes', 'iterations' and
            'random_state'.
        model: the trained ``LdaMulticore`` instance, or None before ``fit()``.
        id2word: the trained gensim ``Dictionary``, or None before one is trained.
    """

    def __init__(self,
                 config=None):
        """Store the hyper-parameters; nothing is trained here.

        Args:
            config: dict with the keys 'num_topics', 'passes', 'iterations' and
                'random_state'. When None, ``default_configs['lda']`` is used.

        Raises:
            KeyError: if ``config`` lacks one of the required keys.
        """
        # Set config params
        if config is None:
            self.config = default_configs['lda']
        else:
            self.config = config

        self.model = None
        # Initialised here so get_corpus() can detect "no dictionary yet" explicitly
        self.id2word = None
        self.num_topics = self.config['num_topics']
        self.training_passes = self.config['passes']
        self.training_iterations = self.config['iterations']
        self.training_random_state = self.config['random_state']

    def get_corpus(self, documents, train_dict=False):
        """Convert documents to bag-of-words using the stored dictionary.

        Args:
            documents: iterable of documents, each an iterable of tokens, as
                accepted by ``corpora.Dictionary`` and ``Dictionary.doc2bow``.
            train_dict: when true, a new dictionary is trained on ``documents``
                first and replaces any previously stored one.

        Returns:
            A list with one ``doc2bow`` result per document, in input order.

        Raises:
            ValueError: if no dictionary has been trained and ``train_dict`` is false.
        """
        print('Getting corpus...')
        t1 = time.time()
        if train_dict:
            print('Training dictionary...')
            self.id2word = corpora.Dictionary(documents)

        # Checked outside the training branch so it also guards the train_dict=False path
        if self.id2word is None:
            raise ValueError('No dictionary trained yet. Try running get_corpus() with train_dict=True')

        # Use the documents passed in, not whatever happens to live in the notebook namespace
        corpus = [self.id2word.doc2bow(text) for text in documents]

        # Report the time only after the corpus has been built, so the conversion is included
        print('Done in: {} seconds'.format(round((time.time() - t1), 2)))
        return corpus

    def fit(self, documents):
        """Train the dictionary and the LDA model on ``documents``.

        Args:
            documents: iterable of tokenised documents (see ``get_corpus``).

        The trained model is stored on ``self.model``; nothing is returned.
        """
        corpus = self.get_corpus(documents, train_dict=True)

        print('\nTraining model...')
        t1 = time.time()
        self.model = LdaMulticore(corpus=corpus,
                                  id2word=self.id2word,
                                  num_topics=self.num_topics,
                                  passes=self.training_passes,
                                  iterations=self.training_iterations,
                                  random_state=self.training_random_state,
                                  eval_every=10)
        print('Model trained in: {} seconds'.format(round((time.time() - t1), 2)))

    def apply(self, documents):
        """Run the trained model over ``documents``.

        Args:
            documents: iterable of tokenised documents (see ``get_corpus``).

        Returns:
            A list with one entry per document: the result of indexing the trained
            model with that document's bag-of-words. For gensim LDA models this is
            documented as a list of (topic_id, probability) pairs.

        Raises:
            RuntimeError: if ``fit()`` has not been called yet.
            ValueError: if no dictionary has been trained (raised by ``get_corpus``).
        """
        if self.model is None:
            raise RuntimeError('No model trained yet. Try running fit() first')

        # Reuse the dictionary learned during fit(); never retrain it at apply time
        corpus = self.get_corpus(documents, train_dict=False)
        return [self.model[bow] for bow in corpus]


In [36]:
# Build a TopicModel with the default LDA config (no config passed) and fit it on the cleaned reviews
test_model = TopicModel()
test_model.fit(cleaned_reviews)

Getting corpus...
Training dictionary...
Done in: 0.23 seconds

Training model...
Model trained in: 5.22 seconds


In [37]:
# default_configs = {'lda':{num_topics=10,
#                           passes = 2000, # Default 1 just tried 50
#                           iterations = 500, # Default 50
#                           random_state=1234}}


# print('Training model...')
# t1=time.time()
# lda_model = LdaMulticore(corpus=corpus,
#                          id2word=id2word,
#                          num_topics=10,
#                          passes = 2000, # Default 1 just tried 50
#                          iterations = 500, # Default 50
#                          random_state=1234,
#                          eval_every=10)
# print('Model trained in: {} seconds'.format(round((time.time() - t1), 2)))



In [38]:
# Show the learned topics. test_model.fit() must have been run first: `model` is None until then
test_model.model.print_topics()

# corpus=self.training_bow,
# num_topics=params["num_topics"],
# iterations=params["iterations"],
# id2word=self.training_dictionary,
# passes=params["passes"],
# alpha=params["alpha"],
# eta=params["eta"],

AttributeError: 'NoneType' object has no attribute 'print_topics'

In [None]:

# number of topics
num_topics = 10
# Reuse the dictionary learned by test_model.fit() and rebuild the bag-of-words corpus
# from the cleaned reviews, so this cell no longer relies on undefined globals
corpus = test_model.get_corpus(cleaned_reviews)
id2word = test_model.id2word
# Build LDA model (LdaMulticore is imported directly at the top of the notebook;
# the seed is fixed so that re-running the cell gives the same topics)
lda_model = LdaMulticore(corpus=corpus,
                         id2word=id2word,
                         num_topics=num_topics,
                         random_state=test_model.training_random_state)
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]