In [12]:
import pickle
import pyLDAvis.sklearn
from run_sklearn import runNMF_sk, get_weights, runLDA_sk, loadDataFrame, get_links
from sklearn.model_selection import GridSearchCV
import time
from datetime import date
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

## LDA Grid Search
Sweeping parameters of LDA, and visualizing how the perplexity and likelihood scores change

Tutorial: https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/


In [16]:
def create_LDA_sk(filename, search_params, number_topics=10):
    # unpickle dataframe
    df = loadDataFrame(filename)

    # unpickle links
    keyword = filename[7:(len(filename)-5)]
    links = get_links(keyword)
    links.append('Wikipedia')
    print("Length of links: ", len(links))

    # Vectorize data from df -> Bag of Words
    count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000)
    count_data = count_vectorizer.fit_transform(df['data'])

    # Initialize data being saved in text file and adding the date
    to_save = []
    to_save.append(str(date.today()))

    # Run and time LDA, since it usually takes a while to converge
    print("\nRunning LDA with SciKit Learn...\n")
    start = time.time()
    lda = LDA(n_components=11, max_iter=1000, learning_offset=50.,random_state=1).fit(count_data)
    
    # Init Grid Search Class
    model = GridSearchCV(lda, param_grid=search_params)

    # Do the Grid Search
    model.fit(count_data)

    end = time.time()
    print('Runtime:', end - start)
    
    return lda, count_data, count_vectorizer;

In [None]:
# Runtime takes too long
search_term = ((input("Enter a topic: ")).lower())
filename = "./Text/" + search_term.replace(" ", "-") + ".pckl"

# Define Search Param
search_params = {'n_components': [5, 10, 15, 20], 'learning_decay': [.5, .7, .9]}

# lda = LDA(n_components=11, max_iter=1000, learning_offset=50.,random_state=1)
(model, count_data, count_vectorizer) = create_LDA_sk(filename, search_params)


Enter a topic: disney
./Google/Links/disney
Length of links:  101

Running LDA with SciKit Learn...



In [15]:
model

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=50.0,
                          max_doc_update_iter=100, max_iter=1000,
                          mean_change_tol=0.001, n_components=11, n_jobs=None,
                          perp_tol=0.1, random_state=1, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [61]:
# Create Document - Topic Matrix
lda_output = model.transform(count_data)

print(model.perplexity(count_data))

421.60226081377175


In [1]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(model, count_data, count_vectorizer, mds='tsne')
panel

NameError: name 'pyLDAvis' is not defined

In [2]:
# Get Log Likelihoods from Grid Search Output
n_topics = [5, 10, 15, 20]
log_likelihoods = [round(gscore.mean_validation_score) for gscore in model.cv_results_]

# Get Perplexity Scores from Grid Search output
perplexity_scores = [round(gscore.perplexity(count_data)) for gscore in model.cv_results_]

# Show graph
plt.figure(figsize=(12, 8))
plt.plot(n_topics, log_likelihoods, label='Log Likelihoods')
plt.title("Choosing Optimal LDA Model")
plt.xlabel("Num Topics")
plt.ylabel("Log Likelihood Scores")
plt.legend(title='Learning decay', loc='best')
plt.show()

NameError: name 'model' is not defined