# Testing Topic Modelling

This has to be done on the full dataset. Let's see if my computer can handle it; otherwise we will move this to Google Colab.  

I first try to base the topics on the tf-idf vectors.  

In [None]:
# general:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import time # will be use to choose the faster solution
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer # Normalize samples individually to unit norm.

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

import sklearn
import pickle # to save models, for instance LDA outputs


# NLP:
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import word2vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD # LSA
from sklearn.decomposition import NMF # NMF
from sklearn.decomposition import LatentDirichletAllocation # LDA


In [None]:
# Load the wine reviews. The CSV stores the former row index as an unnamed
# first column, which is dropped right after reading.
raw_winedata = pd.read_csv("../data/winemag-data-190314.csv")
raw_winedata = raw_winedata.drop(columns="Unnamed: 0")
initial_number_of_rows = len(raw_winedata)
print(raw_winedata.shape)
raw_winedata.head()


In [None]:
# Keep only the reviews whose variety appears more than n times in the raw data.
n = 500
variety_counts = raw_winedata.variety.value_counts()
frequent_varieties = variety_counts[variety_counts > n].index
winedata = raw_winedata[raw_winedata.variety.isin(frequent_varieties)]
print(winedata.shape)


Even if we strongly limit the minimal number of occurrences for a variety, we still get a large dataset, and computations will be slow on a laptop.

In [None]:
# Of the remaining reviews, keep only those whose province appears more than n times.
n = 500
province_counts = winedata.province.value_counts()
frequent_provinces = province_counts[province_counts > n].index
winedata = winedata[winedata.province.isin(frequent_provinces)]
print(winedata.shape)

**We keep only the varieties and provinces that occur more than 500 times, because we want to try to predict those from the text description - as a parallel exercise to building the unsupervised learning system.**

In [None]:
# Tokens that carry no topical information in wine descriptions.
# The tokenizer compares these against lower-cased tokens, so every entry
# must itself be lower case (an upper-case 'I' could never match).
domain_specific_stopwords = ['$', ' ', '’s', 'wine', 'winemaker', 'winemaking', 'winery',
                             '2020–2030', '2–3', '3–4', '4–5', '5–6', '6–8',
                             'château', 'village', 'beaujolais', 'domaine',
                             'côte', 'saint', 'parcel', 'parcels',
                             'i', 'flavors',
                             'now-2015', 'now-2018', 'now-2025']
                        # consider:
#                             cabernet, cabernets, douro, nacional,
#                              widely, muscat,
#                              willamette, bordeaux, pommard, rioja, barbaresco, chianti]

# A list of all wine varieties, lower-cased. This is not perfect, as some
# varieties are compound words, but it's a start.
variety_stopwords = [str(variety).lower() for variety in winedata.variety.unique().tolist()]
# Frequent words that only occur as part of a compound variety name.
# extend() adds each word individually; append() would add the whole list as
# one nested element, which no string token could ever equal.
variety_stopwords.extend(['pinot', 'cabernet', 'cabernets', 'sauvignon', 'grigio', 'sirah'])

# A list of all wine provinces, lower-cased.
province_stopwords = [str(province).lower() for province in winedata.province.unique().tolist()]

In [None]:
# Updating our tokenizer function:
def tokenize_and_clean(description):
    """Tokenize a description with spaCy and return its cleaned, lower-cased tokens.

    Punctuation, spaCy stop words and number-like or digit tokens are dropped
    first. The remaining lower-cased tokens are then filtered against
    ``domain_specific_stopwords`` and ``variety_stopwords``.
    """
    lowered = [
        token.lower_
        for token in nlp(description)
        if not (token.is_punct or token.is_stop or token.like_num or token.is_digit)
    ]

    # Drop the domain-specific stop words and the wine variety names.
    return [
        word
        for word in lowered
        if word not in domain_specific_stopwords and word not in variety_stopwords
    ]

In [None]:
# We do not need word vectors here, so we can load the small English model from spaCy.
# The 'tagger', 'parser' and 'ner' pipeline components are disabled in the call below.
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner'])

In [None]:
time0 = time.time()

tfidf_vector = TfidfVectorizer(tokenizer = tokenize_and_clean, # using our custom tokenizer
                               ngram_range=(1,1),
                               max_df=0.95, # ignore terms that have a doc freq higher than max_df (corpus-specific stopwords)
                               min_df=10, # ignore terms that have a doc freq lower than threshold.
                               max_features=2000
                            )
# Applying the vectorizer:
wine_tfidf = tfidf_vector.fit_transform(winedata.description) # input: the column "description"

# Getting the word list.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old method on older installations.
if hasattr(tfidf_vector, 'get_feature_names_out'):
    terms = tfidf_vector.get_feature_names_out().tolist()
else:
    terms = tfidf_vector.get_feature_names()

print('Done! it took', time.time()-time0, 'seconds.')

## Trying Topic Modelling

In [None]:
# How many topics each model should extract.
ntopics = 5

# How many of the highest-loading words to show for each topic.
n_top_words = 10

# One row per topic; each model later adds its own column of top words.
topwords = pd.DataFrame(index=range(ntopics))


### LSA and NMF

In [None]:
# Linking words to topics
def word_topic(tfidf, solution, wordlist):
    """Compute the loading of every word on every topic/component.

    Parameters
    ----------
    tfidf : matrix of shape (n_documents, n_words)
        Document-term matrix, sparse or dense.
    solution : array of shape (n_documents, n_topics)
        Document-topic matrix returned by the fitted model.
    wordlist : sequence of str
        The words, in the same order as the columns of ``tfidf``.

    Returns
    -------
    pandas.DataFrame
        One row per word (indexed by ``wordlist``) and one column per topic.
    """
    # Use the explicit matrix-product operator: `*` is only a matrix product
    # for scipy.sparse *matrix* types and is element-wise for dense arrays.
    words_by_topic = tfidf.T @ solution

    # Linking the loadings to the words in an easy-to-read way.
    components = pd.DataFrame(words_by_topic, index=wordlist)

    return components

In [None]:
# Extracts the top N words and their loadings for each topic.
def top_words(components, n_top_words):
    """Return the highest-loading words of every topic, formatted for display.

    Parameters
    ----------
    components : pandas.DataFrame
        One row per word (the index holds the words) and one column per topic.
    n_top_words : int
        Number of words to keep for each topic.

    Returns
    -------
    list of pandas.Series
        One Series per topic, in column order. Each value is the word followed
        by two spaces and its loading rounded to two decimals.
    """
    fullist = []

    for column in range(components.shape[1]):
        # Sort the column so that highest loadings are at the top.
        sortedwords = components.iloc[:, column].sort_values(ascending=False)
        # Choose the N highest loadings (positional slice).
        chosen = sortedwords.iloc[:n_top_words]
        # Combine word and loading into a single string.
        fullist.append(chosen.index + "  " + chosen.round(2).map(str))

    return fullist

In [None]:
# LSA

time0 = time.time()

# random_state is fixed, as for NMF and LDA in this notebook, so that
# re-running the notebook reproduces the same topics.
svd = TruncatedSVD(n_components=ntopics, random_state=0)
# The SVD output is normalized so every document vector has unit norm.
lsa = make_pipeline(svd, Normalizer(copy=False))
wine_lsa = lsa.fit_transform(wine_tfidf)

components_lsa = word_topic(wine_tfidf, wine_lsa, terms)

topwords['LSA'] = top_words(components_lsa, n_top_words)

print('Done! It took', time.time()-time0, 'seconds.')


In [None]:
# NNMF
time0 = time.time()

# The LSA model is fitted in its own cell; it is deliberately not refitted
# here, so the timing below measures NMF only and `wine_lsa` /
# `components_lsa` stay the ones that `topwords['LSA']` was built from.

# No `alpha` argument is passed: it was removed from NMF in scikit-learn 1.2,
# and regularization is off by default, which matches the former alpha=0.0.
nmf = NMF(init='nndsvdar', # how starting values are calculated
          l1_ratio=0.0, # Sets whether regularization is L2 (0), L1 (1), or a combination (values between 0 and 1)
          max_iter=200, # when to stop even if the model is not converging (to prevent running forever)
          n_components=ntopics,
          random_state=0,
          solver='cd', # Use Coordinate Descent to solve
          tol=0.0001, # tolerance of the stopping condition
          verbose=0 # amount of output to give while iterating
         )
wine_nmf = nmf.fit_transform(wine_tfidf)

components_nmf = word_topic(wine_tfidf, wine_nmf, terms)

topwords['NMF'] = top_words(components_nmf, n_top_words)

print('Done! It took', time.time()-time0, 'seconds.')


In [None]:
# Show the LSA and NMF top words of every topic, one topic after the other.
for topic in range(ntopics):
    print('topic', topic, ':\nLSA:\n', topwords.LSA[topic], '\nNMF:\n', topwords.NMF[topic], '\n')


### LDA

In [None]:
# LDA

time0 = time.time()

lda = LatentDirichletAllocation(
    n_components=ntopics,
    random_state=0,
    n_jobs=-1,  # use all available CPUs
    verbose=0,  # no progress output while iterating
    # Priors:
    doc_topic_prior=None,  # None = scikit-learn's default prior (documented as 1 / n_components)
    topic_word_prior=1/ntopics,
    # Learning schedule (documented by scikit-learn as online-learning parameters):
    learning_decay=0.7,
    learning_offset=10.0,  # down-weights early iterations
    # Stopping criteria:
    max_iter=10,  # hard cap on passes, even if the model has not converged
    evaluate_every=-1,  # never evaluate perplexity during fitting, as it slows training
    mean_change_tol=0.001,  # E-step stops updating the document topic distribution below this mean change
    max_doc_update_iter=100,  # E-step iteration cap if the tolerance is not reached
)
wine_lda = lda.fit(wine_tfidf)

print('Done! It took', time.time()-time0, 'seconds.')


Print topics:


## pyLDAvis

A good topic model will have fairly large, non-overlapping blobs for each topic.

In [None]:
# Interactive topic map for the fitted LDA model (mds='tsne' is an alternative layout).
pyLDAvis.sklearn.prepare(
    wine_lda,
    wine_tfidf,
    tfidf_vector,
    mds='PCoA',
)

In [None]:
time0 = time.time()

# Log-likelihood of the corpus under the model: the higher, the better.
log_likelihood = wine_lda.score(wine_tfidf)
print("Log Likelihood: ", log_likelihood)

# Perplexity = exp(-1. * log-likelihood per word): the lower, the better.
perplexity = wine_lda.perplexity(wine_tfidf)
print("Perplexity: ", perplexity)

print('Done! it took', time.time()-time0, 'sec.')

# Show the parameters the model was built with.
wine_lda.get_params()



In [14]:

# Persist the fitted LDA model to disk.
# NOTE(review): the "10t" in the file name is hard-coded and not derived from the model - confirm it matches.
pkl_filename = "../data/pickle_LDA_10t.pkl"
with open(pkl_filename, mode='wb') as pkl_file:
    pkl_file.write(pickle.dumps(wine_lda))


NameError: name 'wine_lda' is not defined

In [4]:
# RUN THIS TO LOAD A MODEL:
# # Load from file
# import pickle

# pkl_filename = "../data/pickle_LDA_10t.pkl"
# with open(pkl_filename, 'rb') as file:
#     wine_lda = pickle.load(file)
    
# wine_lda

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=8, n_jobs=-1,
                          perp_tol=0.1, random_state=0, topic_word_prior=0.125,
                          total_samples=1000000.0, verbose=0)

#### Try different number of topics


In [None]:
# Takes 15min !!
time0 = time.time()

all_ntopics = [4,5,6,7,8,20]

# One row per candidate number of topics; the loop below fills in the fitted
# model ('LDA'), its log-likelihood ('LL') and its perplexity.
results = pd.DataFrame(all_ntopics, columns=['ntopics'], index=all_ntopics)

# The loop variable is deliberately NOT called `ntopics`: that name holds the
# notebook-wide topic count, and reusing it here would silently leave it at
# the last candidate value for any cell that is (re-)run afterwards.
for n_candidate in all_ntopics:
    lda = LatentDirichletAllocation(n_components=n_candidate,
          doc_topic_prior=None, # None = scikit-learn's default prior (documented as 1 / n_components)
          topic_word_prior=1/n_candidate,
          learning_decay=0.7, # learning-rate control (documented as an online-learning parameter)
          learning_offset=10.0, # Causes earlier iterations to have less influence on the learning
          max_iter=10, # when to stop even if the model is not converging (to prevent running forever)
          evaluate_every=-1, # Do not evaluate perplexity, as it slows training time.
          mean_change_tol=0.001, # Stop updating the document topic distribution in the E-step when mean change is < tol
          max_doc_update_iter=100, # When to stop updating the document topic distribution in the E-step even if tol is not reached
          n_jobs=-1, # Use all available CPUs to speed up processing time.
          verbose=0, # amount of output to give while iterating
          random_state=0
         )
    fitted_lda = lda.fit(wine_tfidf)

    results.loc[n_candidate, 'LDA'] = fitted_lda
    results.loc[n_candidate, 'LL'] = fitted_lda.score(wine_tfidf)
    results.loc[n_candidate, 'perplexity'] = fitted_lda.perplexity(wine_tfidf)

print('Done! It took', time.time()-time0, 'seconds.')


In [None]:
# Overview of results: one row per candidate number of topics, with the
# 'ntopics', 'LDA', 'LL' and 'perplexity' columns filled in by the loop above.
results

In [16]:
# SAVE ALL THESE OUTPUTS !!!

# The loop variable is not called `ntopics`, so the notebook-wide topic count
# of that name is not overwritten by this cell.
for n_candidate in all_ntopics:
    # Save to file, one pickle per candidate number of topics:
    pkl_filename = '../data/pickle_LDA_' + str(n_candidate) + 't.pkl'
    with open(pkl_filename, 'wb') as file:
        pickle.dump(results.loc[n_candidate, 'LDA'], file)

../data/pickle_LDA_4t.pkl
../data/pickle_LDA_5t.pkl
../data/pickle_LDA_6t.pkl
../data/pickle_LDA_7t.pkl
../data/pickle_LDA_8t.pkl
../data/pickle_LDA_20t.pkl


In [None]:
# Save to file:
# NOTE(review): this pickles `wine_lda` (not the 6-topic model) under a
# hard-coded "10t" file name - confirm that this is intended.
pkl_filename = "../data/pickle_LDA_10t.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(wine_lda, file)

# 6 topics:
# A notebook cell only renders the value of its LAST expression, so the
# visualisation has to come after the save step to be displayed at all.
pyLDAvis.sklearn.prepare(results.loc[6, 'LDA'], wine_tfidf, tfidf_vector, mds='PCoA') # or mds='mmds'


In [None]:
# Topic map of the 7-topic LDA model (mds='mmds' is an alternative layout).
pyLDAvis.sklearn.prepare(
    results.loc[7, 'LDA'],
    wine_tfidf,
    tfidf_vector,
    mds='PCoA',
)

In [None]:
# Topic map of the 8-topic LDA model (mds='mmds' is an alternative layout).
pyLDAvis.sklearn.prepare(
    results.loc[8, 'LDA'],
    wine_tfidf,
    tfidf_vector,
    mds='PCoA',
)

In [None]:
# Topic map of the 4-topic LDA model (mds='mmds' is an alternative layout).
pyLDAvis.sklearn.prepare(
    results.loc[4, 'LDA'],
    wine_tfidf,
    tfidf_vector,
    mds='PCoA',
)

In [None]:
# Topic map of the 5-topic LDA model (mds='mmds' is an alternative layout).
pyLDAvis.sklearn.prepare(
    results.loc[5, 'LDA'],
    wine_tfidf,
    tfidf_vector,
    mds='PCoA',
)

In [None]:
# Topic map of the 20-topic LDA model (mds='mmds' is an alternative layout).
pyLDAvis.sklearn.prepare(
    results.loc[20, 'LDA'],
    wine_tfidf,
    tfidf_vector,
    mds='PCoA',
)

## Build Features from Topics

## Similarity Matrix