# Testing Topic Modelling

This has to be done on full dataset. Let's see if my computer can handle it. Else we will move this to Google Colab.  

I first try to base the topics on the tf-idf vectors.  

In [299]:
# general:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import time # will be use to choose the faster solution
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer # Normalize samples individually to unit norm.

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

import sklearn

# NLP:
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import word2vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD # LSA
from sklearn.decomposition import NMF # NMF
from sklearn.decomposition import LatentDirichletAllocation # LDA




In [289]:
# load data:
raw_winedata = pd.read_csv("../data/winemag-data-190314.csv").drop("Unnamed: 0", axis=1) # indexes were stored as col
initial_number_of_rows = raw_winedata.shape[0]
print(raw_winedata.shape)
raw_winedata.head()


(141617, 14)


Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,vintage
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,2013.0
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011.0
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013.0
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013.0
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012.0


In [290]:
# try to keep only varieties that occur more than n times:
n = 500
variety_counts = raw_winedata.variety.value_counts()
winedata = raw_winedata[raw_winedata.variety.isin(variety_counts.index[variety_counts.gt(n)])] # pandas.DataFrame.gt = get greater
print(winedata.shape)


(122856, 14)


**Even if we strongly limit the minimal number of occurences for a variety, we still get a large dataset, and computations will be slow on a laptop.**

In [291]:
domain_specific_stopwords = ['$', ' ', '’s', 'wine', 'winemaker', 'winemaking', 'winery', 
                            '2020–2030', '2–3', '3–4', '4–5', '5–6', '6–8']

# a list of all wine varieties. this is not perfect as some varieties are compound words. But it's a start.
variety_stopwords = winedata.variety.unique().tolist()
variety_stopwords = [str(variety).lower() for variety in variety_stopwords] # convert to lowercase
variety_stopwords.append(['pinot', 'cabernet', 'sauvignon', ]) # as frequent and only happens as compound word

In [292]:
# Updating our tokenizer function:
def tokenize_and_clean(description):

    # Tokenize:
    mytokens = nlp(description)

    # Removing stopwords, punctuation and convert to lower_case + AND NUMBERS (or use is_digit?)
    mytokens = [token.lower_ for token in mytokens if not token.is_punct and not token.is_stop and not token.like_num and not token.is_digit]
    
    # remove domain-specific stopwords:
    mytokens = [token for token in mytokens if token not in domain_specific_stopwords]
            
    # remove wine variety occurrences in the descriptions:
    mytokens = [token for token in mytokens if token not in variety_stopwords]
                
    # Return preprocessed list of tokens
    return mytokens

In [293]:
# We do not need word vectors here, so we can upload the small English model from spaCy:
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner'])

In [294]:
time0 = time.time()

tfidf_vector = TfidfVectorizer(tokenizer = tokenize_and_clean, # using our custom tokenizer
                               ngram_range=(1,1),
                               max_df=0.95, # ignore t that have a df higher than max_df (corpus-specific stopwords)
                               min_df=10, # ignore terms that have a doc freq lower than threshold.
                               max_features=2000
                            )
# Applying the vectorizer:
wine_tfidf = tfidf_vector.fit_transform(winedata.description) # input: the column "description"

# Getting the word list.
terms = tfidf_vector.get_feature_names()

print('Done! it took', time.time()-time0, 'seconds.')

Done! it took 28.98251485824585 seconds.


## Trying Topic Modelling

In [212]:
# Number of topics:
ntopics=5


In [203]:
# Linking words to topics
def word_topic(tfidf, solution, wordlist):
    
    # Loading scores for each word on each topic/component.
    words_by_topic = tfidf.T * solution

    # Linking the loadings to the words in an easy-to-read way.
    components = pd.DataFrame(words_by_topic, index=wordlist)
    
    return components

In [204]:
# Extracts the top N words and their loadings for each topic.
def top_words(components, n_top_words):
    n_topics = range(components.shape[1])
    
    index = np.repeat(n_topics, n_top_words, axis=0)
    topwords = pd.Series(index=index) # initiate a Series where to store the topwords of each topic
    fullist=[] # usual code doesn't work
    
    for column in n_topics:
        # Sort the column so that highest loadings are at the top.
        sortedwords = components.iloc[:,column].sort_values(ascending=False)
        # Choose the N highest loadings.
        chosen = sortedwords[:n_top_words]
        # Combine loading and index into a string.
        chosenlist = chosen.index + "  "+ round(chosen,2).map(str)
        
        fullist.append(chosenlist)
    
#     topwords = pd.Series(fullist, index=index)
        
#         topwords.loc[column] = chosenlist
    return(fullist) # (topwords)

In [205]:
# LSA

time0 = time.time()

svd = TruncatedSVD(ntopics)
lsa = make_pipeline(svd, Normalizer(copy=False))
wine_lsa = lsa.fit_transform(wine_tfidf)

components_lsa = word_topic(wine_tfidf, wine_lsa, terms)

print('Done! It took', time.time()-time0, 'seconds.')


Done! It took 0.5152602195739746 seconds.


In [223]:
# Number of words to look at for each topic.
n_top_words = 10

topwords = pd.DataFrame(index=range(0,ntopics))
topwords['LSA'] = top_words(components_lsa, n_top_words) 
for i in range(0,ntopics):
    print('topic', i, ':\n', topwords.LSA[i], '\n')

topic 0 :
 flavors    flavors  3896.74
fruit        fruit  3627.49
aromas       aromas  2837.8
acidity    acidity  2817.14
tannins    tannins  2737.64
palate      palate  2713.16
drink        drink  2676.99
black        black  2675.56
finish      finish  2661.19
cherry      cherry  2631.99
dtype: object 

topic 1 :
 acidity       acidity  1054.01
crisp            crisp  834.95
drink            drink  772.87
fruits          fruits  752.27
fruity          fruity  660.81
ripe              ripe  618.43
ready            ready  553.68
character    character  550.89
texture        texture  546.23
apple            apple  508.76
dtype: object 

topic 2 :
 tannins        tannins  970.25
black             black  861.6
fruits          fruits  634.69
drink            drink  575.21
firm              firm  500.08
rich              rich  464.41
structure    structure  421.49
wood              wood  401.93
ripe              ripe  400.19
aging            aging  399.55
dtype: object 

topic 3 :
 red     

In [225]:
# NNMF
time0 = time.time()

svd = TruncatedSVD(ntopics)
lsa = make_pipeline(svd, Normalizer(copy=False))
wine_lsa = lsa.fit_transform(wine_tfidf)

components_lsa = word_topic(wine_tfidf, wine_lsa, terms)


nmf = NMF(alpha=0.0, 
          init='nndsvdar', # how starting value are calculated
          l1_ratio=0.0, # Sets whether regularization is L2 (0), L1 (1), or a combination (values between 0 and 1)
          max_iter=200, # when to stop even if the model is not converging (to prevent running forever)
          n_components=ntopics, 
          random_state=0, 
          solver='cd', # Use Coordinate Descent to solve
          tol=0.0001, # model will stop if tfidf-WH <= tol
          verbose=0 # amount of output to give while iterating
         )
wine_nmf = nmf.fit_transform(wine_tfidf) 

components_nmf = word_topic(wine_tfidf, wine_nmf, terms)

topwords['NMF'] = top_words(components_nmf, n_top_words) 
for i in range(0,ntopics):
    print('topic', i, ':\n', topwords.NMF[i], '\n')
    
print('Done! It took', time.time()-time0, 'seconds.')


topic 0 :
 flavors          flavors  77.92
fruit              fruit  58.52
finish            finish  52.88
oak                  oak  51.48
aromas            aromas  50.35
cherry            cherry  49.79
cabernet        cabernet  46.72
black              black  41.17
blackberry    blackberry  38.02
plum                plum  36.29
dtype: object 

topic 1 :
 ripe          ripe  81.47
tannins    tannins  79.08
drink        drink  77.45
rich          rich  74.38
fruits      fruits  71.77
fruit        fruit  68.17
black        black  63.33
acidity    acidity  58.46
wood          wood  51.16
flavors    flavors  46.23
dtype: object 

topic 2 :
 apple        apple  79.47
palate      palate  66.17
flavors    flavors  65.86
lemon        lemon  62.21
finish      finish  57.42
citrus      citrus  55.08
acidity    acidity  54.64
white        white  52.45
fruit        fruit  52.01
pear          pear  50.74
dtype: object 

topic 3 :
 black            black  90.27
cherry          cherry  84.56
palate  

In [230]:
for i in range(0,ntopics):
    print('topic', i, ':\nLSA:\n', topwords.LSA[i], '\nNMF:\n', topwords.NMF[i],'\n')


topic 0 :
LSA:
 flavors    flavors  3896.74
fruit        fruit  3627.49
aromas       aromas  2837.8
acidity    acidity  2817.14
tannins    tannins  2737.64
palate      palate  2713.16
drink        drink  2676.99
black        black  2675.56
finish      finish  2661.19
cherry      cherry  2631.99
dtype: object 
NMF:
 flavors          flavors  77.92
fruit              fruit  58.52
finish            finish  52.88
oak                  oak  51.48
aromas            aromas  50.35
cherry            cherry  49.79
cabernet        cabernet  46.72
black              black  41.17
blackberry    blackberry  38.02
plum                plum  36.29
dtype: object 

topic 1 :
LSA:
 acidity       acidity  1054.01
crisp            crisp  834.95
drink            drink  772.87
fruits          fruits  752.27
fruity          fruity  660.81
ripe              ripe  618.43
ready            ready  553.68
character    character  550.89
texture        texture  546.23
apple            apple  508.76
dtype: object 
NMF:
 

### Trying with 10 topics:

In [232]:
# Number of topics:
ntopics=10
topwords = pd.DataFrame(index=range(0,ntopics))


In [233]:
# LSA

time0 = time.time()

svd = TruncatedSVD(ntopics)
lsa = make_pipeline(svd, Normalizer(copy=False))
wine_lsa = lsa.fit_transform(wine_tfidf)

components_lsa = word_topic(wine_tfidf, wine_lsa, terms)

topwords['LSA'] = top_words(components_lsa, n_top_words) 

print('Done! It took', time.time()-time0, 'seconds.')


Done! It took 0.7150518894195557 seconds.


In [234]:
# NNMF
time0 = time.time()

nmf = NMF(alpha=0.0, 
          init='nndsvdar', # how starting value are calculated
          l1_ratio=0.0, # Sets whether regularization is L2 (0), L1 (1), or a combination (values between 0 and 1)
          max_iter=200, # when to stop even if the model is not converging (to prevent running forever)
          n_components=ntopics, 
          random_state=0, 
          solver='cd', # Use Coordinate Descent to solve
          tol=0.0001, # model will stop if tfidf-WH <= tol
          verbose=0 # amount of output to give while iterating
         )
wine_nmf = nmf.fit_transform(wine_tfidf) 

components_nmf = word_topic(wine_tfidf, wine_nmf, terms)

topwords['NMF'] = top_words(components_nmf, n_top_words) 
    
print('Done! It took', time.time()-time0, 'seconds.')


Done! It took 6.698852300643921 seconds.


In [272]:
# LDA

time0 = time.time()

lda = LatentDirichletAllocation(n_components=ntopics, 
          doc_topic_prior=None, # Prior = 1/n_documents
          topic_word_prior=1/ntopics,
          learning_decay=0.7, # Convergence rate.
          learning_offset=10.0, # Causes earlier iterations to have less influence on the learning
          max_iter=10, # when to stop even if the model is not converging (to prevent running forever)
          evaluate_every=-1, # Do not evaluate perplexity, as it slows training time.
          mean_change_tol=0.001, # Stop updating the document topic distribution in the E-step when mean change is < tol
          max_doc_update_iter=100, # When to stop updating the document topic distribution in the E-step even if tol is not reached
          n_jobs=-1, # Use all available CPUs to speed up processing time.
          verbose=0, # amount of output to give while iterating
          random_state=0
         )
wine_lda = lda.fit(wine_tfidf)

print('Done! It took', time.time()-time0, 'seconds.')


TypeError: unsupported operand type(s) for *: 'csc_matrix' and 'LatentDirichletAllocation'

In [247]:
components_lda = word_topic(wine_tfidf, wine_lda, terms)
# Linking words to topics
def word_topic(tfidf, solution, wordlist):
    
    # Loading scores for each word on each topic/component.
    words_by_topic = tfidf.T * solution

    # Linking the loadings to the words in an easy-to-read way.
    components = pd.DataFrame(words_by_topic, index=wordlist)
    
    return components

words_by_topic = wine_tfidf.T * [wine_lda]

TypeError: unsupported operand type(s) for *: 'csc_matrix' and 'LatentDirichletAllocation'

In [278]:
wine_nmf.components_

AttributeError: 'numpy.ndarray' object has no attribute 'components_'

Print topics:


In [302]:
# def display_topics(model, feature_names, no_top_words):
#     for topic_idx, topic in enumerate(model.components_):
#         print("Topic:", topic_idx, " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

# no_top_words = 10
# print('LDA: ')
# display_topics(lda, terms, n_top_words)
# print('\nNMF: ')
# display_topics(nmf, terms, n_top_words)
# print('\nLDA: ')
# display_topics(lsa, terms, n_top_words)

LDA: 
Topic: 0 flavors fruit sweet peach light aromas apple citrus white pineapple
Topic: 1 tannins palate alongside aromas cherry black offers berry red opens
Topic: 2 cabernet sauvignon flavors fruit blend tannins oak petit franc verdot
Topic: 3 nose black cherry aromas palate bottling spice fruit dried red
Topic: 4 aromas flavors berry finish plum blackberry oak palate black fruit
Topic: 5 flavors finish aromas green herbal berry palate plum feels fruit
Topic: 6 flavors cherry pinot bodied cherries tannins drink dry soft sweet
Topic: 7 lemon apple palate lime pear nose finish peach acidity citrus
Topic: 8 drink acidity fruits ripe tannins rich fruit character fruity wood
Topic: 9 fruit vineyard cherry black oak bodied finish flavors tannin red

NMF: 
Topic: 0 fruits ripe rich drink tannins aging wood structure firm age
Topic: 1 fruity acidity crisp drink light ready red soft bright attractive
Topic: 2 apple lemon white pear citrus palate peach lime nose green
Topic: 3 cherry palate 

AttributeError: 'Pipeline' object has no attribute 'components_'

In [235]:
for i in range(0, ntopics):
    print('topic', i, ':\nLSA:\n', topwords.LSA[i], '\nNMF:\n', topwords.NMF[i],'\nLDA:\n', topwords.LDA[i],'\n')


topic 0 :
LSA:
 flavors    flavors  3331.71
fruit        fruit  3090.19
acidity    acidity  2500.82
aromas      aromas  2450.51
tannins    tannins  2409.47
palate       palate  2392.6
drink        drink  2385.18
black        black  2319.52
finish      finish  2296.57
cherry      cherry  2273.88
dtype: object 
NMF:
 ripe          ripe  53.79
fruits      fruits  53.57
drink        drink  52.68
tannins    tannins  48.09
rich          rich  47.53
acidity    acidity  39.37
fruit        fruit  32.68
wood           wood  32.6
aging        aging  30.37
black         black  30.3
dtype: object 

topic 1 :
LSA:
 acidity        acidity  965.86
crisp            crisp  765.52
drink            drink  714.06
fruits          fruits  703.13
fruity          fruity  609.78
ripe              ripe  569.69
ready            ready  508.68
character    character  503.28
texture        texture  502.83
apple            apple  469.45
dtype: object 
NMF:
 acidity    acidity  86.57
drink        drink  77.59
fruity  

## pyLDAvis

A good topic model will have non-overlapping, fairly big sized blobs for each topic.

In [301]:
pyLDAvis.sklearn.prepare(wine_lda, wine_tfidf, tfidf_vector, mds='mmds', sort=True) # try also mds='tsne'

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


**Other keywords to consider removing when cleaning test:**   
cabernet, cabernets, château, village, beaujolais, domaine, côte, saint, village, parcel, parcels, douro, nacional, I, widely, muscat, grigio, now-2015, now-2018, now-2025, willamette, sirah, bordeaux, pommard, rioja, barbaresco, chianti

In [307]:
time0 = time.time()

# Log Likelyhood: Higher the better
print("Log Likelihood: ", wine_lda.score(wine_tfidf))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", wine_lda.perplexity(wine_tfidf))

print('Done! it took', time.time()-time0, 'sec.')

# See model parameters
wine_lda.get_params()



Log Likelihood:  -3808820.7382666515
Perplexity:  1627.063764054897
Done! it took 23.39322590827942 sec.


{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'batch',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 10,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 0,
 'topic_word_prior': 0.1,
 'total_samples': 1000000.0,
 'verbose': 0}

## Try different number of topics



In [332]:
# Takes 15min !!
time0 = time.time()

all_ntopics = [5,8,10,15,20]
# all_ntopics = [5]

results = pd.DataFrame(all_ntopics, columns=['ntopics'], index=all_ntopics)

for ntopics in all_ntopics:
    lda = LatentDirichletAllocation(n_components=ntopics, 
          doc_topic_prior=None, # Prior = 1/n_documents
          topic_word_prior=1/ntopics,
          learning_decay=0.7, # Convergence rate.
          learning_offset=10.0, # Causes earlier iterations to have less influence on the learning
          max_iter=10, # when to stop even if the model is not converging (to prevent running forever)
          evaluate_every=-1, # Do not evaluate perplexity, as it slows training time.
          mean_change_tol=0.001, # Stop updating the document topic distribution in the E-step when mean change is < tol
          max_doc_update_iter=100, # When to stop updating the document topic distribution in the E-step even if tol is not reached
          n_jobs=-1, # Use all available CPUs to speed up processing time.
          verbose=0, # amount of output to give while iterating
          random_state=0
         )
    wine_lda = lda.fit(wine_tfidf)
    
    results.loc[ntopics, 'LL'] = wine_lda.score(wine_tfidf)
    results.loc[ntopics, 'perplexity'] = wine_lda.perplexity(wine_tfidf)

print('Done! It took', time.time()-time0, 'seconds.')


Done! It took 614.6727170944214 seconds.


In [334]:
results

Unnamed: 0,ntopics,LL,perplexity
5,5,-3741020.0,1426.390323
8,8,-3786674.0,1558.59007
10,10,-3808821.0,1627.063764
15,15,-3854569.0,1778.184661
20,20,-3902305.0,1950.859275


In [None]:
# Takes 15min !!
time0 = time.time()

all_ntopics = [3,4,5,6,7,8]

results = pd.DataFrame(all_ntopics, columns=['ntopics'], index=all_ntopics)

for ntopics in all_ntopics:
    lda = LatentDirichletAllocation(n_components=ntopics, 
          doc_topic_prior=None, # Prior = 1/n_documents
          topic_word_prior=1/ntopics,
          learning_decay=0.7, # Convergence rate.
          learning_offset=10.0, # Causes earlier iterations to have less influence on the learning
          max_iter=10, # when to stop even if the model is not converging (to prevent running forever)
          evaluate_every=-1, # Do not evaluate perplexity, as it slows training time.
          mean_change_tol=0.001, # Stop updating the document topic distribution in the E-step when mean change is < tol
          max_doc_update_iter=100, # When to stop updating the document topic distribution in the E-step even if tol is not reached
          n_jobs=-1, # Use all available CPUs to speed up processing time.
          verbose=0, # amount of output to give while iterating
          random_state=0
         )
    results.loc[ntopics, 'LDA'] = lda.fit(wine_tfidf)
    
    results.loc[ntopics, 'LL'] = wine_lda.score(wine_tfidf)
    results.loc[ntopics, 'perplexity'] = wine_lda.perplexity(wine_tfidf)

print('Done! It took', time.time()-time0, 'seconds.')


In [333]:
results

Unnamed: 0,ntopics,LL,perplexity
5,5,-3741020.0,1426.390323
8,8,-3786674.0,1558.59007
10,10,-3808821.0,1627.063764
15,15,-3854569.0,1778.184661
20,20,-3902305.0,1950.859275


In [None]:
pyLDAvis.sklearn.prepare(wine_lda, wine_tfidf, tfidf_vector, mds='mmds', sort=True)

## Coherence of Model

Largest coherence -> good number of features.

## Build Features from Topics

## Similarity Matrix