# Testing Topic Modelling

This has to be done on full dataset. Let's see if my computer can handle it. Else we will move this to Google Colab.  

I first try to base the topics on the tf-idf vectors.  

In [169]:
# general:
import numpy as np
import pandas as pd

import time # will be use to choose the faster solution
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer # Normalize samples individually to unit norm.

# NLP:
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import word2vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD # LSA
from sklearn.decomposition import NMF # NMF



In [3]:
# load data:
raw_winedata = pd.read_csv("../data/winemag-data-190314.csv").drop("Unnamed: 0", axis=1) # indexes were stored as col
initial_number_of_rows = raw_winedata.shape[0]
print(raw_winedata.shape)
raw_winedata.head()


(141617, 14)


Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,vintage
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,2013.0
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011.0
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013.0
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013.0
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012.0


In [35]:
# try to keep only varieties that occur more than n times:
n = 500
variety_counts = raw_winedata.variety.value_counts()
winedata = raw_winedata[raw_winedata.variety.isin(variety_counts.index[variety_counts.gt(n)])] # pandas.DataFrame.gt = get greater
print(winedata.shape)


(122856, 14)


**Even if we strongly limit the minimal number of occurences for a variety, we still get a large dataset, and computations will be slow on a laptop.**

In [180]:
domain_specific_stopwords = ['$', ' ', '’s', 'wine', 'winemaker', 'winemaking', 'winery', 
                            '2020–2030', '2–3', '3–4', '4–5', '5–6', '6–8']

# a list of all wine varieties. this is not perfect as some varieties are compound words. But it's a start.
variety_stopwords = winedata.variety.unique().tolist()
variety_stopwords = [str(variety).lower() for variety in variety_stopwords] # convert to lowercase
variety_stopwords.append(['pinot', 'cabernet', 'sauvignon', ]) # as frequent and only happens as compound word

In [181]:
# Updating our tokenizer function:
def tokenize_and_clean(description):

    # Tokenize:
    mytokens = nlp(description)

    # Removing stopwords, punctuation and convert to lower_case + AND NUMBERS (or use is_digit?)
    mytokens = [token.lower_ for token in mytokens if not token.is_punct and not token.is_stop and not token.like_num and not token.is_digit]
    
    # remove domain-specific stopwords:
    mytokens = [token for token in mytokens if token not in domain_specific_stopwords]
            
    # remove wine variety occurrences in the descriptions:
    mytokens = [token for token in mytokens if token not in variety_stopwords]
                
    # Return preprocessed list of tokens
    return mytokens

In [182]:
# We do not need word vectors here, so we can upload the small English model from spaCy:
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner'])

In [183]:
time0 = time.time()

tfidf_vector = TfidfVectorizer(tokenizer = tokenize_and_clean, # using our custom tokenizer
                               ngram_range=(1,1),
                               max_df=0.95, # ignore t that have a df higher than max_df (corpus-specific stopwords)
                               min_df=10, # ignore terms that have a doc freq lower than threshold.
                               max_features=2000
                            )
# Applying the vectorizer:
wine_tfidf = tfidf_vector.fit_transform(winedata.description) # input: the column "description"

# Getting the word list.
terms = tfidf_vector.get_feature_names()

print('Done! it took', time.time()-time0, 'seconds.')

Done! it took 28.250622034072876 seconds.


## Trying Topic Modelling

In [212]:
# Number of topics:
ntopics=5


In [203]:
# Linking words to topics
def word_topic(tfidf, solution, wordlist):
    
    # Loading scores for each word on each topic/component.
    words_by_topic = tfidf.T * solution

    # Linking the loadings to the words in an easy-to-read way.
    components = pd.DataFrame(words_by_topic, index=wordlist)
    
    return components

In [204]:
# Extracts the top N words and their loadings for each topic.
def top_words(components, n_top_words):
    n_topics = range(components.shape[1])
    
    index = np.repeat(n_topics, n_top_words, axis=0)
    topwords = pd.Series(index=index) # initiate a Series where to store the topwords of each topic
    fullist=[] # usual code doesn't work
    
    for column in n_topics:
        # Sort the column so that highest loadings are at the top.
        sortedwords = components.iloc[:,column].sort_values(ascending=False)
        # Choose the N highest loadings.
        chosen = sortedwords[:n_top_words]
        # Combine loading and index into a string.
        chosenlist = chosen.index + "  "+ round(chosen,2).map(str)
        
        fullist.append(chosenlist)
    
#     topwords = pd.Series(fullist, index=index)
        
#         topwords.loc[column] = chosenlist
    return(fullist) # (topwords)

In [205]:
# LSA

time0 = time.time()

svd = TruncatedSVD(ntopics)
lsa = make_pipeline(svd, Normalizer(copy=False))
wine_lsa = lsa.fit_transform(wine_tfidf)

components_lsa = word_topic(wine_tfidf, wine_lsa, terms)

print('Done! It took', time.time()-time0, 'seconds.')


Done! It took 0.5152602195739746 seconds.


In [223]:
# Number of words to look at for each topic.
n_top_words = 10

topwords = pd.DataFrame(index=range(0,ntopics))
topwords['LSA'] = top_words(components_lsa, n_top_words) 
for i in range(0,ntopics):
    print('topic', i, ':\n', topwords.LSA[i], '\n')

topic 0 :
 flavors    flavors  3896.74
fruit        fruit  3627.49
aromas       aromas  2837.8
acidity    acidity  2817.14
tannins    tannins  2737.64
palate      palate  2713.16
drink        drink  2676.99
black        black  2675.56
finish      finish  2661.19
cherry      cherry  2631.99
dtype: object 

topic 1 :
 acidity       acidity  1054.01
crisp            crisp  834.95
drink            drink  772.87
fruits          fruits  752.27
fruity          fruity  660.81
ripe              ripe  618.43
ready            ready  553.68
character    character  550.89
texture        texture  546.23
apple            apple  508.76
dtype: object 

topic 2 :
 tannins        tannins  970.25
black             black  861.6
fruits          fruits  634.69
drink            drink  575.21
firm              firm  500.08
rich              rich  464.41
structure    structure  421.49
wood              wood  401.93
ripe              ripe  400.19
aging            aging  399.55
dtype: object 

topic 3 :
 red     

In [225]:
# NNMF
time0 = time.time()

svd = TruncatedSVD(ntopics)
lsa = make_pipeline(svd, Normalizer(copy=False))
wine_lsa = lsa.fit_transform(wine_tfidf)

components_lsa = word_topic(wine_tfidf, wine_lsa, terms)


nmf = NMF(alpha=0.0, 
          init='nndsvdar', # how starting value are calculated
          l1_ratio=0.0, # Sets whether regularization is L2 (0), L1 (1), or a combination (values between 0 and 1)
          max_iter=200, # when to stop even if the model is not converging (to prevent running forever)
          n_components=ntopics, 
          random_state=0, 
          solver='cd', # Use Coordinate Descent to solve
          tol=0.0001, # model will stop if tfidf-WH <= tol
          verbose=0 # amount of output to give while iterating
         )
wine_nmf = nmf.fit_transform(wine_tfidf) 

components_nmf = word_topic(wine_tfidf, wine_nmf, terms)

topwords['NMF'] = top_words(components_nmf, n_top_words) 
for i in range(0,ntopics):
    print('topic', i, ':\n', topwords.NMF[i], '\n')
    
print('Done! It took', time.time()-time0, 'seconds.')


topic 0 :
 flavors          flavors  77.92
fruit              fruit  58.52
finish            finish  52.88
oak                  oak  51.48
aromas            aromas  50.35
cherry            cherry  49.79
cabernet        cabernet  46.72
black              black  41.17
blackberry    blackberry  38.02
plum                plum  36.29
dtype: object 

topic 1 :
 ripe          ripe  81.47
tannins    tannins  79.08
drink        drink  77.45
rich          rich  74.38
fruits      fruits  71.77
fruit        fruit  68.17
black        black  63.33
acidity    acidity  58.46
wood          wood  51.16
flavors    flavors  46.23
dtype: object 

topic 2 :
 apple        apple  79.47
palate      palate  66.17
flavors    flavors  65.86
lemon        lemon  62.21
finish      finish  57.42
citrus      citrus  55.08
acidity    acidity  54.64
white        white  52.45
fruit        fruit  52.01
pear          pear  50.74
dtype: object 

topic 3 :
 black            black  90.27
cherry          cherry  84.56
palate  

In [230]:
for i in range(0,ntopics):
    print('topic', i, ':\nLSA:\n', topwords.LSA[i], '\nNMF:\n', topwords.NMF[i],'\n')


topic 0 :
LSA:
 flavors    flavors  3896.74
fruit        fruit  3627.49
aromas       aromas  2837.8
acidity    acidity  2817.14
tannins    tannins  2737.64
palate      palate  2713.16
drink        drink  2676.99
black        black  2675.56
finish      finish  2661.19
cherry      cherry  2631.99
dtype: object 
NMF:
 flavors          flavors  77.92
fruit              fruit  58.52
finish            finish  52.88
oak                  oak  51.48
aromas            aromas  50.35
cherry            cherry  49.79
cabernet        cabernet  46.72
black              black  41.17
blackberry    blackberry  38.02
plum                plum  36.29
dtype: object 

topic 1 :
LSA:
 acidity       acidity  1054.01
crisp            crisp  834.95
drink            drink  772.87
fruits          fruits  752.27
fruity          fruity  660.81
ripe              ripe  618.43
ready            ready  553.68
character    character  550.89
texture        texture  546.23
apple            apple  508.76
dtype: object 
NMF:
 

### Trying with 10 topics:

In [232]:
# Number of topics:
ntopics=10
topwords = pd.DataFrame(index=range(0,ntopics))


In [233]:
# LSA

time0 = time.time()

svd = TruncatedSVD(ntopics)
lsa = make_pipeline(svd, Normalizer(copy=False))
wine_lsa = lsa.fit_transform(wine_tfidf)

components_lsa = word_topic(wine_tfidf, wine_lsa, terms)

topwords['LSA'] = top_words(components_lsa, n_top_words) 

print('Done! It took', time.time()-time0, 'seconds.')


Done! It took 0.7150518894195557 seconds.


In [234]:
# NNMF
time0 = time.time()

svd = TruncatedSVD(ntopics)
lsa = make_pipeline(svd, Normalizer(copy=False))
wine_lsa = lsa.fit_transform(wine_tfidf)

components_lsa = word_topic(wine_tfidf, wine_lsa, terms)


nmf = NMF(alpha=0.0, 
          init='nndsvdar', # how starting value are calculated
          l1_ratio=0.0, # Sets whether regularization is L2 (0), L1 (1), or a combination (values between 0 and 1)
          max_iter=200, # when to stop even if the model is not converging (to prevent running forever)
          n_components=ntopics, 
          random_state=0, 
          solver='cd', # Use Coordinate Descent to solve
          tol=0.0001, # model will stop if tfidf-WH <= tol
          verbose=0 # amount of output to give while iterating
         )
wine_nmf = nmf.fit_transform(wine_tfidf) 

components_nmf = word_topic(wine_tfidf, wine_nmf, terms)

topwords['NMF'] = top_words(components_nmf, n_top_words) 
    
print('Done! It took', time.time()-time0, 'seconds.')


Done! It took 6.698852300643921 seconds.


Print topics:


In [235]:
for i in range(0,ntopics):
    print('topic', i, ':\nLSA:\n', topwords.LSA[i], '\nNMF:\n', topwords.NMF[i],'\n')


topic 0 :
LSA:
 flavors    flavors  3331.71
fruit        fruit  3090.19
acidity    acidity  2500.82
aromas      aromas  2450.51
tannins    tannins  2409.47
palate       palate  2392.6
drink        drink  2385.18
black        black  2319.52
finish      finish  2296.57
cherry      cherry  2273.88
dtype: object 
NMF:
 ripe          ripe  53.79
fruits      fruits  53.57
drink        drink  52.68
tannins    tannins  48.09
rich          rich  47.53
acidity    acidity  39.37
fruit        fruit  32.68
wood           wood  32.6
aging        aging  30.37
black         black  30.3
dtype: object 

topic 1 :
LSA:
 acidity        acidity  965.86
crisp            crisp  765.52
drink            drink  714.06
fruits          fruits  703.13
fruity          fruity  609.78
ripe              ripe  569.69
ready            ready  508.68
character    character  503.28
texture        texture  502.83
apple            apple  469.45
dtype: object 
NMF:
 acidity    acidity  86.57
drink        drink  77.59
fruity  

In [236]:
import pyLDAvis