In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud, STOPWORDS
# import textblob
# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import string
import re
import spacy
import gensim
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('../topic_data.csv')

In [3]:
df_copy = df.copy()

In [4]:
df_copy.describe()

Unnamed: 0,year,month
count,17640.0,17640.0
mean,2021.114172,6.060488
std,0.814506,3.19634
min,2018.0,1.0
25%,2021.0,3.0
50%,2021.0,6.0
75%,2022.0,9.0
max,2022.0,12.0


In [5]:
df_copy.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17640 entries, 0 to 17639
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   the_review  17640 non-null  object
 1   year        17640 non-null  int64 
 2   month       17640 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.9 MB


**Memory management**

In [6]:
# Memory management
def memory_magment(df):
    """Downcast numeric columns of ``df`` to the smallest dtype that fits their values.

    Signed-integer columns are narrowed to int8/int16/int32/int64 and float
    columns to float16/float32/float64, based on each column's observed
    minimum and maximum. Every other column (text, bool, unsigned integers,
    datetimes, pandas extension dtypes) and every empty column is left as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place; pass a copy to keep the original.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its numeric columns downcast.
    """
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # The old ``col_type != np.object`` test breaks on NumPy >= 1.24 (alias
        # removed) and let bool/datetime columns fall through to the float branch.
        # Only plain NumPy dtypes are considered; empty columns have no min/max.
        if not isinstance(col_type, np.dtype) or len(df[col]) == 0:
            continue

        if np.issubdtype(col_type, np.signedinteger):
            c_min, c_max = int(df[col].min()), int(df[col].max())
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: -128 and 127 are valid int8 values.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif np.issubdtype(col_type, np.floating):
            c_min, c_max = df[col].min(), df[col].max()
            # NOTE(review): float16 keeps only ~3 significant digits; drop it from
            # float_candidates if the float columns need more precision.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or the column is all-NaN).
                df[col] = df[col].astype(np.float64)
    return df
# memory_magment modifies the frame it receives, so downcast a copy and keep
# df_copy with its original dtypes for the before/after comparison.
df_memory = df_copy.copy()
df_memory = memory_magment(df_memory)    

In [7]:
df_memory.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17640 entries, 0 to 17639
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   the_review  17640 non-null  object
 1   year        17640 non-null  int16 
 2   month       17640 non-null  int8  
dtypes: int16(1), int8(1), object(1)
memory usage: 4.7 MB


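Memory reduced from 4.9 MB --> 4.7 MB (see the two `info(memory_usage='deep')` outputs above). The saving is small because only the `year` and `month` columns were downcast; the `the_review` text column keeps its `object` dtype.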

**Preprocessing the data to obtain the tokens.**

- The `cleaned_text` version of each review (created below) will be used for this process.

In [8]:
df_memory.sample(n=2,random_state=42)

Unnamed: 0,the_review,year,month
828,It turns on but the left button does not work,2022,6
1097,We bought this for my son to use and it lasted...,2022,5


In [10]:
df_memory.year.value_counts()

2021    8785
2022    5940
2020    1926
2019     967
2018      22
Name: year, dtype: int64

Based on the above cell:

| year | document count |
| --- | ----------- |
| 2021 | 8785 |
| 2022 | 5940 |
| 2020 | 1926 |
| 2019 | 967 |
| 2018 | 22 |

In [12]:
df_memory.month.value_counts()

3     2475
7     1772
4     1717
8     1696
5     1677
6     1376
10    1351
1     1309
9     1308
2     1126
12     995
11     838
Name: month, dtype: int64

Based on the above cell:

| month | document count |
| --- | ----------- |
| 3 | 2475 |
| 7 | 1772 |
| 4 | 1717 |
| 8 | 1696 |
| 5 | 1677 |
| 6 | 1376 |
| 10 | 1351 |
| 1 | 1309 |
| 9 | 1308 |
| 2 | 1126 |
| 12 | 995 |
| 11 | 838 |

**Preprocess the reviews**

In [13]:
df_memory.head()

Unnamed: 0,the_review,year,month
0,I wish I would have gotten one earlier. I love...,2022,10
1,I've learned this lesson (again). Open the pac...,2022,10
2,It is so slow and lags find a better option,2022,10
3,Roller ball stopped working within 4 months of...,2022,10
4,I like the color and size but it’s a few days ...,2022,10


In [14]:
def preprocess(sent):
    '''Normalise a raw review into lowercase, letters-only, single-spaced text.

    Steps, in order: lowercase; drop retweet prefixes ("rt @user: "); replace
    @mentions, URLs and every character that is not an ASCII letter, digit,
    space or tab with a space; delete digits; delete single letters that stand
    alone between whitespace (e.g. the "s" left over from "mark's"); collapse
    whitespace runs to one space.

    Stop words are NOT removed here. A single letter at the very start or end
    of the text is kept because it has no whitespace on both sides.

    Parameters
    ----------
    sent : str
        Raw text.

    Returns
    -------
    str
        Cleaned text. It is not stripped, so it may start or end with one space.
    '''
    # lowercase
    sentence = sent.lower()

    # Remove retweet prefixes. The text is already lowercase, so the pattern must
    # be lowercase too (the old 'RT ...' pattern could never match). The \b stops
    # a word that merely ends in "rt" (e.g. "start @x: ") from being clipped.
    sentence = re.sub(r'\brt @\w+: ', " ", sentence)

    # Replace @mentions, non-alphanumeric characters and URLs with a space
    sentence = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", sentence)

    # Removing digits
    sentence = sentence.translate(str.maketrans('', '', string.digits))

    # Single character removal. Dropping an apostrophe turns "mark's" into
    # "mark s"; the stray "s" is removed here. Lookarounds leave the surrounding
    # whitespace in place, so a run such as " a b c d " is cleared in one pass
    # (two passes of r"\s+[a-zA-Z]\s+" still left one letter behind).
    sentence = re.sub(r"(?<=\s)[a-zA-Z](?=\s)", '', sentence)

    # Collapse the whitespace runs created by the substitutions above
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence


In [51]:
featured_data = df_memory.copy()

In [52]:
# Swap the raw review column for its cleaned counterpart without mutating in place.
featured_data = (
    featured_data
    .assign(cleaned_text=featured_data["the_review"].apply(preprocess))
    .drop(columns='the_review')
)

In [53]:
featured_data.head()

Unnamed: 0,year,month,cleaned_text
0,2022,10,i wish would have gotten one earlier love it a...
1,2022,10,i ve learned this lesson again open the packag...
2,2022,10,it is so slow and lags find better option
3,2022,10,roller ball stopped working within months of m...
4,2022,10,i like the color and size but it few days out ...


In [54]:
featured_data = featured_data[['cleaned_text','month','year']]

In [55]:
featured_data

Unnamed: 0,cleaned_text,month,year
0,i wish would have gotten one earlier love it a...,10,2022
1,i ve learned this lesson again open the packag...,10,2022
2,it is so slow and lags find better option,10,2022
3,roller ball stopped working within months of m...,10,2022
4,i like the color and size but it few days out ...,10,2022
...,...,...,...
17635,much more than expected sound is awesome and b...,11,2018
17636,good quality,11,2018
17637,ok,10,2018
17638,speaker has clean sound just not loud enough t...,10,2018


In [56]:
from spacy.lang.en.stop_words import STOP_WORDS

# Hand-written stop-word set; spacy_tokeniser drops every token whose text is in it.
# It does not contain 'not', 'no', 'never', 'very', 'too', 'really' or 'enough',
# so negations and intensifiers survive tokenisation.
# NOTE(review): this looks like a trimmed copy of spaCy's STOP_WORDS (imported
# above but never used) - confirm, and consider deriving it from STOP_WORDS.
all_stopwords = {"'d","'ll","'m","'re","'s","'ve",'a','about',
'above','across','after','afterwards','again','all','almost','alone','along',
'already','also','although','always','am','among','amongst','amount','an','and',
'another','any','anyhow','anyone','anything','anyway','anywhere','are','around',
'as','at','back','be','became','because','become','becomes','becoming','been','before',
'beforehand','behind','being','below','beside','besides','between','both','bottom',
'but','by','ca','call','can','could','did','do','does','doing','done','down','due','during','each',
'eight','either','eleven','else','elsewhere','empty','even','everyone','everything',
'everywhere','except','few','fifteen','fifty','first','five','for','former','formerly','forty','four','from','front',
'full','further','go','had','has','have','he','hence','her','here','hereafter','hereby','herein','hereupon','hers',
'herself','him','himself','his','how','however','hundred','i','if','in','indeed','into','is','it','its','itself','just','keep','last',
'latter','latterly','made','make','many','may','me','meanwhile','might','mine','more','moreover','move','much',
'must','my','myself','name','namely','neither','nevertheless','next','nine','nobody','noone','nothing','now','nowhere','of','often',
'on','once','one','only','onto','or','other','others','otherwise','our','ours','ourselves','out','own','part','per','perhaps','please','put',
'rather','re','regarding','same','say','see','several','she','should','show','side',
'since','six','sixty','so','some','somehow','someone','something','sometime','sometimes','somewhere','still','such','take','ten','than','that','the','their',
'them','themselves','then','thence','there','thereafter','thereby','therefore','therein','thereupon','these','they','third','this','those','though','three',
'through','throughout','thru','thus','to','together','top','toward','towards','twelve','twenty','two','under','unless','until','up','upon','us','used','using',
'various','via','was','we','well','were','what','whatever','when','whence','whenever','where','whereafter','whereas','whereby','wherein','whereupon',
'wherever','whether','which','while','whither','who','whoever','whole','whom','whose','why','will','with','within','would','yet','you','your','yours','yourself',
'yourselves','‘d','‘ll','‘m','‘re','‘s','‘ve','’d','’ll','’m','’re','’s','’ve'}

# Separate set object so later edits to my_stop_words do not touch all_stopwords.
my_stop_words = set(all_stopwords) # My own stop words

In [57]:
df_memory.the_review[0]

'I wish I would have gotten one earlier. I love it and it makes working in my laptop so much easier'

In [58]:
featured_data.cleaned_text[0]

'i wish would have gotten one earlier love it and it makes working in my laptop so much easier'

In [59]:
nlp = spacy.load("en_core_web_sm")

def spacy_tokeniser(sent):
    """Lemmatise ``sent`` with the notebook's spaCy pipeline, skipping stop words.

    The text is stripped and lowercased, parsed with ``nlp``, and every token
    whose surface text is not in ``my_stop_words`` contributes its lemma.

    Returns
    -------
    list of str
        Lemmas of the kept tokens, in document order.
    """
    normalised = sent.strip().lower()
    lemmas = []
    for tok in nlp(normalised):
        # The filter looks at the token text, not at its lemma.
        if tok.text in my_stop_words:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

featured_data['tokens'] = featured_data['cleaned_text'].apply(spacy_tokeniser)

In [61]:
# Compute bigrams
from gensim.models import Phrases
from gensim.models.phrases import Phraser,ENGLISH_CONNECTOR_WORDS

In [62]:
# https://stackoverflow.com/questions/56909294/how-to-set-time-slices-dynamic-topic-model
# You must order from oldest date to newest date: the time slices passed to the
# dynamic topic model later are document counts per year, in this order.
# NOTE(review): only `year` is sorted, so the order of documents inside a year is
# whatever sort_values produces; the row positions inspected later (200, 1000,
# 2000) depend on it - confirm this is acceptable or add `month` as a tie-breaker.
featured_data_reoder = featured_data.sort_values(by='year',ascending=True) 

In [65]:
featured_data_reoder= featured_data_reoder.reset_index()

In [67]:
featured_data_reoder.head()

Unnamed: 0,index,cleaned_text,month,year,tokens
0,17639,charge is perfect someone says the issues abou...,10,2018,"[charge, perfect, say, issue, mono, speaker, t..."
1,17581,nice speaker,12,2018,"[nice, speaker]"
2,17638,speaker has clean sound just not loud enough t...,10,2018,"[speaker, clean, sound, not, loud, enough, jus..."
3,17600,it perfect,12,2018,[perfect]
4,17621,great product,12,2018,"[great, product]"


In [68]:
featured_data_reoder.drop('index',inplace=True,axis=1)

In [70]:
featured_data_reoder.head()

Unnamed: 0,cleaned_text,month,year,tokens
0,charge is perfect someone says the issues abou...,10,2018,"[charge, perfect, say, issue, mono, speaker, t..."
1,nice speaker,12,2018,"[nice, speaker]"
2,speaker has clean sound just not loud enough t...,10,2018,"[speaker, clean, sound, not, loud, enough, jus..."
3,it perfect,12,2018,[perfect]
4,great product,12,2018,"[great, product]"


In [71]:
docs = featured_data_reoder['tokens'].tolist()

In [75]:
" ".join(docs[0])

'charge perfect say issue mono speaker think high quality generation cause speaker close stereo not accurate step think good idea buy way usa korea take day ship great'

In [77]:
docs[0][:8] # from document 1

['charge', 'perfect', 'say', 'issue', 'mono', 'speaker', 'think', 'high']

In [76]:
# Add bigrams to docs (only ones that appear 20 times or more).
# https://stackoverflow.com/questions/35716121/how-to-extract-phrases-from-corpus-using-gensim
# https://radimrehurek.com/gensim/models/phrases.html#gensim.models.phrases.original_scorer
# No `scoring` argument is passed, so this uses the default scorer and not the
# robust scorer (npmi).
bigram_phrases = Phrases(docs, min_count=20,connector_words=ENGLISH_CONNECTOR_WORDS)

In [78]:
bigram = Phraser(bigram_phrases)

In [79]:
def make_bigram(texts):
    """Run every document in ``texts`` through the fitted ``bigram`` phraser.

    Returns a list holding one transformed document per input document,
    in the same order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram[tokens])
    return merged_docs

In [80]:
bigrams_docs = make_bigram(docs)

In [81]:
print(bigrams_docs[200][:])

['great', 'sound_quality', 'good', 'battery_life', 'easy', 'connect', 'little', 'heavy', 'big', 'expect']


In [82]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
id2word = Dictionary(bigrams_docs)
# or 
# dictionary = Dictionary(docs)


# Filter out words that occur less than 20 documents, or more than 50% of the documents.
# https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.filter_extremes
id2word.filter_extremes(no_below=20, no_above=0.5)

In [84]:
# id2word.most_common(20) # most common words
# [('work', 7677),
#  ('mouse', 7070),
#  ('not', 5801),
#  ('great', 4663),
#  ('charge', 3779),
#  ('use', 3696),
#  ('good', 3430),
#  ('love', 3246),
#  ('very', 3130),
#  ('buy', 2927),
#  ('like', 2902),
#  ('get', 2623),
#  ('sound', 2559),
#  ('light', 2452),
#  ('product', 1831),
#  ('keyboard', 1824),
#  ('time', 1816),
#  ('speaker', 1751),
#  ('really', 1595),
#  ('stop', 1552)]

In [85]:
# Bag-of-words representation of the documents.
# Term document frequency 
bow_corpus = [id2word.doc2bow(doc) for doc in bigrams_docs]

In [86]:
# Create corpus
texts = bigrams_docs

In [87]:
print(f'Number of unique tokens: { len(id2word)}')
print(f'Number of documents: {len(bow_corpus)}')

Number of unique tokens: 1266
Number of documents: 17640


In [94]:
# print(df_memory.loc[200,'the_review'])
print(featured_data_reoder.loc[200,"cleaned_text"],"\n")
print(featured_data_reoder.loc[200,"tokens"])

great sound quality good battery life easy to connect little heavier and bigger than was expecting  

['great', 'sound', 'quality', 'good', 'battery', 'life', 'easy', 'connect', 'little', 'heavy', 'big', 'expect']


In [143]:
bow_corpus[:]

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 2),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 2),
  (22, 1)],
 [(17, 1), (23, 1)],
 [(1, 1),
  (3, 1),
  (12, 1),
  (17, 2),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 2),
  (34, 1)],
 [(13, 1)],
 [(7, 1), (35, 1)],
 [(6, 1), (36, 1), (37, 1), (38, 1)],
 [(6, 1), (39, 1)],
 [(12, 1),
  (19, 1),
  (40, 1),
  (41, 1),
  (42, 2),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1)],
 [(7, 2), (17, 1), (33, 1), (50, 1), (51, 1)],
 [(7, 1), (17, 1), (20, 1), (33, 1), (35, 1), (52, 1), (53, 1), (54, 1)],
 [(7, 1), (17, 1), (33, 1), (55, 1), (56, 1), (57, 1)],
 [(48, 1), (58, 1), (59, 1), (60, 1), (61, 1)],
 [(6, 1), (33, 1), (62, 1)],
 [(6, 1), (33, 1), (63, 1)],
 [(12, 1),
  (29, 1),
  (32, 1),
  (43, 1),
  (48, 1),
  (50, 1),

In [95]:
# Verifying BOW is set up correctly
print(f" Cleaned Text: \n{featured_data_reoder.loc[200,'cleaned_text']}")
print(f"Bow representation: {bow_corpus[200]}")

document_200 = bow_corpus[200]
for i in range(len(document_200)):
    print(f"Word {document_200[i][0]}, {id2word[document_200[i][0]]}, appears {document_200[i][1]}")

 Cleaned Text: 
great sound quality good battery life easy to connect little heavier and bigger than was expecting 
Bow representation: [(6, 1), (7, 1), (46, 1), (50, 1), (86, 1), (96, 1), (106, 1), (180, 1), (182, 1), (233, 1)]
Word 6, good, appears 1
Word 7, great, appears 1
Word 46, sound_quality, appears 1
Word 50, battery_life, appears 1
Word 86, heavy, appears 1
Word 96, expect, appears 1
Word 106, little, appears 1
Word 180, connect, appears 1
Word 182, easy, appears 1
Word 233, big, appears 1


### **Dynamic Topic Models**

In [96]:
from gensim import models
from gensim.matutils import hellinger
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

In [98]:
# Number of documents in each time slice, oldest year first. Derived from the
# sorted frame instead of being typed in by hand, so it cannot drift from the
# corpus. Each slice is a year despite the variable name, which is kept because
# later cells refer to it.
time_slice_months = (
    featured_data_reoder['year']
    .value_counts()
    .sort_index()
    .tolist()
)   # [22, 967, 1926, 8785, 5940] for 2018..2022 on the original data

# The model cuts the corpus into consecutive slices of these sizes, so together
# they must account for every document.
assert sum(time_slice_months) == len(bow_corpus)

In [99]:
# Fit the dynamic topic model: 6 topics tracked across the yearly time slices.
# chain_variance is not passed here, so gensim's default applies.
# random_state is fixed so repeated runs use the same seed.
ldaseq = models.ldaseqmodel.LdaSeqModel(corpus=bow_corpus,
                                    id2word=id2word,
                                    time_slice=time_slice_months,
                                    num_topics=6,
                                    random_state=42,
                                    chunksize=2000,
                                    passes=50)

### **Topics Per Time** 

In [132]:
# ldaseq.print_topics(time="1")

In [190]:
#2018,2019,2020,2021,2022 (5 years)
# [0,1,2,3,4]
def print_topics_per_time(time:int,model):
    """Print every topic of ``model`` for one time slice.

    Parameters
    ----------
    time : int
        Index of the time slice to show.
    model : topic model
        Fitted model exposing ``print_topics(time=...)``.

    If the lookup raises ``IndexError``, a hint about the valid range is
    printed instead of the topics.
    """
    try:
        topics_at_time = model.print_topics(time=time)
        print(topics_at_time)
    except IndexError:
        print("Please choose a time slice from 0 - 4 and must be an integer")
        

For each year there are 6 topics.

In [133]:
# 2018
print_topics_per_time(0,ldaseq) 

[[('good', 0.0498788991533739), ('not', 0.027758707381998506), ('headset', 0.026890036602387694), ('mic', 0.0201726826320126), ('sound', 0.01961138032707135), ('headphone', 0.018502892654876344), ('like', 0.015600571879981259), ('quality', 0.015325797624711293), ('work', 0.015031963250048448), ('hear', 0.014763619507915632), ('get', 0.01415322226530193), ('price', 0.012101055294568577), ('really', 0.011936912259624139), ('sound_quality', 0.01133695557057364), ('very', 0.011296090218946906), ('game', 0.01058950256025348), ('gaming', 0.010268941466613389), ('great', 0.00936829189253216), ('ear', 0.008363085079197832), ('buy', 0.007729549551242613)], [('speaker', 0.057761425033555865), ('sound', 0.05046284108546137), ('great', 0.030407362118514265), ('jbl', 0.02254143996303447), ('good', 0.0190946623960152), ('charge', 0.017798822616979005), ('not', 0.014101156797631086), ('get', 0.013608693875200499), ('bass', 0.013372068408882094), ('battery_life', 0.012396609636827358), ('sound_quality

**Observations**

For 2018, this is what it says:
- Topic 1 could be talking about headset quality
- Topic 2 could be talking about speaker quality
- Topic 3 could be talking about good quality
- Topic 4 could be talking about mouse and keyboard quality
- Topic 5 & 6 could be talking about bad quality

Let's take topic 1 as our case study and follow the rank and probability of the word 'headset' from year to year, since the topics look almost the same in every time slice. In 2018 it is **('headset', 0.026890036602387694), in third position**.

In [134]:
# 2019
print_topics_per_time(1,ldaseq)

[[('good', 0.04941575485727385), ('not', 0.02793539978608407), ('headset', 0.027725140989623157), ('mic', 0.020366245473751366), ('sound', 0.01956776042302582), ('headphone', 0.018655948544802268), ('like', 0.01569490070695153), ('quality', 0.015372459940964589), ('work', 0.015158932815920458), ('hear', 0.01484443403201802), ('get', 0.01382090143042339), ('price', 0.01215960334008339), ('really', 0.011998622557107704), ('sound_quality', 0.011360172117367815), ('very', 0.011319974702202402), ('game', 0.010629773487696382), ('gaming', 0.010266917419706275), ('great', 0.009462173472842709), ('ear', 0.008411734043678044), ('buy', 0.007719258164627842)], [('speaker', 0.05812040351868785), ('sound', 0.04992416789180153), ('great', 0.029935348511496963), ('jbl', 0.022185304906792217), ('good', 0.019283898172256137), ('charge', 0.017869404479163783), ('not', 0.014203163995177525), ('get', 0.01376165722783174), ('bass', 0.01343425039551553), ('battery_life', 0.01252502396922994), ('sound_qualit

('headset', 0.027725140989623157) at the third index

In [135]:
# 2020
print_topics_per_time(2,ldaseq)

[[('good', 0.04667723829242252), ('headset', 0.03084720026889697), ('not', 0.028153299888588935), ('mic', 0.020675687445811497), ('sound', 0.01961131716380999), ('headphone', 0.018890778512947373), ('like', 0.01582432242111672), ('quality', 0.015366479983741231), ('work', 0.01534878025548828), ('hear', 0.014945613574349231), ('get', 0.013236116031100012), ('price', 0.012225144982352403), ('really', 0.01206928747683193), ('sound_quality', 0.011457371935234998), ('very', 0.011444127513254838), ('game', 0.01069093579431219), ('gaming', 0.010367562384337909), ('great', 0.009598679361198128), ('ear', 0.008559295776167321), ('buy', 0.007733139661244043)], [('speaker', 0.05832586295233017), ('sound', 0.049491655539596745), ('great', 0.029826251678749766), ('jbl', 0.021160083582891353), ('good', 0.01954681899245944), ('charge', 0.017868146335825275), ('not', 0.014305118655470008), ('get', 0.013996635081998787), ('bass', 0.013458156622302606), ('battery_life', 0.012723542379660932), ('sound_qua

('headset', 0.03084720026889697) at the second index

In [136]:
# 2021
print_topics_per_time(3,ldaseq)

[[('good', 0.04078943171263382), ('headset', 0.03534976873805569), ('not', 0.028407318662164033), ('mic', 0.021056866711174575), ('sound', 0.020007036058506865), ('headphone', 0.019173423571494697), ('like', 0.015970871606806308), ('work', 0.015574734908086701), ('hear', 0.015063292909783033), ('quality', 0.014898445220778914), ('get', 0.012903345595267292), ('price', 0.012290514462763817), ('really', 0.01213557208720279), ('very', 0.011720156317712461), ('sound_quality', 0.011675253578718861), ('game', 0.010887318684547752), ('gaming', 0.010630094928568956), ('great', 0.009748732396872397), ('ear', 0.008713267995395648), ('light', 0.007863722233793906)], [('speaker', 0.058048601902574985), ('sound', 0.05128850674194787), ('great', 0.03138814140002231), ('good', 0.019760782695832785), ('jbl', 0.019435026994596215), ('charge', 0.01772674780462738), ('not', 0.014317120558987873), ('get', 0.01417684732955767), ('bass', 0.01338976783208722), ('battery_life', 0.012895422653187133), ('loud',

('headset', 0.03534976873805569) at the second index

In [137]:
# 2022
print_topics_per_time(4,ldaseq)

[[('good', 0.03135313629736879), ('headset', 0.030341887807874435), ('not', 0.02898668607061662), ('mic', 0.021576892852967548), ('sound', 0.02111424343830064), ('headphone', 0.019585988311989634), ('like', 0.016278843396790164), ('work', 0.015935572665753777), ('hear', 0.015351214918859359), ('quality', 0.013723168864934465), ('get', 0.013651731185640314), ('price', 0.01246641277513113), ('really', 0.012300667372796661), ('very', 0.011681313655710162), ('game', 0.010983067345346367), ('sound_quality', 0.010859402749152538), ('gaming', 0.010549456779571056), ('great', 0.00995110728723512), ('ear', 0.008799905112466788), ('light', 0.008212368785557542)], [('speaker', 0.05802605005341413), ('sound', 0.05100611809754508), ('great', 0.03199687276922273), ('good', 0.019923923430691787), ('jbl', 0.01874894218583663), ('charge', 0.0177343205551656), ('not', 0.014349604832124956), ('get', 0.014319916246719024), ('bass', 0.01332536611922799), ('battery_life', 0.013038256532144443), ('loud', 0.0

('headset', 0.030341887807874435) at the second index

**Observations**
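- The probability of the word 'headset' in topic 1 rose from 2018 (≈0.027) to a peak in 2021 (≈0.035), moving from third to second position in 2020, and then fell back to ≈0.030 in 2022.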

### **Topics Over Time** 

In [141]:
# There 6 topics; 0-5
ldaseq.print_topic_times(topic=0)

[[('good', 0.0498788991533739),
  ('not', 0.027758707381998506),
  ('headset', 0.026890036602387694),
  ('mic', 0.0201726826320126),
  ('sound', 0.01961138032707135),
  ('headphone', 0.018502892654876344),
  ('like', 0.015600571879981259),
  ('quality', 0.015325797624711293),
  ('work', 0.015031963250048448),
  ('hear', 0.014763619507915632),
  ('get', 0.01415322226530193),
  ('price', 0.012101055294568577),
  ('really', 0.011936912259624139),
  ('sound_quality', 0.01133695557057364),
  ('very', 0.011296090218946906),
  ('game', 0.01058950256025348),
  ('gaming', 0.010268941466613389),
  ('great', 0.00936829189253216),
  ('ear', 0.008363085079197832),
  ('buy', 0.007729549551242613)],
 [('good', 0.04941575485727385),
  ('not', 0.02793539978608407),
  ('headset', 0.027725140989623157),
  ('mic', 0.020366245473751366),
  ('sound', 0.01956776042302582),
  ('headphone', 0.018655948544802268),
  ('like', 0.01569490070695153),
  ('quality', 0.015372459940964589),
  ('work', 0.015158932815920

In [186]:
# 6 topics: [0,1,2,3,4,5]
def print_topics_over_time(topic:int,model):
    """Print how one topic of ``model`` looks in every time slice.

    Parameters
    ----------
    topic : int
        Index of the topic to show.
    model : topic model
        Fitted model exposing ``print_topic_times(topic=...)``.

    If the lookup raises ``IndexError``, a hint about the valid range is
    printed instead of the topic.
    """
    try:
        topic_by_slice = model.print_topic_times(topic=topic)
        print(topic_by_slice)
    except IndexError:
        print("Please choose a topic number from 0 - 5 and must be an integer")
        

In [142]:
# Looking into topic one's evolution over time.
print_topics_over_time(0,ldaseq)

[[('good', 0.0498788991533739), ('not', 0.027758707381998506), ('headset', 0.026890036602387694), ('mic', 0.0201726826320126), ('sound', 0.01961138032707135), ('headphone', 0.018502892654876344), ('like', 0.015600571879981259), ('quality', 0.015325797624711293), ('work', 0.015031963250048448), ('hear', 0.014763619507915632), ('get', 0.01415322226530193), ('price', 0.012101055294568577), ('really', 0.011936912259624139), ('sound_quality', 0.01133695557057364), ('very', 0.011296090218946906), ('game', 0.01058950256025348), ('gaming', 0.010268941466613389), ('great', 0.00936829189253216), ('ear', 0.008363085079197832), ('buy', 0.007729549551242613)], [('good', 0.04941575485727385), ('not', 0.02793539978608407), ('headset', 0.027725140989623157), ('mic', 0.020366245473751366), ('sound', 0.01956776042302582), ('headphone', 0.018655948544802268), ('like', 0.01569490070695153), ('quality', 0.015372459940964589), ('work', 0.015158932815920458), ('hear', 0.01484443403201802), ('get', 0.01382090

### **Document - Topic Proportions**

In [151]:
words = [id2word[word_id] for word_id,count in bow_corpus[1000]]
print(words)

['not', 'way', 'product', 'terrible', 'too', 'function', 'work', 'support', 'keyboard', 'stop', 'klim', 'show', 'description', 'avoid']


Looking at an arbitrarily chosen document (document 1000), we can see that it is about bad quality, especially of keyboards.
Could it belong to topic 4, or to topics 5 & 6?

In [152]:
# Checking the corpus topic distribution for documemt 1000
doc_topic_dist = ldaseq.doc_topics(1000) # 1000th document
doc_topic_dist

array([6.22665006e-04, 6.22665006e-04, 6.22665006e-04, 2.99283805e-01,
       6.98225535e-01, 6.22665006e-04])

Document 1000 seems most related to topic 5 (bad quality, weight ≈ 0.70), with a secondary share of topic 4 (mouse and keyboard quality, ≈ 0.30).

In [153]:
# Testing with a dummy document
doc_bad_quality_1 = ["hate",'worthless','sound_quality','mouse','purchase','never']
doc_bad_quality_1 = id2word.doc2bow(doc_bad_quality_1)
doc_bad_quality_1 = ldaseq[doc_bad_quality_1]
print(doc_bad_quality_1)

[0.24628144 0.00165017 0.00165017 0.00165017 0.7471179  0.00165017]


This document is highly related to topic 5 with some traces of topic 1

### **Distances between documents**

In [171]:
# def compute_words_topic_dist(doc_id:int):
#     try:
#         words = [id2word[word_id] for word_id,count in bow_corpus[doc_id]]
#         doc_topic_dist = ldaseq.doc_topics(doc_id)
#         return words,doc_topic_dist
#     except (TypeError,IndexError):
#         print("Wrong type passed or wrong index passed, max doc_id is 17639")

In [188]:
def compute_words_topic_dist(doc_id:int,model):
    """Look up a document's vocabulary words and its topic mixture.

    Parameters
    ----------
    doc_id : int
        Position of the document in the notebook-level ``bow_corpus``.
    model : topic model
        Fitted model exposing ``doc_topics(doc_id)``.

    Returns
    -------
    tuple
        ``(words, doc_topic_dist)``: the document's words, resolved through
        ``id2word``, and whatever ``model.doc_topics(doc_id)`` returns.
        When a ``TypeError`` or ``IndexError`` occurs, a message is printed and
        the function returns ``None`` instead of a tuple.
    """
    try:
        words = []
        for word_id, _count in bow_corpus[doc_id]:
            words.append(id2word[word_id])
        return words, model.doc_topics(doc_id)
    except TypeError:
        print("Wrong type was passed")
    except IndexError:
        print("Wrong index passed, max doc_id is 17639")

In [180]:
words_1000,doc_topic_dist_1000 = compute_words_topic_dist(1000,ldaseq)
print(words_1000)
print(doc_topic_dist_1000)

['not', 'way', 'product', 'terrible', 'too', 'function', 'work', 'support', 'keyboard', 'stop', 'klim', 'show', 'description', 'avoid']
[6.22665006e-04 6.22665006e-04 6.22665006e-04 2.99283805e-01
 6.98225535e-01 6.22665006e-04]


In [181]:
words_2000,doc_topic_dist_2000 = compute_words_topic_dist(2000,ldaseq)
print(words_2000)
print(doc_topic_dist_2000)

['absolutely_love']
[0.00943396 0.00943396 0.95283019 0.00943396 0.00943396 0.00943396]


The Hellinger distance ranges from 0 to 1, with 0 indicating that the two distributions are identical, and 1 indicating that they are completely dissimilar.

In [183]:
hellinger(doc_topic_dist_1000,doc_topic_dist_2000)
# They are dissimilar

0.9132769737138369

The topic distributions of the two documents above are highly dissimilar.

### **Choosing your best Dynamic Topic Model**

**Chain Variance**

One of the key aspects of topic evolution is how fast/slow these topics evolve. And this is where the factor of variance comes in. By setting the chain_variance input to the DTM model higher, we can tweak our topic evolution. The default value is 0.005. (this is the value suggested by Blei in his tech talk and is the default value in the C++ code)

In [185]:
# Default chain_variance value: 0.005
# Refit with a ten times larger value (0.05); every other argument matches the
# first model, so differences in the results come from chain_variance alone.
ldaseq_chain = models.ldaseqmodel.LdaSeqModel(corpus=bow_corpus,
                                    id2word=id2word,
                                    time_slice=time_slice_months,
                                    num_topics=6,
                                    random_state=42,
                                    chunksize=2000,
                                    passes=50,
                                    chain_variance=0.05)

In [191]:
# 2018,topics per time
print_topics_per_time(0,ldaseq_chain) 

[[('good', 0.08517769885023538), ('not', 0.0790870746849555), ('like', 0.04453594810748606), ('price', 0.029702482457073794), ('really', 0.02562417559911568), ('sound', 0.021495563572498745), ('quality', 0.014373798936602115), ('work', 0.01392017034115687), ('get', 0.01288295713732411), ('hear', 0.012329787487638643), ('play', 0.011430082015009344), ('game', 0.009764396049783738), ('well', 0.008890746825075176), ('very', 0.008879533624722756), ('cheap', 0.00857651486790098), ('pretty', 0.008400527608924525), ('headset', 0.008022944122892246), ('buy', 0.007887749302401778), ('too', 0.007757005368380634), ('bad', 0.0077010132132301805)], [('speaker', 0.07755361788975619), ('sound', 0.055575534562469706), ('great', 0.029717933277689425), ('jbl', 0.021407448612190422), ('charge', 0.018675573890845863), ('good', 0.01628467857795942), ('bass', 0.016134308154287992), ('battery_life', 0.01312947130064325), ('get', 0.012596672756263628), ('sound_quality', 0.011744560078546417), ('very', 0.01091

In [192]:
# Looking into topic one's evolution over time.
print_topics_over_time(0,ldaseq_chain)

[[('good', 0.08517769885023538), ('not', 0.0790870746849555), ('like', 0.04453594810748606), ('price', 0.029702482457073794), ('really', 0.02562417559911568), ('sound', 0.021495563572498745), ('quality', 0.014373798936602115), ('work', 0.01392017034115687), ('get', 0.01288295713732411), ('hear', 0.012329787487638643), ('play', 0.011430082015009344), ('game', 0.009764396049783738), ('well', 0.008890746825075176), ('very', 0.008879533624722756), ('cheap', 0.00857651486790098), ('pretty', 0.008400527608924525), ('headset', 0.008022944122892246), ('buy', 0.007887749302401778), ('too', 0.007757005368380634), ('bad', 0.0077010132132301805)], [('not', 0.08363187611682403), ('good', 0.08235043918382962), ('like', 0.04220034039133044), ('price', 0.02993373550795783), ('really', 0.02560143184602734), ('sound', 0.021908965663540548), ('work', 0.014298968077770277), ('quality', 0.014047602683751818), ('get', 0.012665726573216669), ('hear', 0.01262611831493083), ('play', 0.011680788918465362), ('ga

**Observations**
- Taking the word "good" as a case study: before chain_variance was modified, "good" ranked first in topic 1 in each of the five years.
- Having modified chain_variance, the following can be observed for rank position for the word "good";
    - 2018 --> ranked 1st
    - 2019 --> ranked 2nd
    - 2020 --> ranked 2nd
    - 2021 --> ranked 1st
    - 2022 --> ranked 3rd

In [193]:
words_1000,doc_topic_dist_1000 = compute_words_topic_dist(1000,ldaseq_chain)
print(words_1000)
print(doc_topic_dist_1000)

['not', 'way', 'product', 'terrible', 'too', 'function', 'work', 'support', 'keyboard', 'stop', 'klim', 'show', 'description', 'avoid']
[6.22665006e-04 6.22665006e-04 6.22665006e-04 6.22665006e-04
 9.96886675e-01 6.22665006e-04]


In [194]:
words_2000,doc_topic_dist_2000 = compute_words_topic_dist(2000,ldaseq_chain)
print(words_2000)
print(doc_topic_dist_2000)

['absolutely_love']
[0.00943396 0.00943396 0.95283019 0.00943396 0.00943396 0.00943396]


In [195]:
# Computing similarity
hellinger(doc_topic_dist_1000,doc_topic_dist_2000)
# They are dissimilar

0.932185819740983

**Observation**

Due to the chain_variance modification, the topic distribution has also been affected;
- Document id 1000, its relation to topic 5 has increased from 0.698 to 0.997.
- Document id 2000, its relation to topic 3 has remained unchanged.
- The Hellinger distance between the two documents has increased from 0.913 to 0.932.

### **LDA Model and DTM**

In [199]:
# Pull the pyLDAvis inputs for the first time slice (time=0, i.e. 2018) out of
# the default-chain_variance model.
doc_topic, topic_term, doc_length, term_freq, vocab = ldaseq.dtm_vis(time=0,corpus=bow_corpus)
# sort_topics=False is passed so pyLDAvis does not reorder the topics.
dtm_vis = pyLDAvis.prepare(topic_term_dists=topic_term,
                            doc_topic_dists=doc_topic,
                            doc_lengths=doc_length,
                            vocab=vocab,
                            term_frequency=term_freq,
                            sort_topics=False
                            )
# NOTE(review): presumably fails if the topic_visuals_bigrams/ directory does
# not exist yet - confirm, or create it before saving.
pyLDAvis.save_html(dtm_vis, 'topic_visuals_bigrams/dtm6_default_chain_var.html')
pyLDAvis.display(dtm_vis) # 2018 topics

### **Computing Coherence Score**

In [None]:
"""Get the coherence for each topic.

Can be used to measure the quality of the model, 
or to inspect the convergence through training via a callback.

Parameters
----------
time : int
The time slice.

Returns
-------
list of list of str
The word representation for each topic, for each time slice. 
This can be used to check the time coherence
of topics as time evolves: 
If the most relevant words remain the same then the topic has somehow
converged or is relatively static, if they change rapidly the topic is evolving.
"""

In [219]:
ldaseq.dtm_coherence(0)[0] # 2018, topic 0

['good',
 'not',
 'headset',
 'mic',
 'sound',
 'headphone',
 'like',
 'quality',
 'work',
 'hear',
 'get',
 'price',
 'really',
 'sound_quality',
 'very',
 'game',
 'gaming',
 'great',
 'ear',
 'buy']

In [225]:
ldaseq.dtm_coherence(3)[0]

['good',
 'headset',
 'not',
 'mic',
 'sound',
 'headphone',
 'like',
 'work',
 'hear',
 'quality',
 'get',
 'price',
 'really',
 'very',
 'sound_quality',
 'game',
 'gaming',
 'great',
 'ear',
 'light']

Minimal evolution, meaning relatively static.