In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud, STOPWORDS
import string
import re
import spacy
import gensim
import warnings
warnings.filterwarnings("ignore")

### **Phrase and Dictionary Construction**

In [2]:
df = pd.read_csv("../data/cleaned.csv",usecols=['review','year'])
df.head()

Unnamed: 0,review,year
0,As usual the flight is delayed. BA try to blam...,2023
1,I had the most fantastic BA Flight today. The ...,2023
2,Couldn’t book in online. Arrived at check in t...,2023
3,London Heathrow to Mumbai in a Boeing 787-8 in...,2023
4,"Keflavík, Iceland to London Heathrow on an A32...",2023


In [3]:
texts = df.copy()

In [4]:
texts.describe()

Unnamed: 0,year
count,3604.0
mean,2017.078246
std,2.557902
min,2011.0
25%,2015.0
50%,2017.0
75%,2019.0
max,2023.0


In [5]:
texts.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3604 entries, 0 to 3603
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  3604 non-null   object
 1   year    3604 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 3.7 MB


In [6]:
from spacy.lang.en.stop_words import STOP_WORDS
all_stopwords = STOP_WORDS
# Run this only once

# Before removing some stopwords
print(all_stopwords,"\n")

# Removing some stopwords
all_stopwords -= {
   "nor","cannot","serious","over",'or',"seemed",
   "not","no","neither","otherwise","except"
}

# After removing some stopwords
my_stop_words = set(all_stopwords) # My own stop words
print(my_stop_words)

{'does', 'thereupon', 'between', 'toward', 'wherein', 'that', 'yours', 'towards', '’m', 'other', 'our', 'together', 'sixty', 'keep', 'another', 'across', 'became', 'out', 'nor', 'unless', 'serious', 'latterly', 'anyway', 'not', 'any', 'first', 'namely', 'all', 'third', 'everywhere', 'several', 'here', 'part', 'this', "'s", 'per', 'bottom', 'twenty', 'herein', '‘ll', 'forty', 'cannot', 'since', 'rather', 'doing', 'before', 'your', 'of', 'what', 'being', 'whole', 'many', 'is', 'quite', 'give', 'they', 'those', 'should', 'least', 'over', 'have', 'fifteen', 'nine', 'been', 'she', 'ours', 'side', 'someone', 'these', 'a', 'due', 'at', 'already', 'make', 'n‘t', 'herself', 'n’t', 'last', 'others', 'less', 'around', 'off', 'will', 'seems', 'can', 'might', 'just', 'could', 'same', 'further', 'latter', 'when', 'wherever', 'against', 'whereupon', 'mine', '‘m', 'while', 'whenever', 'still', 'four', 'with', 'become', 'whose', 'eight', 'whatever', 'ten', '’d', 'but', 'until', 'why', 'somewhere', 'eve

In [7]:
def preprocess(sent):
    '''Cleans text data up, leaving only 2 or
        more char long non-stopwords composed of A-Z & a-z only
        in lowercase'''
    # lowercase
    sentence = sent.lower()

    # Remove RT
    sentence = re.sub('RT @\w+: '," ",sentence)

    # Remove special characters
    sentence = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ", sentence)

    # Removing digits
    sentence = sentence.translate(str.maketrans('', '', string.digits))

    # Removing puntuactions
    # sentence = sentence.translate(str.maketrans('', '', string.punctuation))

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)  
    # When we remove apostrophe from the word "Mark's", 
    # the apostrophe is replaced by an empty space. 
    # Hence, we are left with single character "s" that we are removing here.

    # Remove multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)  
    # Next, we remove all the single characters and replace it by a space 
    # which creates multiple spaces in our text. 
    # Finally, we remove the multiple spaces from our text as well.

    return sentence

In [8]:
texts['cleaned_reviews'] = texts['review'].apply(preprocess)

In [9]:
nlp = spacy.load('en_core_web_sm')

def spcay_tokeniser(sent):
    sent = sent.strip().lower()
    doc = nlp(sent)
    mytokens = [token.lemma_ for token in doc if token.text not in my_stop_words]
    return mytokens

texts['tokens'] = texts['cleaned_reviews'].apply(spcay_tokeniser)

In [10]:
texts.head()

Unnamed: 0,review,year,cleaned_reviews,tokens
0,As usual the flight is delayed. BA try to blam...,2023,as usual the flight is delayed ba try to blame...,"[usual, flight, delay, ba, try, blame, inabili..."
1,I had the most fantastic BA Flight today. The ...,2023,i had the most fantastic ba flight today the c...,"[fantastic, ba, flight, today, cabin, crew, se..."
2,Couldn’t book in online. Arrived at check in t...,2023,couldn book in online arrived at check in to f...,"[couldn, book, online, arrive, check, find, bu..."
3,London Heathrow to Mumbai in a Boeing 787-8 in...,2023,london heathrow to mumbai in boeing in busines...,"[london, heathrow, mumbai, boeing, business, c..."
4,"Keflavík, Iceland to London Heathrow on an A32...",2023,keflav iceland to london heathrow on an in bus...,"[keflav, iceland, london, heathrow, business, ..."


In [11]:
def inspector(index_value=0):
    print(texts['review'][index_value],"\n")
    print(texts['cleaned_reviews'][index_value],"\n")
    print(texts['tokens'][index_value])
inspector(10)

A simple story with an unfortunate outcome that really could happen to anyone. My partner and I recently started working after studying purchased two tickets to travel from London City Airport to Frankfurt. When we purchased the tickets, I mistakenly entered my name twice (e.g. Mr John Smith and Ms John Smith). Little did we know that our 1 simple mistake would cost us over 300 pounds. Upon arriving at the airport we were told there was no way to change the name (apparently they can only change 3 letters where there has been a typo?) and I had no other option to purchase the last remaining ticket if I wanted to board the flight - the price: almost seven times (!) higher than my original ticket. Zero empathy was shown. Zero alternative was offered. Trusting BA's staff and under the pretence that there was apparently no other way we could board the flight we bought this ticket. Immediately after I purchased the ticket I contacted BA's 'Commercial Change Booking Team' and informed them of

In [12]:
# Compute bigrams
from gensim.models import Phrases
from gensim.models.phrases import Phraser,ENGLISH_CONNECTOR_WORDS

In [13]:
docs = texts['tokens'].tolist()

In [14]:
def inspector_two(index_value=0):
    print(docs[index_value])
inspector_two(10)

['simple', 'story', 'unfortunate', 'outcome', 'happen', 'partner', 'recently', 'start', 'work', 'study', 'purchase', 'ticket', 'travel', 'london', 'city', 'airport', 'frankfurt', 'purchase', 'ticket', 'mistakenly', 'enter', 'twice', 'mr', 'john', 'smith', 'ms', 'john', 'smith', 'little', 'know', 'simple', 'mistake', 'cost', 'over', 'pound', 'arrive', 'airport', 'tell', 'no', 'way', 'change', 'apparently', 'change', 'letter', 'typo', 'no', 'option', 'purchase', 'remain', 'ticket', 'want', 'board', 'flight', 'price', 'seven', 'time', 'high', 'original', 'ticket', 'zero', 'empathy', 'show', 'zero', 'alternative', 'offer', 'trust', 'ba', 'staff', 'pretence', 'apparently', 'no', 'way', 'board', 'flight', 'buy', 'ticket', 'immediately', 'purchase', 'ticket', 'contact', 'ba', 'commercial', 'change', 'book', 'team', 'inform', 'situation', 'service', 'representative', 'apologise', 'tell', 'change', 'cost', 'small', 'fee', 'offer', 'cancel', 'original', 'ticket', 'issue', 'partial', 'refund', 'a

In [15]:
# https://stackoverflow.com/questions/56909294/how-to-set-time-slices-dynamic-topic-model
# You must order from oldest date to newest date
texts = texts.sort_values(by='year',ascending=True) 

In [16]:
texts= texts.reset_index()

In [17]:
texts.head()

Unnamed: 0,index,review,year,cleaned_reviews,tokens
0,3603,SIN-LHR BA12 B747-436 First Class. Old aircraf...,2011,sin lhr ba first class old aircraft with seats...,"[sin, lhr, ba, class, old, aircraft, seat, not..."
1,3602,London City-New York JFK via Shannon on A318 b...,2011,london city new york jfk via shannon on but ha...,"[london, city, new, york, jfk, shannon, nice, ..."
2,3601,My son who had worked for British Airways urge...,2011,my son who had worked for british airways urge...,"[son, work, british, airways, urge, fly, briti..."
3,3589,Heathrow Marrakech. Had previously travelled o...,2012,heathrow marrakech had previously travelled on...,"[heathrow, marrakech, previously, travel, brit..."
4,3590,Flew return in CW from LHR to BKK in August 20...,2012,flew return in cw from lhr to bkk in august th...,"[fly, return, cw, lhr, bkk, august, positive, ..."


In [18]:
texts.drop('index',axis=1,inplace=True)

In [19]:
texts.head()

Unnamed: 0,review,year,cleaned_reviews,tokens
0,SIN-LHR BA12 B747-436 First Class. Old aircraf...,2011,sin lhr ba first class old aircraft with seats...,"[sin, lhr, ba, class, old, aircraft, seat, not..."
1,London City-New York JFK via Shannon on A318 b...,2011,london city new york jfk via shannon on but ha...,"[london, city, new, york, jfk, shannon, nice, ..."
2,My son who had worked for British Airways urge...,2011,my son who had worked for british airways urge...,"[son, work, british, airways, urge, fly, briti..."
3,Heathrow Marrakech. Had previously travelled o...,2012,heathrow marrakech had previously travelled on...,"[heathrow, marrakech, previously, travel, brit..."
4,Flew return in CW from LHR to BKK in August 20...,2012,flew return in cw from lhr to bkk in august th...,"[fly, return, cw, lhr, bkk, august, positive, ..."


In [20]:
# Add bigrams to docs (only ones that appear 20 times or more)
bigram_phrases = Phrases(docs, min_count=20,connector_words=ENGLISH_CONNECTOR_WORDS)
bigram = Phraser(bigram_phrases)

In [21]:
def make_bigram(texts):
    return([bigram[doc] for doc in texts])

bigram_docs = make_bigram(docs)

In [22]:
def phrase_inspector(index_value=0):
    print(bigram_docs[index_value])
inspector(50)
print()
phrase_inspector(50)

Flight 103 on October 14. We flew from London to Calgary Alberta - a long cramped ride. It was on the new Dreamliner with very tight seats the people in front had to raise their seats when lunch was served as the trays hit our stomachs! Service almost non-existent even on a 9 hour trip! Beverages served once and then at dinner the cart with wine got two rows before us (row35) and they quickly returned it to the back as they felt they were late with the coffee didn't even ask us whether we would like anything. When the coffee did come it was lukewarm. There was not one good thing about our flight. We will never travel British Airways again. 

flight on october we flew from london to calgary alberta long cramped ride it was on the new dreamliner with very tight seats the people in front had to raise their seats when lunch was served as the trays hit our stomachs service almost non existent even on hour trip beverages served once and then at dinner the cart with wine got two rows before u

In [23]:
# Remove rare adn common tokens
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents
id2word = Dictionary(bigram_docs)
# or 
# dictionary = Dictionary(docs)

# Filter out words that occur in less than 20 documents, or 
# more than 50% of the documents
# https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.filter_extremes
id2word.filter_extremes(no_below=20,no_above=0.6)

In [24]:
id2word.most_common(15) # most common words

[('seat', 4111),
 ('no', 2770),
 ('service', 2468),
 ('fly', 2382),
 ('good', 2374),
 ('time', 2372),
 ('food', 2078),
 ('british_airways', 1606),
 ('hour', 1578),
 ('or', 1532),
 ('airline', 1478),
 ('check', 1463),
 ('staff', 1423),
 ('london', 1354),
 ('passenger', 1271)]

In [25]:
# Bag-of-words representation of the documents.
# Term document frequency 
bow_corpus = [id2word.doc2bow(doc) for doc in bigram_docs]

# Create corpus
text_corpus = bigram_docs

In [26]:
print(f"Number of unique tokens: {len(id2word)}")
print(f"Number of documents: {len(bow_corpus)}")

Number of unique tokens: 1594
Number of documents: 3604


In [27]:
# Verifyting BOW is set correctly
def bow_verifier(index_value=0):
    print("Original text and tokens")
    inspector(index_value)
    print("\nPhrases")
    phrase_inspector(index_value)
    print()
    print(f"BoW representation: {bow_corpus[index_value]}")

    doc = f"document_{index_value}"
    doc = bow_corpus[index_value]
    for i in range(len(doc)):
        print(f"Word {doc[i][0]}, {id2word[doc[i][0]]}, appears {doc[i][1]}")

bow_verifier(10)

Original text and tokens
Just got back from Bridgetown Barbados flying with British Airways and frankly couldn't wait to get off. I was so disappointed. I have travelled regularly with BA but the flights going over and coming back left a lot to be desired. The plane was in need of repair. I had water dripping from the ceiling where I was sitting and when I told the stewardess was told that was usual due to condensation and proceeded to stuff a tissue to stop the dripping. The toilets were so old that bits of the cladding were coming away. The food was awful and inedible. The staff were unapproachable and looked fed up. BA get your act together or you'll be losing flyers by the hundreds. 

just got back from bridgetown barbados flying with british airways and frankly couldn wait to get off was so disappointed have travelled regularly with ba but the flights going over and coming back left lot to be desired the plane was in need of repair had water dripping from the ceiling where was sit

In [28]:
texts.year.sort_values().unique()

array([2011, 2012, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022,
       2023], dtype=int64)

In [29]:
texts.year.value_counts().sort_values()

year
2011      3
2012     12
2021     87
2020    104
2023    170
2022    213
2018    324
2019    356
2014    465
2016    570
2017    572
2015    728
Name: count, dtype: int64

|year|document count|
|---|---|
|2011    |  3|
|2012    | 12|
|2021    | 87|
|2020    |104|
|2023    |170|
|2022    |213|
|2018    |324|
|2019    |356|
|2014    |465|
|2016    |570|
|2017    |572|
|2015    |728|

In [30]:
years = [2011, 2012, 2014, 2015, 2016, 2017, 2018, 
2019, 2020, 2021, 2022,2023]

time_slice_years = [3, 12, 87, 104, 170, 213, 324,  
356, 465, 570, 572,728]

In [31]:
from gensim import models
from gensim.matutils import hellinger
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

In [32]:
ldaseq = models.ldaseqmodel.LdaSeqModel(
    corpus=bow_corpus,
    id2word=id2word,
    time_slice=time_slice_years,
    num_topics=4,
    random_state=42,
    passes=300,
    chunksize=100
)

### **Topics Per Time** 

For each year there 4 topics

In [38]:
def print_topics_per_time(time:int,model):
    try:
        print(model.print_topics(time=time),"\n")
    except:
        print("Please choose a time slice from 0-11 and must be a integer")

In [39]:
for year_index in range(len(years)):
    print(f"{years[year_index]}")
    print_topics_per_time(year_index,ldaseq)

2011
[[('seat', 0.037770465193787545), ('no', 0.02066108119583653), ('service', 0.01554384072833168), ('fly', 0.014640918697196689), ('business_class', 0.014054585367620703), ('food', 0.01364037006093668), ('airline', 0.012360168117097255), ('or', 0.010289852704322887), ('economy', 0.009866513872450204), ('meal', 0.008950181555379641), ('offer', 0.0084928896916951), ('pay', 0.008183971933556335), ('passenger', 0.007288862809173547), ('hour', 0.006754035987738513), ('bad', 0.006685853113956454), ('time', 0.006559928814566636), ('cabin', 0.006553378767815983), ('staff', 0.005950992721772907), ('plane', 0.005814869357256031), ('british_airways', 0.0054697605547509975)], [('time', 0.018827096401337604), ('check', 0.017898390190656987), ('no', 0.017465522126246275), ('service', 0.015630728916312703), ('lounge', 0.015220701107767092), ('boarding', 0.014018452059129381), ('passenger', 0.011414465341432575), ('gate', 0.01073454640885411), ('good', 0.010438950178750085), ('delay', 0.00945441958

**Oberservations**

All years had four topics in each of them

Let's take topic 0 as our case study and see the position of the **word 'service' and its word probability: ('service', 0.01554384072833168)at the third index**, as the topics per time seems to be the same.

Over the years, service lost its significance, as the probability reduced over time.
Service lost its position in years 2019 - 2023

In [73]:
years

[2011, 2012, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]

### **Topics Over Time** 

In [40]:
# 6 topics: [0,1,2,3,4,5]
def print_topics_over_time(topic:int,model):
        try:
            print(model.print_topic_times(topic=topic),"\n")
        except IndexError:
            print("Please choose a topic number from 0 - 4 and must be an integer")

In [42]:
# Looking into each topic evolution over time.
for topic_number in range(4):
    print(f"Topic {topic_number} evolution over time")
    print_topics_over_time(topic_number,ldaseq)

Topic 0 evolution over time
[[('seat', 0.037770465193787545), ('no', 0.02066108119583653), ('service', 0.01554384072833168), ('fly', 0.014640918697196689), ('business_class', 0.014054585367620703), ('food', 0.01364037006093668), ('airline', 0.012360168117097255), ('or', 0.010289852704322887), ('economy', 0.009866513872450204), ('meal', 0.008950181555379641), ('offer', 0.0084928896916951), ('pay', 0.008183971933556335), ('passenger', 0.007288862809173547), ('hour', 0.006754035987738513), ('bad', 0.006685853113956454), ('time', 0.006559928814566636), ('cabin', 0.006553378767815983), ('staff', 0.005950992721772907), ('plane', 0.005814869357256031), ('british_airways', 0.0054697605547509975)], [('seat', 0.03789200770564521), ('no', 0.020711420562084807), ('service', 0.015578284292705878), ('fly', 0.014676093605785787), ('business_class', 0.014084762057600253), ('food', 0.013668925662207749), ('airline', 0.012386688720739478), ('or', 0.01030808030501611), ('economy', 0.009884395900601339), 

### **Document - Topic Proportions**

In [43]:
def doc_topic_proportion(doc_index=0):
    words = [id2word[word_id] for word_id,count in bow_corpus[doc_index]]
    print(words)

In [74]:
doc_topic_proportion(1000)

['staff', 'usual', 'food', 'good', 'meal', 'recommend', 'seat', 'spacious', 'think', 'time', 'lhr', 'no', 'aircraft', 'boarding', 'economy', 'journey', 'overall', 'give', 'pocket', 'storage', 'joke', 'especially', 'pay', 'let', 'return', 'little', 'support', 'apart', 'pleasant', 'need', 'uncomfortable', 'hard', 'virtually', 'second', 'leg', 'legroom', 'recline', 'wide', 'clean', 'ife', 'space', 'cabin', 'excellent', 'plus', 'world_traveller', 'miss', 'exception', 'adequate', 'odd', 'okay', 'attention', 'wonderful', 'big', 'whilst', 'storage_space', 'procedure', 'opportunity']


|Topic number|Topic description|
|---|---|
|Topic 0| In flight experience|
|Topic 1| Refreshments|
|Topic 2| Bad reviews|
|Topic 3| Good reviews|

In [75]:
# Checking the corpus topic distribution for documents
def doc_topic_distribution(doc_index=0):
    doc_topic_dist = ldaseq.doc_topics(doc_index) # documents
    print(doc_topic_dist)
    print()
doc_topic_distribution(1000)

[2.36204370e-01 1.51423380e-04 1.51423380e-04 7.63492783e-01]


In [84]:
# Testing with a dummy document
doc_review = ["hate","hostness","crew",'worthless',"terrible","refund",
                        'british_airways',"food","never","flight"]

doc_review = id2word.doc2bow(doc_review)
doc_review = ldaseq[doc_review]
print(doc_review)

[0.00198413 0.00198413 0.53681234 0.45921941]


In [85]:
def compute_words_topic_dist(doc_id:int,model):

    """Computes the words and topic distribution 
    
    Parameters
        ----------
        doc_id : document id
            Input document id.
        model: topic model
            Input topic model
      
        Returns
        ------
        list of words
        topic distribution of a document
         """
    try:
        words = [id2word[word_id] for word_id,count in bow_corpus[doc_id]]
        doc_topic_dist = model.doc_topics(doc_id)
        return words,doc_topic_dist
    except (TypeError):
        print("Wrong type was passed")
    except (IndexError):
        print("Wrong index passed, max doc_id is 17639")

In [48]:
words_1000,doc_topic_dist_1000 = compute_words_topic_dist(1000,ldaseq)
print(words_1000)
print(doc_topic_dist_1000)

['staff', 'usual', 'food', 'good', 'meal', 'recommend', 'seat', 'spacious', 'think', 'time', 'lhr', 'no', 'aircraft', 'boarding', 'economy', 'journey', 'overall', 'give', 'pocket', 'storage', 'joke', 'especially', 'pay', 'let', 'return', 'little', 'support', 'apart', 'pleasant', 'need', 'uncomfortable', 'hard', 'virtually', 'second', 'leg', 'legroom', 'recline', 'wide', 'clean', 'ife', 'space', 'cabin', 'excellent', 'plus', 'world_traveller', 'miss', 'exception', 'adequate', 'odd', 'okay', 'attention', 'wonderful', 'big', 'whilst', 'storage_space', 'procedure', 'opportunity']
[2.36204370e-01 1.51423380e-04 1.51423380e-04 7.63492783e-01]


In [49]:
words_2000,doc_topic_dist_2000 = compute_words_topic_dist(2000,ldaseq)
print(words_2000)
print(doc_topic_dist_2000)

['staff', 'try', 'cabin_crew', 'fly', 'food', 'good', 'hour', 'seat', 'time', 'airline', 'book', 'helpful', 'lhr', 'boarding', 'british_airways', 'configuration', 'departure', 'hear', 'service', 'bag', 'option', 'room', 'drink', 'wine', 'attentive', 'choice', 'screen', 'decide', 'stretch', 'cabin', 'welcome', 'bit', 'connect', 'professional', 'fare', 'centre', 'privacy', 'sydney', 'smooth', 'drop', 'lovely', 'report', 'september', 'singapore', 'secure', 'competitive']
[1.92159877e-04 1.92159877e-04 3.32521864e-02 9.66363494e-01]


The Hellinger distance ranges from 0 to 1, with 0 indicating that the two distributions are identical, and 1 indicating that they are completely dissimilar.

In [87]:
np.round(hellinger(doc_topic_dist_1000,doc_topic_dist_2000),4)
# They are somewhat similar

0.3967

The above topics are highly dissimilar

### **Choosing your best Dynamic Topic Model**

**Chain Variance**


One of the key aspects of topic evolution is how fast/slow these topics evolve. And this is where the factor of variance comes in. By setting the chain_variance input to the DTM model higher, we can tweak our topic evolution. The default value is 0.005. (this is the value suggested by Blei in his tech talk and is the default value in the C++ code)

In [52]:
# Default chain_variance value: 0.005
# Let's increase the value
ldaseq_chain = models.ldaseqmodel.LdaSeqModel(
                                        corpus=bow_corpus,
                                        id2word=id2word,
                                        time_slice=time_slice_years,
                                        num_topics=4,
                                        random_state=42,
                                        passes=300,
                                        chunksize=100,
                                        chain_variance=0.05
                                        )

In [53]:
for year_index in range(len(years)):
    print(f"{years[year_index]}")
    print_topics_per_time(year_index,ldaseq_chain)

2011
[[('seat', 0.04371517020613157), ('service', 0.020293293830283984), ('no', 0.019859138554178663), ('business_class', 0.01944989656996608), ('fly', 0.017058831696869472), ('economy', 0.013030197579637172), ('food', 0.013004271547611737), ('airline', 0.01217381646340316), ('pay', 0.009866092181403332), ('passenger', 0.009717247484255032), ('time', 0.009219555172142782), ('offer', 0.009017310084924927), ('or', 0.008979322771681505), ('meal', 0.00897089370392561), ('hour', 0.008374855481426526), ('staff', 0.008129063698898771), ('bad', 0.00732756173372573), ('plane', 0.006371431475772205), ('small', 0.005449170092646265), ('cabin', 0.00538402080287501)], [('no', 0.02371596954496203), ('check', 0.02062201172605952), ('gate', 0.018112027331920487), ('time', 0.01749184746854564), ('service', 0.01645582895707078), ('boarding', 0.016025657685082826), ('delay', 0.015427747802047202), ('lounge', 0.014391216755762902), ('staff', 0.012809215723515434), ('passenger', 0.009716155228372861), ('mi

**Oberservations**

All years had four topics in each of them

Let's take topic 0 as our case study and see the position of the **word 'business_class' and its word probability: ('business_class', 0.01944989656996608) at the fourth index**, as the topics per time seems to be the same.


- business_class lost its position in years 2014, 2015
- business_class increased to the third_index in years 2016-2017
- business_class dropped further down in years 2018-2023

|Topic number|Topic description|
|---|---|
|Topic 0| Refreshments |
|Topic 1| In flight experience|
|Topic 2| Bad reviews|
|Topic 3| Good reviews|

In [54]:
# Looking into each topic evolution over time.
for topic_number in range(4):
    print(f"Topic {topic_number} evolution over time")
    print_topics_over_time(topic_number,ldaseq_chain)

Topic 0 evolution over time
[[('seat', 0.04371517020613157), ('service', 0.020293293830283984), ('no', 0.019859138554178663), ('business_class', 0.01944989656996608), ('fly', 0.017058831696869472), ('economy', 0.013030197579637172), ('food', 0.013004271547611737), ('airline', 0.01217381646340316), ('pay', 0.009866092181403332), ('passenger', 0.009717247484255032), ('time', 0.009219555172142782), ('offer', 0.009017310084924927), ('or', 0.008979322771681505), ('meal', 0.00897089370392561), ('hour', 0.008374855481426526), ('staff', 0.008129063698898771), ('bad', 0.00732756173372573), ('plane', 0.006371431475772205), ('small', 0.005449170092646265), ('cabin', 0.00538402080287501)], [('seat', 0.04384466144024092), ('service', 0.020344517138258692), ('no', 0.019904679487214392), ('business_class', 0.019496303250822456), ('fly', 0.01709583907323249), ('economy', 0.013055769018480986), ('food', 0.013029826080463556), ('airline', 0.012202514044860772), ('pay', 0.009881568396900499), ('passenger

In [88]:
words_1000,doc_topic_dist_1000 = compute_words_topic_dist(1000,ldaseq_chain)
print(words_1000)
print(doc_topic_dist_1000)

['staff', 'usual', 'food', 'good', 'meal', 'recommend', 'seat', 'spacious', 'think', 'time', 'lhr', 'no', 'aircraft', 'boarding', 'economy', 'journey', 'overall', 'give', 'pocket', 'storage', 'joke', 'especially', 'pay', 'let', 'return', 'little', 'support', 'apart', 'pleasant', 'need', 'uncomfortable', 'hard', 'virtually', 'second', 'leg', 'legroom', 'recline', 'wide', 'clean', 'ife', 'space', 'cabin', 'excellent', 'plus', 'world_traveller', 'miss', 'exception', 'adequate', 'odd', 'okay', 'attention', 'wonderful', 'big', 'whilst', 'storage_space', 'procedure', 'opportunity']
[2.17640722e-01 1.51423380e-04 1.51423380e-04 7.82056431e-01]


In [89]:
words_2000,doc_topic_dist_2000 = compute_words_topic_dist(2000,ldaseq_chain)
print(words_2000)
print(doc_topic_dist_2000)

['staff', 'try', 'cabin_crew', 'fly', 'food', 'good', 'hour', 'seat', 'time', 'airline', 'book', 'helpful', 'lhr', 'boarding', 'british_airways', 'configuration', 'departure', 'hear', 'service', 'bag', 'option', 'room', 'drink', 'wine', 'attentive', 'choice', 'screen', 'decide', 'stretch', 'cabin', 'welcome', 'bit', 'connect', 'professional', 'fare', 'centre', 'privacy', 'sydney', 'smooth', 'drop', 'lovely', 'report', 'september', 'singapore', 'secure', 'competitive']
[1.92159877e-04 1.92159877e-04 1.14848494e-01 8.84767186e-01]


In [90]:
np.round(hellinger(doc_topic_dist_1000,doc_topic_dist_2000),4)
# They are somewhat similar

0.3967

### **LDA Model and DTM**

In [69]:
def plot_default_dtm(year_index=0):
    doc_topic, topic_term, doc_length, term_freq, vocab = ldaseq.dtm_vis(time=year_index,corpus=bow_corpus)
    dtm_vis = pyLDAvis.prepare(topic_term_dists=topic_term,
                                doc_topic_dists=doc_topic,
                                doc_lengths=doc_length,
                                vocab=vocab,
                                term_frequency=term_freq,
                                sort_topics=False
                                )
    pyLDAvis.save_html(dtm_vis, f'topic_visuals_bigrams/dtm/dtm4_{years[year_index]}_default_chain_var.html')
    pyLDAvis.display(dtm_vis) # 2011 topics

In [70]:
def plot_modified_dtm(year_index=0):
    doc_topic, topic_term, doc_length, term_freq, vocab = ldaseq_chain.dtm_vis(time=year_index,corpus=bow_corpus)
    dtm_vis = pyLDAvis.prepare(topic_term_dists=topic_term,
                                doc_topic_dists=doc_topic,
                                doc_lengths=doc_length,
                                vocab=vocab,
                                term_frequency=term_freq,
                                sort_topics=False
                                )
    pyLDAvis.save_html(dtm_vis, f'topic_visuals_bigrams/dtm/dtm4_{years[year_index]}_increased_chain_var.html')
    pyLDAvis.display(dtm_vis) # 2011 topics

In [71]:
for year_index in range(len(years)):
    plot_default_dtm(year_index)
    plot_modified_dtm(year_index)

### **Computing Coherence Score**

In [66]:
"""Get the coherence for each topic.

Can be used to measure the quality of the model, 
or to inspect the convergence through training via a callback.

Parameters
----------
time : int
The time slice.

Returns
-------
list of list of str
The word representation for each topic, for each time slice. 
This can be used to check the time coherence
of topics as time evolves: 
If the most relevant words remain the same then the topic has somehow
converged or is relatively static, if they change rapidly the topic is evolving.
"""

'Get the coherence for each topic.\n\nCan be used to measure the quality of the model, \nor to inspect the convergence through training via a callback.\n\nParameters\n----------\ntime : int\nThe time slice.\n\nReturns\n-------\nlist of list of str\nThe word representation for each topic, for each time slice. \nThis can be used to check the time coherence\nof topics as time evolves: \nIf the most relevant words remain the same then the topic has somehow\nconverged or is relatively static, if they change rapidly the topic is evolving.\n'

In [67]:
ldaseq.dtm_coherence(0)[0] # 2011, topic 0

['seat',
 'no',
 'service',
 'fly',
 'business_class',
 'food',
 'airline',
 'or',
 'economy',
 'meal',
 'offer',
 'pay',
 'passenger',
 'hour',
 'bad',
 'time',
 'cabin',
 'staff',
 'plane',
 'british_airways']

In [92]:
ldaseq.dtm_coherence(1)[0] # 2011, topic 0

['seat',
 'no',
 'service',
 'fly',
 'business_class',
 'food',
 'airline',
 'or',
 'economy',
 'meal',
 'offer',
 'pay',
 'passenger',
 'hour',
 'bad',
 'time',
 'cabin',
 'staff',
 'plane',
 'british_airways']

In [95]:
# def dtm_coherence(time1=0,time2=1,topic=0):
#     print(f"Coherence of {years[time1]} and {years[time2]}")
#     time_data = {
#                 years[time1]:[ldaseq.dtm_coherence(time1)[topic]],
#                 years[time2]:[ldaseq.dtm_coherence(time2)[topic]]
#     }
#     time_frame = pd.DataFrame(time_data)
#     return (time_frame)
# dtm_comapre = dtm_coherence()
# dtm_comapre

Coherence of 2011 and 2012


Unnamed: 0,2011,2012
0,"[seat, no, service, fly, business_class, food,...","[seat, no, service, fly, business_class, food,..."
