#RERUN: Do Guided LDA modelling for 7 topics, and group data in 5 years:

Also, I want to probably model each group of 5 years at a time, seeing the output as well

---

##Step 4: Checkpoint for reloading data


In [1]:
import pandas as pd
import re

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import pickle
from tqdm import tqdm

In [3]:
#Run this if reinstancing the environment and don't want to go through the above steps:
df_loaded = pd.read_csv('/content/drive/MyDrive/IMDB Project/review_analysis/data/merged_data_pre_LDA.csv')

  df_loaded = pd.read_csv('/content/drive/MyDrive/IMDB Project/review_analysis/data/merged_data_pre_LDA.csv')


In [4]:
#Make a copy for editing
df = df_loaded.copy()

---

In [5]:
# There are 2 release dates, time to drop one of them and rename the other one
df.drop('release_date_y', axis=1, inplace=True)
df.rename(columns={'release_date_x': 'release_date'}, inplace=True)

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,reviewer,rating,imdb_id,title,preprocessed_review,release_date,Genres
0,0,bob the moo,,tt0114126,Thunderbolt,"['jackie', 'loh', 'chan', 'motor', 'mechanic',...",1995-08-05,"Action, Crime, Thriller"
1,1,sagacity_,,tt0114126,Thunderbolt,"['one', 'important', 'thing', 'jackie', 'chan'...",1995-08-05,"Action, Crime, Thriller"
2,2,rutt13-1,8.0,tt0114126,Thunderbolt,"['read', 'somewher', 'jackie', 'still', 'recov...",1995-08-05,"Action, Crime, Thriller"
3,3,ma-cortes,6.0,tt0114126,Thunderbolt,"['moving', 'picture', 'deal', 'chan', 'foh', '...",1995-08-05,"Action, Crime, Thriller"
4,4,OllieSuave-007,6.0,tt0114126,Thunderbolt,"['another', 'action-packed', 'movie', 'starrin...",1995-08-05,"Action, Crime, Thriller"


---

##Step 5: Preparing for LDA - By Date

>Before we perform LDA, we need to prepare our data. Since you want to perform LDA by year and by genre, we need to group our data accordingly.


>We'll start with grouping by every 5 years

In [7]:
import numpy as np

In [8]:
# Convert 'release_date' to datetime format
df['release_date'] = pd.to_datetime(df['release_date'])

# Extract year from 'release_date'
df['year'] = df['release_date'].dt.year

In [9]:
# Create a new column 'year_group' to represent each 5-year period
df['year_group'] = (df['year'] // 5) * 5
# Display the first few rows of the dataframe
df.head()

Unnamed: 0.1,Unnamed: 0,reviewer,rating,imdb_id,title,preprocessed_review,release_date,Genres,year,year_group
0,0,bob the moo,,tt0114126,Thunderbolt,"['jackie', 'loh', 'chan', 'motor', 'mechanic',...",1995-08-05,"Action, Crime, Thriller",1995.0,1995.0
1,1,sagacity_,,tt0114126,Thunderbolt,"['one', 'important', 'thing', 'jackie', 'chan'...",1995-08-05,"Action, Crime, Thriller",1995.0,1995.0
2,2,rutt13-1,8.0,tt0114126,Thunderbolt,"['read', 'somewher', 'jackie', 'still', 'recov...",1995-08-05,"Action, Crime, Thriller",1995.0,1995.0
3,3,ma-cortes,6.0,tt0114126,Thunderbolt,"['moving', 'picture', 'deal', 'chan', 'foh', '...",1995-08-05,"Action, Crime, Thriller",1995.0,1995.0
4,4,OllieSuave-007,6.0,tt0114126,Thunderbolt,"['another', 'action-packed', 'movie', 'starrin...",1995-08-05,"Action, Crime, Thriller",1995.0,1995.0


# Step 6: Installing GuidedLDA

In [10]:
!pip install lda



In [11]:
!wget https://raw.githubusercontent.com/dex314/GuidedLDA_WorkAround/master/glda_datasets.py
!wget https://raw.githubusercontent.com/dex314/GuidedLDA_WorkAround/master/guidedlda.py
!wget https://raw.githubusercontent.com/dex314/GuidedLDA_WorkAround/master/guidedutils.py


--2023-07-31 05:36:34--  https://raw.githubusercontent.com/dex314/GuidedLDA_WorkAround/master/glda_datasets.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 929 [text/plain]
Saving to: ‘glda_datasets.py.1’


2023-07-31 05:36:35 (46.1 MB/s) - ‘glda_datasets.py.1’ saved [929/929]

--2023-07-31 05:36:35--  https://raw.githubusercontent.com/dex314/GuidedLDA_WorkAround/master/guidedlda.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13196 (13K) [text/plain]
Saving to: ‘guidedlda.py.1’


2023-07-31 05:36:35 (113 MB/s) - ‘guidedlda.

## Step 7 Conducting Guided LDA on each group of 5 years

In [12]:
df

Unnamed: 0.1,Unnamed: 0,reviewer,rating,imdb_id,title,preprocessed_review,release_date,Genres,year,year_group
0,0,bob the moo,,tt0114126,Thunderbolt,"['jackie', 'loh', 'chan', 'motor', 'mechanic',...",1995-08-05,"Action, Crime, Thriller",1995.0,1995.0
1,1,sagacity_,,tt0114126,Thunderbolt,"['one', 'important', 'thing', 'jackie', 'chan'...",1995-08-05,"Action, Crime, Thriller",1995.0,1995.0
2,2,rutt13-1,8.0,tt0114126,Thunderbolt,"['read', 'somewher', 'jackie', 'still', 'recov...",1995-08-05,"Action, Crime, Thriller",1995.0,1995.0
3,3,ma-cortes,6.0,tt0114126,Thunderbolt,"['moving', 'picture', 'deal', 'chan', 'foh', '...",1995-08-05,"Action, Crime, Thriller",1995.0,1995.0
4,4,OllieSuave-007,6.0,tt0114126,Thunderbolt,"['another', 'action-packed', 'movie', 'starrin...",1995-08-05,"Action, Crime, Thriller",1995.0,1995.0
...,...,...,...,...,...,...,...,...,...,...
3671316,3680052,The Best of the Lads 2021,It is long but definitely worth the watch. Thi...,Tikibaja147,9,['tt16293944'],NaT,,,
3671317,3680053,The Best of the Lads 2021,This collection is the last in the Pavilion sa...,Ragnarokslayer746,10,['tt16293944'],NaT,,,
3671318,3680054,The Best of the Lads 2021,The amount of pressure making this must've bee...,Driver1268964,9,['tt16293944'],NaT,,,
3671319,3680055,The Best of the Lads 2021,Ending a 'saga' of films is hard. But the best...,Joemamaha,10,['tt16293944'],NaT,,,


In [13]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim.matutils import sparse2full

In [14]:
import lda
import plotly.express as px
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import pickle
from tqdm import tqdm

In [15]:
import lda.guidedlda

In [16]:
# Define the 5-year intervals
intervals = [(2022,2023),(2010,2023), (1980,2009)]

In [17]:
# Define the seed topics
seed_topic_list = [['actor', 'actress', 'role', 'cast', 'star', 'performer'],
                   ['plot', 'story', 'narrative', 'storyline', 'plotline', 'script', 'screenplay', 'twist'],
                   ['cinematography', 'camera', 'visual', 'shot', 'scene', 'lighting', 'photography', 'angle'],
                   ['effect', 'cgi', 'animation', 'visual', 'special', 'graphic', '3d', 'technology'],
                   ['music', 'soundtrack', 'score', 'song', 'theme', 'sound', 'audio', 'composer'],
                   ['character', 'role', 'cast', 'protagonist', 'antagonist', 'hero', 'villain', 'personality'],
                   ['message', 'symbol', 'idea', 'concept', 'philosophy', 'meaning', 'motif']]


In [7]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [18]:
# Create a dictionary that maps word to topic
word2topic = {}
for i, words in enumerate(seed_topic_list):
    for word in words:
        word2topic[word] = i

Model Saving

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

# For each interval
for start, end in tqdm(intervals):
    # Filter the dataframe to only include rows from the current interval
    df_interval = df[(df['year'] >= start) & (df['year'] <= end)]

    # Create a Gensim dictionary from the processed reviews
    dictionary = Dictionary(df_interval['preprocessed_review'].str.split())

    # Create a CountVectorizer instance
    vectorizer = CountVectorizer(vocabulary=list(dictionary.token2id.keys()))

    # Convert the corpus to a document-term matrix
    X = vectorizer.fit_transform([' '.join(text) for text in df_interval['preprocessed_review'].str.split()])

    # Create a seed structure that can be used by GuidedLDA
    seed_topics = {}
    for t_id, st in enumerate(seed_topic_list):
        for word in st:
            try:
                seed_topics[dictionary.token2id[word]] = t_id
            except KeyError:  # If the word is not in the dictionary, ignore it
                pass

    # Create a GuidedLDA model
    model = lda.guidedlda.GuidedLDA(n_topics=7, n_iter=100, random_state=42, refresh=20)

    # Fit the model
    model.fit(X, seed_topics=seed_topics, seed_confidence=0.5)

    # Save the LdaModel
    with open(f'/content/drive/MyDrive/IMDB Project/review_analysis/Guided_LDA_models/lda_{start}_{end}.pkl', 'wb') as file:
        pickle.dump(model, file)

    # Load the LdaModel
    with open(f'/content/drive/MyDrive/IMDB Project/review_analysis/Guided_LDA_models/lda_{start}_{end}.pkl', 'rb') as f:
        model = pickle.load(f)

    # Get the topic-word matrix
    topic_word_matrix = model.topic_word_

    # Plot the distribution for each topic
    for i in range(topic_word_matrix.shape[0]):
        # Create a dataframe for the current topic
        df_topic = pd.DataFrame({'Word': [dictionary[i] for i in range(len(dictionary))], 'Importance': topic_word_matrix[i, :]})

        # Sort the dataframe by importance
        df_topic = df_topic.sort_values('Importance', ascending=False)

        # Create a bar plot using Plotly
        fig = px.bar(df_topic.head(10), x='Word', y='Importance', title=f'Topic {i} Word Distribution for {start}-{end}')
        fig.show()





Upper case characters found in vocabulary while 'lowercase' is True. These entries will not be matched with any documents





100%|██████████| 3/3 [10:24<00:00, 208.18s/it]


Visualization

In [None]:
!pip install pyLDAvis

In [None]:
!pip install gensim matplotlib

In [None]:
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# For each interval
for start, end in tqdm(intervals):
    # Load the LdaModel
    with open(f'/content/drive/MyDrive/IMDB Project/review_analysis/Guided_LDA_models/lda_{start}_{end}.pkl', 'rb') as f:
        model = pickle.load(f)

    # Print the top words for each topic
    for i, topic in enumerate(model.topic_word_):
        top_words = np.argsort(topic)[::-1][:10]
        print(f"Top words for topic {i}:")
        print([dictionary[j] for j in top_words])
        print("\n")

    # Calculate and print the coherence of the model
    cm = CoherenceModel(model=model, texts=df['preprocessed_review'].str.split(), dictionary=dictionary, coherence='c_v')
    coherence = cm.get_coherence()
    print(f"Coherence of the model for interval {start}-{end}: {coherence}\n")

    # Plot the coherence
    plt.figure(figsize=(10,5))
    plt.plot(range(1, len(cm.get_coherence_per_topic())+1), cm.get_coherence_per_topic())
    plt.xlabel("Topic")
    plt.ylabel("Coherence score")
    plt.title(f"Coherence score per topic for interval {start}-{end}")
    plt.show()


In [None]:
import pyLDAvis
import numpy as np

# For each interval
for start, end in tqdm(intervals):
    # Load the LdaModel
    with open(f'/content/drive/MyDrive/IMDB Project/review_analysis/Guided_LDA_models/lda_{start}_{end}.pkl', 'rb') as f:
        model = pickle.load(f)

    # Filter the dataframe to only include rows from the current interval
    df_interval = df[(df['year'] >= start) & (df['year'] <= end)]

    # Create a Gensim dictionary from the processed reviews
    dictionary = Dictionary(df_interval['preprocessed_review'].str.split())

    # Create a CountVectorizer instance
    vectorizer = CountVectorizer(vocabulary=list(dictionary.token2id.keys()))

    # Convert the corpus to a document-term matrix
    X = vectorizer.fit_transform([' '.join(text) for text in df_interval['preprocessed_review'].str.split()])

    # Get the topic-word matrix
    topic_word_matrix = model.topic_word_

    # Prepare data for pyLDAvis
    doc_topic_matrix = model.transform(X)
    doc_topic_matrix = doc_topic_matrix / doc_topic_matrix.sum(axis=1, keepdims=True)  # Normalize the rows to sum to 1
    doc_lengths = [len(doc) for doc in df_interval['preprocessed_review'].str.split()]
    term_frequency = np.squeeze(np.asarray(np.sum(X, axis=0)))
    vocab = list(dictionary.token2id.keys())

    data = {
        'topic_term_dists': topic_word_matrix,
        'doc_topic_dists': doc_topic_matrix,
        'doc_lengths': doc_lengths,
        'vocab': vocab,
        'term_frequency': term_frequency
    }

    # Create the visualization
    vis = pyLDAvis.prepare(**data)

    # Display the visualization
    pyLDAvis.display(vis)


## Step 8:

Conducting LDA on ALL the years together

In [None]:
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
import pickle

# Initialize a CountVectorizer to convert the text data into a bag-of-words
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000)

# Fit and transform the processed reviews
count_data = count_vectorizer.fit_transform(df['preprocessed_review'])

# Save the feature names
feature_names = count_vectorizer.get_feature_names_out()
with open('/content/drive/MyDrive/IMDB Project/review_analysis/LDA_models/feature_names_all_years.pkl', 'wb') as file:
    pickle.dump(feature_names, file)

# Initialize the LDA model
lda = LDA(n_components=7, random_state=42)

# Fit the model to the count data
lda.fit(count_data)

# Save the LDA model
with open('/content/drive/MyDrive/IMDB Project/review_analysis/LDA_models/lda_all_years.pkl', 'wb') as file:
    pickle.dump(lda, file)
