This Python topic-modeling script was generated with ChatGPT :D

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# Load the reviews subset into a DataFrame.
# NOTE(review): the path begins with four dots — confirm it resolves as intended.
df = pd.read_csv(filepath_or_buffer='....invincible_reviews_subset.csv')

In [3]:
# Extract the review texts from the DataFrame.
# Empty cells in the CSV are loaded by pandas as NaN (a float), which
# CountVectorizer rejects as an invalid document, so drop missing reviews
# and coerce the remaining values to str before vectorizing.
reviews_data = df['Review'].dropna().astype(str).tolist()

In [4]:
# Bag-of-words step: turn every review into a row of token counts.
# Terms found in more than 95% of the reviews, or in fewer than 2 reviews,
# are left out, as are scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
X = vectorizer.fit_transform(reviews_data)

In [5]:
# Topic model: LDA trained with the online learning method and a fixed seed.
num_topics = 2  # number of topics to learn; change to taste
lda_model = LatentDirichletAllocation(
    n_components=num_topics,
    learning_method='online',
    max_iter=10,
    random_state=42,
)
lda_model.fit(X)

In [6]:
def display_topics(model, feature_names, no_top_words):
    """Return a DataFrame listing the highest-weighted words of each topic.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, an iterable of per-topic weight
        arrays (each must support ``.argsort()``).
    feature_names : sequence
        Vocabulary terms, indexable by the positions found in each weight array.
    no_top_words : int
        How many words to keep per topic.

    Returns
    -------
    pandas.DataFrame
        One column per topic (labelled by the topic's 0-based index); rows are
        ordered from the heaviest word downwards.
    """
    return pd.DataFrame({
        # Sort ascending, flip to descending, then keep the leading entries.
        idx: [feature_names[j] for j in weights.argsort()[::-1][:no_top_words]]
        for idx, weights in enumerate(model.components_)
    })

In [7]:
# Start with no per-topic frames; the loop in the following cell appends to this list.
dfs = []

# How many of the highest-weighted words to keep for each topic
no_top_words = 10

# Vocabulary terms (words) learned by the vectorizer
feature_names = vectorizer.get_feature_names_out()

# Table of top words: one column per topic
topics_df = display_topics(lda_model, feature_names, no_top_words)

In [8]:
# Build one word/frequency table per topic.
# The list is (re)initialized in this cell so that re-running the cell does not
# keep appending duplicate tables to a list created in an earlier cell.
dfs = []
for topic_idx, top_words in topics_df.items():
    word_freq_data = []
    for word in top_words:
        # Column position of the word in the document-term matrix
        word_index = vectorizer.vocabulary_[word]
        # Count of the word across all documents (corpus-wide, not per topic).
        # Summing the sparse column directly avoids converting it to a dense
        # array and adding its entries one by one in Python.
        frequency = X[:, word_index].sum()
        # Record the word, its 1-based topic label, and its frequency
        word_freq_data.append({'Word': word, 'Topic': f'Topic {topic_idx+1}', 'Frequency': frequency})
    # One dataframe per topic, collected for concatenation later
    dfs.append(pd.DataFrame(word_freq_data))


In [9]:
# Stack the per-topic tables on top of each other, renumbering the rows from 0.
word_freq_df = pd.concat(objs=dfs, axis=0, ignore_index=True)

In [10]:
# Heading followed by the plain-text rendering of the table, one per line.
print("Top words and their frequency by topics:", word_freq_df, sep="\n")

Top words and their frequency by topics:
          Word    Topic  Frequency
0         like  Topic 1        361
1         just  Topic 1        341
2         good  Topic 1        281
3        story  Topic 1        290
4       series  Topic 1        324
5       season  Topic 1        311
6   characters  Topic 1        243
7       really  Topic 1        221
8    animation  Topic 1        228
9    superhero  Topic 1        215
10     episode  Topic 2        248
11      series  Topic 2        324
12      season  Topic 2        311
13      comics  Topic 2        125
14        just  Topic 2        341
15       watch  Topic 2        205
16  invincible  Topic 2        219
17        like  Topic 2        361
18    episodes  Topic 2        106
19        wait  Topic 2         92


In [11]:
# Save the word/frequency table as CSV, leaving out the row index.
word_freq_df.to_csv(path_or_buf='word_frequency.csv', index=False)