<a href="https://colab.research.google.com/github/clementh626/ANLP-Project/blob/main/Topic_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Authenticate and access Google Sheets
# Colab-specific: prompts the user to authorize this runtime before any Google API call.
from google.colab import auth
auth.authenticate_user()

import gspread
from google.auth import default
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Spreadsheet holding the review data; kept as a named constant so it can be
# swapped in one place.
SHEET_URL = 'https://docs.google.com/spreadsheets/d/1rmuxCVt7WhjNyE1vMfdXxIaKn5MPgFRXdHLvBm7L2Ng/edit?gid=1118986519#gid=1118986519'

# Google Sheets authentication using the credentials obtained above
creds, _ = default()
gc = gspread.authorize(creds)

# Open the spreadsheet by URL and take its `sheet1` worksheet.
# NOTE(review): the URL carries a specific gid, but `.sheet1` does not read it —
# confirm that the intended tab is the one `.sheet1` returns.
worksheet = gc.open_by_url(SHEET_URL).sheet1

# Fetch all data from the worksheet (a list of rows; the first row is the header)
rows = worksheet.get_all_values()

# Fail with a clear message instead of an opaque IndexError on rows[0] below
if not rows:
    raise ValueError(f"No data returned from worksheet at {SHEET_URL}")

# Step 2: Convert the worksheet data to a DataFrame, using the first row as
# column names and the remaining rows as data
df = pd.DataFrame(rows[1:], columns=rows[0])

# Print the DataFrame to verify
print(df)

# Step 3: Define the topic modeling pipeline (it is called in the next cell)

# Example pipeline for multiple rating categories

def topic_modeling_pipeline(df, category_columns, n_topics=5, n_top_words=10):
    """
    Extract LDA keywords from review titles, separately for each rating category.

    For every column in ``category_columns`` the corpus consists of the
    'Review Title' values of the rows where both the title and that category's
    rating are present. A cell counts as missing when it is NaN/None or an
    empty / whitespace-only string, so blank cells in string-typed data are
    excluded as well (``dropna`` alone does not remove them). An LDA model is
    then fitted on a bag-of-words matrix of that corpus and the top words of
    every topic are collected.

    Args:
        df (pd.DataFrame): Input dataframe; must contain a 'Review Title'
            column and every column named in ``category_columns``.
        category_columns (list): Column names of the rating categories to focus on.
        n_topics (int): Number of topics for LDA.
        n_top_words (int): Number of top words to extract from each topic.

    Returns:
        dict: Maps each rating category to a list of unique keywords, in
        first-seen order (topic by topic, highest-weighted word first), so
        the result is stable for a given fitted model. A category with no
        usable rows maps to an empty list.

    Raises:
        KeyError: If 'Review Title' or a category column is not in ``df``.
        ValueError: Propagated from CountVectorizer when the corpus is too
            small to yield a vocabulary under the min_df / max_df pruning.
    """

    def _has_value(series):
        # True where the cell is neither NaN/None nor an empty / whitespace-only string.
        return series.notna() & series.astype(str).str.strip().ne('')

    # The title mask does not depend on the category, so compute it once.
    titles = df['Review Title']
    title_present = _has_value(titles)

    # Keywords collected per category
    output_dict = {}

    # Loop through each rating category
    for category_column in category_columns:
        # Keep only rows that have both a title and a rating for this category
        usable_rows = title_present & _has_value(df[category_column])
        corpus = titles[usable_rows].tolist()

        # Nothing to model for this category: report no keywords instead of
        # letting the vectorizer fail on an empty corpus.
        if not corpus:
            output_dict[category_column] = []
            continue

        # Step 1: Create document-term matrix using CountVectorizer.
        # Terms in more than 95% of documents or in fewer than 2 documents are dropped.
        vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
        dtm = vectorizer.fit_transform(corpus)

        # Step 2: Apply Latent Dirichlet Allocation (LDA); fixed seed for reproducibility
        lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
        lda.fit(dtm)

        # Step 3: Extract top words for each topic
        feature_names = vectorizer.get_feature_names_out()
        keywords = []

        for topic in lda.components_:
            # argsort is ascending; the reversed slice yields the indices of
            # the n_top_words largest weights, largest first.
            top_indices = topic.argsort()[:-n_top_words - 1:-1]
            keywords.extend(feature_names[i] for i in top_indices)

        # De-duplicate while preserving order (a set would reorder between runs)
        output_dict[category_column] = list(dict.fromkeys(keywords))

    # Return the dictionary with relevant keywords for each rating category
    return output_dict

                       Company Page  \
0            0     101-Network    1   
1            1        1010data    1   
2            2  10X%20Genomics    1   
3            3  10X%20Genomics    1   
4            4  10X%20Genomics    1   
...        ...             ...  ...   
203536  203536          zeotap    1   
203537  203537          zimlab    1   
203538  203538            ztoo    1   
203539  203539         zuumers    1   
203540  203540           zyfra    1   

                                             Review Title Overall Rating  \
0                                            Not so great              3   
1       Ok work life balance, complex product operatin...              2   
2                                     Work hard play hard              4   
3                             Great internship experience              4   
4                                          So far so good              5   
...                                                   ...            ...   

In [None]:
# Rating categories whose review titles are mined for keywords
categories = ['Work Life Balance', 'Management', 'Company Culture']

# Run the pipeline with its default number of topics and top words
result_dict = topic_modeling_pipeline(df, categories)

# Heading on the first line, the category -> keywords mapping on the next
print("Keywords for each topic:", result_dict, sep="\n")

Keywords for each topic:
{'Work Life Balance': ['start', 'pay', 'poor', 'culture', 'awesome', 'team', 'better', 'compensation', 'good', 'slow', 'place', 'best', 'work', 'people', 'company', 'bad', 'learning', 'career', 'life', 'lots', 'leadership', 'management', 'experience', 'tech', 'fast', 'growth', 'decent', 'environment', 'ok', 'great', 'working', 'balance', 'okay', 'wlb', 'like', 'opportunities', 'overall'], 'Management': ['start', 'pay', 'poor', 'culture', 'awesome', 'team', 'better', 'compensation', 'good', 'slow', 'place', 'best', 'work', 'people', 'company', 'bad', 'learning', 'career', 'life', 'lots', 'leadership', 'management', 'experience', 'tech', 'fast', 'growth', 'decent', 'environment', 'ok', 'great', 'working', 'balance', 'okay', 'wlb', 'like', 'opportunities', 'overall'], 'Company Culture': ['start', 'pay', 'poor', 'culture', 'awesome', 'team', 'better', 'compensation', 'good', 'slow', 'place', 'best', 'work', 'people', 'company', 'bad', 'learning', 'career', 'life', 