In [None]:

# Topic Modeling

# Download necessary NLTK resources: the English stop-word list and the
# Punkt tokenizer data.
for _resource in ("stopwords", "punkt"):
    nltk.download(_resource)



# Text preprocessing function
_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Turn one raw review into a list of filtered tokens.

    Args:
        text: Raw review text. Missing values (anything ``pd.isna`` reports
            as missing) and the empty string are accepted.

    Returns:
        list[str]: The tokens produced by
        ``simple_preprocess(str(text), deacc=True)`` that are longer than
        three characters and are not NLTK English stop words. Missing or
        empty input yields ``[]``.
    """
    if pd.isna(text) or text == "":
        return []

    # Convert to lowercase and tokenize
    tokens = simple_preprocess(str(text), deacc=True)

    # Remove stopwords. The stop-word set is cached because this function is
    # applied to every review; rebuilding it per call repeats identical work.
    stop_words = _english_stop_words()
    return [word for word in tokens if word not in stop_words and len(word) > 3]

def perform_lda(reviews, num_topics=5, *, no_below=5, no_above=0.7,
                passes=10, random_state=42):
    """Train an LDA topic model on tokenized reviews.

    Args:
        reviews: List of token lists, one per review.
        num_topics: Number of topics to fit.
        no_below: Passed to ``Dictionary.filter_extremes``; tokens appearing
            in fewer than this many reviews are dropped.
        no_above: Passed to ``Dictionary.filter_extremes``; tokens appearing
            in more than this fraction of reviews are dropped.
        passes: Number of training passes handed to ``LdaModel``.
        random_state: Seed handed to ``LdaModel``.

    Returns:
        tuple: ``(lda_model, corpus, dictionary)`` where ``corpus`` is the
        bag-of-words form of ``reviews`` and ``dictionary`` is the filtered
        gensim dictionary both were built from.

    Raises:
        ValueError: If no token survives the frequency filtering, so there
            is nothing to train on.
    """
    # Create dictionary
    dictionary = corpora.Dictionary(reviews)

    # Filter out extreme values
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)

    # Fail with an actionable message instead of a generic training error
    # when the thresholds removed every token (common for small datasets).
    if len(dictionary) == 0:
        raise ValueError(
            "No tokens left after filter_extremes("
            f"no_below={no_below}, no_above={no_above}); "
            "relax the thresholds or provide more reviews"
        )

    # Create document-term matrix
    corpus = [dictionary.doc2bow(review) for review in reviews]

    # Train LDA model
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=random_state,
        passes=passes,
        alpha='auto',
        per_word_topics=True
    )

    return lda_model, corpus, dictionary
def display_topics(lda_model, num_topics, top_words=10):
    """Print the top words of topics ``0 .. num_topics - 1``.

    For each topic a 1-based "Topic N:" header is printed, followed by the
    words returned by ``lda_model.show_topic`` joined with ", ", followed by
    a blank line.

    Args:
        lda_model: Object exposing ``show_topic(topic_id, topn)`` that
            returns ``(word, probability)`` pairs.
        num_topics: How many topics to print, starting from topic id 0.
        top_words: How many words to request per topic.
    """
    for label, topic_index in enumerate(range(num_topics), start=1):
        print(f"Topic {label}:")
        weighted_terms = lda_model.show_topic(topic_index, top_words)
        term_line = ", ".join(term for term, _weight in weighted_terms)
        print(term_line)
        print()
# Process the reviews for each unique app: train one LDA model per app with a
# size-based topic count, print its topics and report its c_v coherence.

# Imported once, before the loop, instead of being re-executed on every
# iteration; this also binds the name even when every app is skipped.
from gensim.models.coherencemodel import CoherenceModel

unique_apps = df_combined['app_name'].unique()
# Names go through str() so a non-string entry (e.g. NaN) cannot make join() fail
print(f"Found {len(unique_apps)} unique apps: {', '.join(map(str, unique_apps))}")

for app in unique_apps:
    print(f"\n\n{'='*50}")
    print(f"Processing reviews for: {app}")
    print(f"{'='*50}")

    # Filter data for this app
    app_data = df_combined[df_combined['app_name'] == app]

    # Preprocess reviews
    reviews = app_data['review'].apply(preprocess_text).tolist()

    # Remove reviews that ended up with no tokens
    reviews = [r for r in reviews if len(r) > 0]

    print(f"Number of reviews after preprocessing: {len(reviews)}")

    if len(reviews) < 10:
        print(f"Too few reviews for {app} after preprocessing. Skipping...")
        continue

    # One topic per 100 reviews, clamped to the range 2..5
    num_topics = min(5, max(2, len(reviews) // 100))

    # Perform LDA
    print(f"Training LDA model with {num_topics} topics...")
    lda_model, corpus, dictionary = perform_lda(reviews, num_topics)

    # Display results
    print(f"\nTop words for each topic in {app}:")
    display_topics(lda_model, num_topics)

    # Calculate coherence score
    coherence_model = CoherenceModel(model=lda_model, texts=reviews, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    print(f"Coherence Score: {coherence_score:.4f}")
def find_optimal_topics(reviews, start=2, limit=10, step=1):
    """Train one LDA model per candidate topic count and pick the most coherent.

    Candidate topic counts are ``range(start, limit, step)``, so ``limit``
    itself is not evaluated.

    Args:
        reviews: List of token lists, one per review.
        start: First topic count to try.
        limit: Exclusive upper bound of the topic counts to try.
        step: Increment between candidate topic counts.

    Returns:
        tuple: ``(optimal_model, optimal_topics, best_coherence,
        coherence_values)`` where ``coherence_values`` holds the c_v score of
        every candidate in range order. On ties the first (smallest) topic
        count with the best score wins.

    Raises:
        ValueError: If ``range(start, limit, step)`` contains no topic counts.
    """
    candidate_counts = range(start, limit, step)
    if not candidate_counts:
        # Previously this surfaced later as "max() arg is an empty sequence".
        raise ValueError(
            f"No topic counts to evaluate: range({start}, {limit}, {step}) is empty"
        )

    # Imported locally so the function does not depend on an import that is
    # executed elsewhere in the file.
    from gensim.models.coherencemodel import CoherenceModel

    # The dictionary and bag-of-words corpus do not depend on the topic count,
    # so build them once instead of once per candidate.
    dictionary = corpora.Dictionary(reviews)
    dictionary.filter_extremes(no_below=5, no_above=0.7)
    corpus = [dictionary.doc2bow(review) for review in reviews]

    coherence_values = []
    model_list = []

    for num_topics in candidate_counts:
        lda_model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=42,
            passes=10,
            alpha='auto'
        )
        model_list.append(lda_model)

        coherence_model = CoherenceModel(model=lda_model, texts=reviews, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherence_model.get_coherence())

    # Return the model with the highest coherence score
    best_coherence = max(coherence_values)
    best_index = coherence_values.index(best_coherence)

    return model_list[best_index], candidate_counts[best_index], best_coherence, coherence_values
# For each app: search for the topic count with the best coherence, plot the
# coherence curve, print the best model's topics, retrain a final model with
# that topic count, and finally summarise the best result per app.
unique_apps = df_combined['app_name'].unique()
# Names go through str() so a non-string entry (e.g. NaN) cannot make join() fail
print(f"Found {len(unique_apps)} unique apps: {', '.join(map(str, unique_apps))}")

# Set up a figure for all coherence plots (one subplot row per app)
plt.figure(figsize=(15, 5*len(unique_apps)))

# Best model per app. Filled during the search below so the expensive sweep
# over topic counts runs once per app instead of being repeated afterwards.
optimal_models = {}

# Process each app
for i, app in enumerate(unique_apps):
    print(f"\n\n{'='*50}")
    print(f"Processing reviews for: {app}")
    print(f"{'='*50}")

    # Filter data for this app
    app_data = df_combined[df_combined['app_name'] == app]

    # Preprocess reviews
    app_reviews = app_data['review'].apply(preprocess_text).tolist()

    # Remove reviews that ended up with no tokens
    app_reviews = [r for r in app_reviews if len(r) > 0]

    print(f"Number of reviews after preprocessing: {len(app_reviews)}")

    if len(app_reviews) < 10:
        print(f"Too few reviews for {app} after preprocessing. Skipping...")
        continue

    # Find optimal number of topics
    start_topics = 2
    # Exclusive upper bound: one per 50 reviews, clamped to the range 10..20
    limit_topics = min(20, max(10, len(app_reviews) // 50))
    step = 1

    # The upper bound is exclusive, so the last count searched is limit_topics - 1
    print(f"Finding optimal number of topics between {start_topics} and {limit_topics - 1}...")
    optimal_model, optimal_topics, best_coherence, coherence_values = find_optimal_topics(
        app_reviews, start=start_topics, limit=limit_topics, step=step
    )

    # Store the model
    optimal_models[app] = {
        'model': optimal_model,
        'num_topics': optimal_topics,
        'coherence': best_coherence
    }

    print(f"Optimal number of topics for {app}: {optimal_topics} with coherence score {best_coherence:.4f}")

    # Plot coherence scores for this app
    plt.subplot(len(unique_apps), 1, i+1)
    plt.plot(range(start_topics, limit_topics, step), coherence_values, marker='o')
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence Score")
    plt.title(f"Coherence Scores for {app}")
    plt.grid(True)

    # Display the optimal topics
    print(f"\nTop words for each topic in {app} (Optimal model with {optimal_topics} topics):")
    display_topics(optimal_model, optimal_topics)

    # Prepare corpus and dictionary for this model (only consumed by the
    # optional interactive visualisation below)
    dictionary = corpora.Dictionary(app_reviews)
    dictionary.filter_extremes(no_below=5, no_above=0.7)
    corpus = [dictionary.doc2bow(review) for review in app_reviews]

    # Uncomment to visualize topics interactively (requires notebook environment)
    # vis = pyLDAvis.gensim_models.prepare(optimal_model, corpus, dictionary)
    # pyLDAvis.display(vis)

    # For demonstration, let's also run the model with optimal topics
    print(f"\nRunning final LDA model with optimal {optimal_topics} topics for {app}...")
    final_lda_model, final_corpus, final_dictionary = perform_lda(app_reviews, optimal_topics)

    # Calculate final coherence score
    final_coherence_model = CoherenceModel(
        model=final_lda_model, texts=app_reviews, dictionary=final_dictionary, coherence='c_v'
    )
    final_coherence_score = final_coherence_model.get_coherence()
    print(f"Final Coherence Score: {final_coherence_score:.4f}")

plt.tight_layout()
plt.show()

# Report the stored models after the plot, in the order the apps were processed
for app, model_info in optimal_models.items():
    print(
        f"Stored optimal model for {app} with {model_info['num_topics']} topics "
        f"(coherence: {model_info['coherence']:.4f})"
    )

# Create a summary dataframe of optimal topics and coherence scores
summary_data = [
    {
        'App Name': app,
        'Optimal Topics': model_info['num_topics'],
        'Coherence Score': model_info['coherence']
    }
    for app, model_info in optimal_models.items()
]

summary_df = pd.DataFrame(summary_data)
print("\nOptimal Topic Summary:")
print(summary_df)