In [23]:
# notebooks/topic_modeling.ipynb
import sys
import os

import pandas as pd
scripts_path = os.path.abspath('../scripts')
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

from topicm import CountVectorizer, LatentDirichletAllocation


# Load the raw reviews CSV.
file_path = "../data/raw_reviews.csv"  # Adjust path if needed
df = pd.read_csv(file_path)

# Step 1: Preprocess reviews (lowercase and strip punctuation).
# Cleaning is done inline: this notebook imports only CountVectorizer and
# LatentDirichletAllocation from `topicm`, so no `TopicModeler` name exists on
# a fresh kernel and calling it would raise NameError.
# Missing reviews are turned into empty strings first; otherwise NaN passes
# through the `.str` accessors unchanged and ends up in the list handed to the
# vectorizer (scikit-learn's CountVectorizer rejects NaN documents -- assuming
# `topicm` re-exports the scikit-learn classes, TODO confirm).
df["cleaned_review"] = (
    df["review_text"]
    .fillna("")
    .astype(str)
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)

# Step 2: Vectorize cleaned reviews
# NOTE(review): stop_words='english' only filters English words; the saved
# topic output contains Amharic tokens, which are presumably left unfiltered.
vectorizer = CountVectorizer(stop_words='english')
doc_term_matrix = vectorizer.fit_transform(df["cleaned_review"].tolist())

# Step 3: Apply LDA for topic modeling
# NOTE(review): the saved output of this cell lists only 13 topics, so it looks
# like it came from a run with a different topic count -- TODO confirm which
# value is intended before relying on the results.
N_TOPICS = 400
RANDOM_STATE = 42  # fixed seed so repeated runs produce the same topics
lda_model = LatentDirichletAllocation(n_components=N_TOPICS, random_state=RANDOM_STATE)
lda_model.fit(doc_term_matrix)

# Step 4: Get topic keywords
def get_topic_keywords(lda_model, vectorizer, n_top_words=10):
    """Return the highest-weighted vocabulary terms for every topic.

    Args:
        lda_model: Fitted model exposing ``components_``; each row is read as
            the per-term weights of one topic.
        vectorizer: Fitted vectorizer whose ``get_feature_names_out()`` maps
            column indices to terms.
        n_top_words: Number of terms to keep per topic.

    Returns:
        A list with one entry per row of ``components_``. Each entry is a list
        of terms ordered from highest to lowest weight.
    """
    vocabulary = vectorizer.get_feature_names_out()

    def top_terms(weights):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1]
        return [vocabulary[idx] for idx in ranked[:n_top_words]]

    return [top_terms(weights) for weights in lda_model.components_]

topics = get_topic_keywords(lda_model, vectorizer, n_top_words=10)

# Display topics (labels are 1-based for readability)
for topic_number, keywords in enumerate(topics, start=1):
    print(f"Topic #{topic_number}: {', '.join(keywords)}")

# Step 5: Assign dominant topic to each review
# argmax returns a 0-based column index, so a stored value k corresponds to
# "Topic #{k + 1}" in the printout above.
df["dominant_topic"] = lda_model.transform(doc_term_matrix).argmax(axis=1)

# Save to file
output_path = "../data/outputs/topic_modeled_reviews.csv"
# Create the output directory first; to_csv does not create missing parents.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print(f"✅ Topic modeling results saved to: {output_path}")


Topic #1: 05, ፒን, ፍጥነቱ, ፋይናንስ, ፈጣን, ፈልገ, ጽፈን, ጭራሽ, ጥሩ, absurd
Topic #2: bank, app, transactions, ussd, terrible, experience, embarrassing, time, transfer, situations
Topic #3: time, service, experience, transactions, longer, better, takes, response, app, reached
Topic #4: ግን, በጣም, seamlessly, እላፊ, shop, አይቀንስም, ብር, pay, ነው, ይላል
Topic #5: app, hate, doesnt, a32, abdu, ገንዘብ, 05, ፒን, ፍጥነቱ, added
Topic #6: 05, ፒን, ፍጥነቱ, ፋይናንስ, ፈጣን, ፈልገ, ጽፈን, ጭራሽ, ጥሩ, absurd
Topic #7: developer, sure, sorry, sense, mode, stupid, makes, hack, doesnt, does
Topic #8: 05, ፒን, ፍጥነቱ, ፋይናንስ, ፈጣን, ፈልገ, ጽፈን, ጭራሽ, ጥሩ, absurd
Topic #9: 05, ፒን, ፍጥነቱ, ፋይናንስ, ፈጣን, ፈልገ, ጽፈን, ጭራሽ, ጥሩ, absurd
Topic #10: happy, latest, app, super, superup, inclussive, genetu, dashen, assefa, money
Topic #11: support, totally, app, absolutely, love, let, foundation, fantastic, enjoying, chatting
Topic #12: አይሰራም, ለመላክ, ወደ, bank, ነበር, ይመስለኛል, ሞባይል, የሚለዉ, መጠየቅ, ዉስጥ
Topic #13: smooth, app, works, recently, user, work, good, friendly, fast, expri

In [1]:
# Detailed Report: Topic Modeling Pipeline Execution (topicm.ipynb)
#
# The report below is a hand-written, static string: it is not an f-string and
# reads no variables from the pipeline cell, so nothing in it is computed.
# NOTE(review): figures and steps quoted in the text (topic count, preprocessing
# calls, file paths) must be kept in sync with the pipeline cell by hand --
# verify them against the code before sharing the notebook.

report = """
## Topic Modeling Pipeline Report

### 1. Data Loading
- Loaded raw review data from '../data/raw_reviews.csv' into a pandas DataFrame.

### 2. Preprocessing
- Applied custom preprocessing using `modeler.preprocess_reviews` on the 'review_text' column.
- Additionally, performed simple text cleaning: converted to lowercase and removed punctuation, storing the result in a new column 'cleaned_review'.

### 3. Vectorization
- Used `CountVectorizer` (with English stop words) to convert the cleaned reviews into a document-term matrix.

### 4. Topic Modeling (LDA)
- Applied `LatentDirichletAllocation` with 400 topics and a fixed random seed for reproducibility.
- Fitted the LDA model to the document-term matrix.

### 5. Topic Extraction
- Extracted the top 10 keywords for each topic using a custom `get_topic_keywords` function.
- Printed the keywords for each topic for interpretability.

### 6. Topic Assignment
- Assigned the dominant topic to each review by finding the topic with the highest probability for each document.
- Stored the dominant topic in a new column 'dominant_topic'.

### 7. Output
- Saved the resulting DataFrame (with topic assignments) to '../data/outputs/topic_modeled_reviews.csv'.

### 8. CI/CD Context
- All steps are automated and reproducible, suitable for integration into a CI/CD pipeline for continuous topic modeling on new review data.

✅ **Task completed successfully. Results saved to output file.**
"""

# Emit the Markdown source as plain text on stdout.
print(report)


## Topic Modeling Pipeline Report

### 1. Data Loading
- Loaded raw review data from '../data/raw_reviews.csv' into a pandas DataFrame.

### 2. Preprocessing
- Applied custom preprocessing using `modeler.preprocess_reviews` on the 'review_text' column.
- Additionally, performed simple text cleaning: converted to lowercase and removed punctuation, storing the result in a new column 'cleaned_review'.

### 3. Vectorization
- Used `CountVectorizer` (with English stop words) to convert the cleaned reviews into a document-term matrix.

### 4. Topic Modeling (LDA)
- Applied `LatentDirichletAllocation` with 400 topics and a fixed random seed for reproducibility.
- Fitted the LDA model to the document-term matrix.

### 5. Topic Extraction
- Extracted the top 10 keywords for each topic using a custom `get_topic_keywords` function.
- Printed the keywords for each topic for interpretability.

### 6. Topic Assignment
- Assigned the dominant topic to each review by finding the topic with the highes

Classes available in topicm.py: ['CountVectorizer', 'LatentDirichletAllocation']
