# 01 - Scrape Google Play Reviews

Scrape Google Play reviews for three Indonesian telco apps:
- **MyTelkomsel** (`com.telkomsel.telkomselcm`)
- **myXL** (`com.apps.MyXL`)
- **myIM3** (`com.pure.indosat.care`)

**Parameters:** English-language reviews (`lang="en"`), Indonesian store (`country="id"`), at most 10,000 reviews per app, posted within the last 90 days (roughly 3 months).

In [None]:
import pandas as pd
from google_play_scraper import Sort, reviews
from tqdm import tqdm
from datetime import datetime, timedelta
import os
import time

In [None]:
# App configurations: display name -> Google Play package id
APPS = {
    "MyTelkomsel": "com.telkomsel.telkomselcm",
    "myXL": "com.apps.MyXL",
    "myIM3": "com.pure.indosat.care",
}

# Scraping parameters
LANG = "en"
COUNTRY = "id"
MAX_REVIEWS = 10000  # upper bound on reviews kept per app
BATCH_SIZE = 200  # reviews requested per call
LOOKBACK_DAYS = 90  # Last 3 months

# Read the clock once so the cutoff and the file tag describe the same
# instant, even if this cell happens to run across midnight.
_RUN_STARTED = datetime.now()

# Naive local timestamp; reviews older than this are discarded.
DATE_CUTOFF = _RUN_STARTED - timedelta(days=LOOKBACK_DAYS)

# Version/date tag for output files
RUN_DATE = _RUN_STARTED.strftime("%Y%m%d")

# Output directory (created if missing; relative to the working directory)
OUTPUT_DIR = "data/raw"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Date cutoff: {DATE_CUTOFF.strftime('%Y-%m-%d')}")
print(f"Max reviews per app: {MAX_REVIEWS}")
print(f"Output file tag: {RUN_DATE}")

In [None]:
def scrape_app_reviews(app_name, app_id):
    """Scrape recent reviews for a single app, newest first.

    Pages through ``reviews(...)`` sorted by ``Sort.NEWEST`` and stops at the
    first of these conditions:

    * ``MAX_REVIEWS`` reviews have been kept,
    * a review older than ``DATE_CUTOFF`` is seen,
    * the store returns an empty page,
    * a request raises (best effort: whatever was collected is returned).

    Args:
        app_name: Display name used in the progress bar and log messages.
        app_id: Google Play package id passed to ``reviews``.

    Returns:
        A list of review dicts as produced by ``reviews``. Every item has
        ``review["at"] >= DATE_CUTOFF`` and the list never holds more than
        ``MAX_REVIEWS`` items.
    """
    all_reviews = []
    token = None
    reached_cutoff = False

    # The context manager closes the bar on every exit path.
    with tqdm(total=MAX_REVIEWS, desc=f"Scraping {app_name}") as pbar:
        while len(all_reviews) < MAX_REVIEWS:
            try:
                batch, token = reviews(
                    app_id,
                    lang=LANG,
                    country=COUNTRY,
                    sort=Sort.NEWEST,
                    count=BATCH_SIZE,
                    continuation_token=token,
                )
            except Exception as e:
                # Deliberate best effort: keep what we have instead of
                # losing the whole run to one failed request.
                print(f"\nError fetching {app_name}: {e}")
                break

            if not batch:
                print(f"\nNo more reviews for {app_name}.")
                break

            # Filter by date, enforcing the cap per review so a page that
            # crosses MAX_REVIEWS cannot overshoot it.
            kept = 0
            for review in batch:
                if len(all_reviews) >= MAX_REVIEWS:
                    break
                if review["at"] < DATE_CUTOFF:
                    # Reviews are sorted newest first, so we can stop
                    reached_cutoff = True
                    break
                all_reviews.append(review)
                kept += 1

            if reached_cutoff:
                pbar.update(pbar.total - pbar.n)  # Fill progress bar
                break

            pbar.update(kept)
            if len(all_reviews) < MAX_REVIEWS:
                time.sleep(1)  # Rate limiting (skipped after the last page)

    if reached_cutoff:
        print(f"Reached date cutoff for {app_name}. Got {len(all_reviews)} reviews.")
    else:
        print(f"Finished {app_name}. Got {len(all_reviews)} reviews.")
    return all_reviews

In [None]:
# Scrape every configured app and keep the results keyed by display name.
all_data = {}

for app_name, app_id in APPS.items():
    # Banner: blank line, rule, title, rule.
    print(f"\n{'='*50}\nScraping: {app_name} ({app_id})\n{'='*50}")

    all_data[app_name] = scrape_app_reviews(app_name, app_id)
    app_reviews = all_data[app_name]

    print(f"Total reviews collected: {len(app_reviews)}")

In [None]:
# Convert to DataFrames and save

# Fixed output schema. Every CSV gets exactly these columns, in this order,
# whether or not the scraper returned them for a given app.
columns = [
    "reviewId", "userName", "content", "score", "thumbsUpCount",
    "reviewCreatedVersion", "at", "replyContent", "repliedAt", "app_name"
]

combined_dfs = []

for app_name, app_reviews in all_data.items():
    df = pd.DataFrame(app_reviews)
    df["app_name"] = app_name

    # reindex (instead of keeping only the columns that happen to exist) so
    # an app with zero reviews still yields a frame with `reviewId`, `at`,
    # `score`, ... and the later de-duplication and summary do not KeyError.
    df = df.reindex(columns=columns)

    # Save per-app CSV with date tag
    filename = f"{OUTPUT_DIR}/{app_name}_reviews_{RUN_DATE}.csv"
    df.to_csv(filename, index=False)
    print(f"Saved {len(df)} reviews to {filename}")

    combined_dfs.append(df)

# Save combined CSV. Empty frames are left out of the concat; if nothing was
# scraped at all, fall back to an empty frame with the same schema.
non_empty_dfs = [frame for frame in combined_dfs if not frame.empty]
if non_empty_dfs:
    df_all = pd.concat(non_empty_dfs, ignore_index=True)
else:
    df_all = pd.DataFrame(columns=columns)

# Remove duplicates by reviewId
before = len(df_all)
df_all = df_all.drop_duplicates(subset="reviewId")
after = len(df_all)
if before != after:
    print(f"Removed {before - after} duplicate reviews.")

combined_file = f"{OUTPUT_DIR}/all_reviews_{RUN_DATE}.csv"
df_all.to_csv(combined_file, index=False)
print(f"\nSaved combined {len(df_all)} reviews to {combined_file}")

In [None]:
# Summary: per-app totals, overall total, date span and rating histogram.
print("\n" + "="*50)
print("SCRAPING SUMMARY")
print("="*50)

# Count rows per app once, then look each configured app up (0 if absent).
reviews_per_app = df_all["app_name"].value_counts()
for app_name in APPS:
    count = reviews_per_app.get(app_name, 0)
    print(f"{app_name}: {count} reviews")

total_reviews = len(df_all)
print(f"\nTotal: {total_reviews} reviews")

earliest = df_all['at'].min()
latest = df_all['at'].max()
print(f"Date range: {earliest} to {latest}")

score_histogram = df_all['score'].value_counts().sort_index()
print(f"Score distribution:\n{score_histogram}")