# 01 - Scrape Google Play Reviews

Scrape reviews for 3 Indonesian telco apps:
- **MyTelkomsel** (`com.telkomsel.telkomselcm`)
- **myXL** (`com.apps.MyXL`)
- **myIM3** (`com.pure.indosat.care`)

**Parameters:** English-language reviews (`lang="en"`) from the Indonesian store (`country="id"`), at most 10,000 reviews per app, limited to the last 90 days (about 3 months).

In [None]:
import pandas as pd
from google_play_scraper import Sort, reviews
from tqdm import tqdm
from datetime import datetime, timedelta
import os
import time
import re
import unicodedata
from langdetect import detect, LangDetectException

In [None]:
# Apps to scrape: display name -> Google Play package id
APPS = dict(
    MyTelkomsel="com.telkomsel.telkomselcm",
    myXL="com.apps.MyXL",
    myIM3="com.pure.indosat.care",
)

# Request settings passed to the scraper
LANG, COUNTRY = "en", "id"
MAX_REVIEWS = 10_000  # upper bound on reviews collected per app
BATCH_SIZE = 200  # reviews requested per call

# Reviews older than this (90 days before now) are not collected
DATE_CUTOFF = datetime.now() - timedelta(days=90)

# Date stamp embedded in every output file name
RUN_DATE = datetime.now().strftime("%Y%m%d")

# Destination for the scraped CSV files; created on first run
OUTPUT_DIR = "data/raw"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Echo the effective settings so each run is self-describing
print(f"Date cutoff: {DATE_CUTOFF.strftime('%Y-%m-%d')}")
print(f"Max reviews per app: {MAX_REVIEWS}")
print(f"Output file tag: {RUN_DATE}")

In [None]:
def clean_text(text):
    """Normalize a raw review into single-line text.

    Steps, in order: NFKD Unicode normalization, newlines/tabs replaced by
    a space, supplementary-plane characters (most emojis) removed together
    with the invisible joiners/selectors that accompany them, runs of three
    or more identical ``!``, ``?`` or ``.`` collapsed to one, repeated
    whitespace collapsed, and surrounding whitespace stripped.

    Args:
        text: Raw review content. Any value that is not a ``str``
            (``None``, NaN, numbers, containers) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when the input is missing.
    """
    # None and NaN are not str, so the type check alone covers missing
    # values. Testing the type first also avoids pd.isna() on array-likes,
    # whose element-wise result cannot be used in a boolean test.
    if not isinstance(text, str):
        return ""

    # Normalize unicode characters (compatibility decomposition)
    text = unicodedata.normalize("NFKD", text)

    # Replace newlines and tabs with space
    text = re.sub(r"[\n\r\t]+", " ", text)

    # Remove supplementary-plane characters (emojis etc.) plus zero-width
    # joiners and variation selectors. Without the latter, an emoji-only
    # review such as a ZWJ family sequence is reduced to invisible
    # characters and still counts as non-empty text.
    text = re.sub(r"[\U00010000-\U0010ffff\u200d\ufe00-\ufe0f]", "", text)

    # Remove excessive punctuation repetition (e.g., "!!!!!!" -> "!");
    # exactly two repeats are left as written
    text = re.sub(r"([!?.])\1{2,}", r"\1", text)

    # Collapse multiple spaces
    text = re.sub(r"\s{2,}", " ", text)

    return text.strip()


def detect_language(text):
    """Detect the language of a piece of text.

    Args:
        text: Text to classify. Non-string or blank input is reported as
            ``"unknown"`` without calling the detector.

    Returns:
        The language code returned by ``detect``, or ``"unknown"`` when the
        input is unusable or the detector raises ``LangDetectException``.
    """
    # Blank text would only make the detector raise; non-str input would
    # fail with an error the handler below does not catch.
    if not isinstance(text, str) or not text.strip():
        return "unknown"

    # langdetect samples randomly unless a seed is set, so the same short
    # review can get different labels on different runs. Pin the seed once,
    # without overriding a seed configured elsewhere.
    from langdetect import DetectorFactory

    if DetectorFactory.seed is None:
        DetectorFactory.seed = 0

    try:
        return detect(text)
    except LangDetectException:
        return "unknown"

In [None]:
def scrape_app_reviews(app_name, app_id):
    """Scrape the newest reviews of a single app, stopping at the date cutoff.

    Reviews are requested newest-first, one page per call, until one of the
    following happens: ``MAX_REVIEWS`` reviews have been collected, a review
    older than ``DATE_CUTOFF`` is encountered, an empty page is returned,
    or a request raises.

    Args:
        app_name: Display name, used in progress and status output.
        app_id: Google Play package id passed to the scraper.

    Returns:
        A list of review dicts as produced by ``reviews``, never longer
        than ``MAX_REVIEWS``. A failed request ends the scrape early and
        returns whatever was collected up to that point.
    """
    all_reviews = []
    token = None
    reached_cutoff = False

    # The context manager closes the bar on every exit path.
    with tqdm(total=MAX_REVIEWS, desc=f"Scraping {app_name}") as pbar:
        while len(all_reviews) < MAX_REVIEWS:
            try:
                batch, token = reviews(
                    app_id,
                    lang=LANG,
                    country=COUNTRY,
                    sort=Sort.NEWEST,
                    count=BATCH_SIZE,
                    continuation_token=token,
                )
            except Exception as e:
                # Deliberate best effort: report and keep the partial result.
                print(f"\nError fetching {app_name}: {e}")
                break

            if not batch:
                print(f"\nNo more reviews for {app_name}.")
                break

            kept = 0
            for review in batch:
                if review["at"] < DATE_CUTOFF:
                    # Reviews are sorted newest first, so we can stop
                    reached_cutoff = True
                    break
                all_reviews.append(review)
                kept += 1
                # Enforce the cap per review: a page can hold more reviews
                # than there is room left.
                if len(all_reviews) >= MAX_REVIEWS:
                    break

            if reached_cutoff:
                pbar.update(pbar.total - pbar.n)  # Fill progress bar
                break

            pbar.update(kept)
            if len(all_reviews) < MAX_REVIEWS:
                time.sleep(1)  # Rate limiting between requests

    if reached_cutoff:
        print(f"Reached date cutoff for {app_name}. Got {len(all_reviews)} reviews.")
    else:
        print(f"Finished {app_name}. Got {len(all_reviews)} reviews.")
    return all_reviews

In [None]:
# Run the scraper once per configured app and keep each raw result list
# under the app's display name.
all_data = {}
separator = "=" * 50

for app_name, app_id in APPS.items():
    print("\n" + separator)
    print(f"Scraping: {app_name} ({app_id})")
    print(separator)

    app_reviews = all_data[app_name] = scrape_app_reviews(app_name, app_id)

    print(f"Total reviews collected: {len(app_reviews)}")

In [None]:
# Convert to DataFrames, clean, and save
PROCESSED_DIR = "data/processed"
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Columns kept from the scraper output (plus the app label added below)
columns = [
    "reviewId", "userName", "content", "score", "thumbsUpCount",
    "reviewCreatedVersion", "at", "replyContent", "repliedAt", "app_name"
]

combined_dfs = []

for app_name, app_reviews in all_data.items():
    # An empty result (failed or fruitless scrape) produces a DataFrame
    # without a "content" column, which would make the dropna() below raise
    # KeyError and abort the remaining apps.
    if not app_reviews:
        print(f"No reviews collected for {app_name}; skipping.")
        continue

    df = pd.DataFrame(app_reviews)
    df["app_name"] = app_name

    # Select useful columns
    df = df[[c for c in columns if c in df.columns]]

    # Preprocessing: drop null/empty content.
    # .copy() makes the filtered frame independent so the column assignments
    # below do not write into a slice (SettingWithCopyWarning).
    df = df.dropna(subset=["content"])
    df = df[df["content"].str.strip() != ""].copy()

    # Clean text and drop reviews that are empty after cleaning
    df["content_clean"] = df["content"].apply(clean_text)
    df = df[df["content_clean"].str.len() > 0].copy()

    print(f"Detecting language for {app_name}...")
    df["language"] = df["content_clean"].apply(detect_language)

    # Save per-app CSV
    filename = f"{OUTPUT_DIR}/{app_name}_reviews_{RUN_DATE}.csv"
    df.to_csv(filename, index=False)
    print(f"Saved {len(df)} reviews to {filename}")

    combined_dfs.append(df)

# Combined DataFrame. pd.concat() raises on an empty list, so fall back to
# an empty frame that still has the columns later cells read.
if combined_dfs:
    df_all = pd.concat(combined_dfs, ignore_index=True)
else:
    df_all = pd.DataFrame(columns=columns + ["content_clean", "language"])

# Remove duplicates by reviewId
before = len(df_all)
df_all = df_all.drop_duplicates(subset="reviewId")
after = len(df_all)
if before != after:
    print(f"Removed {before - after} duplicate reviews.")

# Save combined raw (with cleaned columns)
combined_file = f"{OUTPUT_DIR}/all_reviews_{RUN_DATE}.csv"
df_all.to_csv(combined_file, index=False)
print(f"\nSaved combined {len(df_all)} reviews to {combined_file}")

# Save processed files (compatible with 03_sentiment_analysis.ipynb)
cleaned_file = f"{PROCESSED_DIR}/reviews_cleaned_{RUN_DATE}.csv"
df_all.to_csv(cleaned_file, index=False)
print(f"Saved {len(df_all)} cleaned reviews to {cleaned_file}")

# Save English-only subset
df_en = df_all[df_all["language"] == "en"]
en_file = f"{PROCESSED_DIR}/reviews_english_{RUN_DATE}.csv"
df_en.to_csv(en_file, index=False)
print(f"Saved {len(df_en)} English reviews to {en_file}")

In [None]:
# Summary of what was scraped, how much of it is English, and where the
# output files were written.
print("\n" + "="*50)
print("SCRAPING & PREPROCESSING SUMMARY")
print("="*50)
for app_name in APPS:
    total = len(df_all[df_all["app_name"] == app_name])
    en = len(df_en[df_en["app_name"] == app_name])
    print(f"{app_name}: {total} total, {en} English")

total_reviews = len(df_all)
english_reviews = len(df_en)
# Guard the percentage: with no surviving reviews the division would raise
# ZeroDivisionError and hide the rest of the summary.
english_pct = english_reviews / total_reviews * 100 if total_reviews else 0.0

print(f"\nTotal: {total_reviews} reviews")
print(f"English: {english_reviews} reviews ({english_pct:.1f}%)")
print(f"Date range: {df_all['at'].min()} to {df_all['at'].max()}")
print(f"\nScore distribution:\n{df_all['score'].value_counts().sort_index()}")
print(f"\nLanguage distribution (top 10):\n{df_all['language'].value_counts().head(10)}")
print("\nOutput files:")
print(f"  Raw:     {combined_file}")
print(f"  Cleaned: {cleaned_file}")
print(f"  English: {en_file}")