In [1]:
"""
Script for scraping banking app reviews from Google Play Store
and uploading additional CSV review files.
"""

import os
import pandas as pd
from google_play_scraper import Sort, reviews
from typing import List, Dict, Any


# Google Play package IDs of the banking apps to scrape.
# Keys are the short bank labels stored in each review's 'bank' field;
# values are the app IDs passed to the Play Store scraper.
BANK_APPS = {
    'CBE': 'com.combanketh.mobilebanking',
    'BOA': 'com.boa.boaMobileBanking',
    'Dashen': 'com.dashen.dashensuperapp'
}

def scrape_app_reviews(apps: Dict[str, str], reviews_per_app: int = 400) -> List[Dict[str, Any]]:
    """
    Scrape reviews from Google Play Store for given apps.

    Args:
        apps: Mapping of bank label -> Google Play app ID.
        reviews_per_app: Maximum number of reviews to keep for each app.
            Fewer are returned when the store has fewer to offer.

    Returns:
        A flat list with one dict per review, holding the keys
        'bank', 'review', 'rating', 'date' (ISO-8601 string) and 'source'.
    """
    batch_size = 200  # reviews requested per call to the scraper
    all_reviews = []

    for app_name, app_id in apps.items():
        print(f"[INFO] Scraping {app_name} ({app_id})")
        app_reviews = []
        next_token = None

        while len(app_reviews) < reviews_per_app:
            rvs, next_token = reviews(
                app_id,
                lang='en',
                country='us',
                sort=Sort.NEWEST,
                count=batch_size,
                continuation_token=next_token
            )

            # An empty batch means the store has nothing more to give.
            # The continuation token alone is not a reliable end signal
            # (it can stay truthy after the last page), so without this
            # check an app with fewer than `reviews_per_app` reviews would
            # keep the loop spinning forever.
            if not rvs:
                break

            app_reviews.extend(
                {
                    'bank': app_name,
                    'review': r['content'],
                    'rating': r['score'],
                    'date': r['at'].isoformat(),
                    'source': 'Google Play'
                }
                for r in rvs
            )

            if not next_token:
                break

        # The last batch may overshoot the quota; trim to the requested size.
        all_reviews.extend(app_reviews[:reviews_per_app])

    return all_reviews

def save_reviews_to_csv(reviews_data: List[Dict[str, Any]], filename: str = "raw_reviews.csv"):
    """
    Write the scraped review records to a CSV file (without an index column)
    and report how many rows were saved.
    """
    frame = pd.DataFrame(reviews_data)
    row_count = len(frame)
    frame.to_csv(filename, index=False)
    print(f"[INFO] Saved {row_count} scraped reviews to {filename}")

def load_additional_reviews(folder: str = "external_reviews") -> pd.DataFrame:
    """
    Load additional CSVs containing review data from a folder.
    Each CSV must include: review, rating, date, bank

    Args:
        folder: Directory to scan for ``.csv`` files (not recursive).

    Returns:
        All readable CSVs concatenated into one DataFrame, with a 'source'
        column set to 'External'. An empty DataFrame is returned when the
        folder is missing, is not a directory, or holds no readable CSVs.
    """
    print(f"[INFO] Looking for additional review files in '{folder}'")

    # isdir (rather than exists) so that a regular file with this name is
    # treated as "no folder" instead of crashing in os.listdir.
    if not os.path.isdir(folder):
        print("[INFO] No additional review folder found.")
        return pd.DataFrame()

    all_frames = []
    # os.listdir order is arbitrary; sort so the merged row order (and any
    # later keep-first de-duplication) is reproducible between runs.
    for file_name in sorted(os.listdir(folder)):
        if not file_name.lower().endswith(".csv"):
            continue

        path = os.path.join(folder, file_name)
        try:
            df = pd.read_csv(path)
        except Exception as e:
            # Deliberately best-effort: one unreadable file must not abort
            # the whole load, so report it and move on.
            print(f"[WARNING] Skipping {file_name}: {e}")
            continue

        df['source'] = 'External'
        all_frames.append(df)
        print(f"[INFO] Loaded {len(df)} reviews from {file_name}")

    return pd.concat(all_frames, ignore_index=True) if all_frames else pd.DataFrame()

def _to_date(value):
    """Parse one raw date value; return a ``datetime.date`` or None if unparseable."""
    parsed = pd.to_datetime(value, errors='coerce')
    return None if pd.isna(parsed) else parsed.date()


def clean_and_merge(scraped: pd.DataFrame, additional: pd.DataFrame, output: str = "all_reviews_cleaned.csv"):
    """
    Merge scraped and additional reviews, clean them, and write one CSV.

    Cleaning steps, in order:
      1. Normalize 'date' to a calendar date (unparseable values become missing).
      2. Drop rows missing any of: review, rating, date, bank.
      3. Drop rows whose review text duplicates an earlier row.

    Args:
        scraped: Reviews scraped from the store.
        additional: Reviews loaded from external CSVs; may be empty.
        output: Path of the CSV file to write.

    The output always has the columns review, rating, date, bank, source.
    If both inputs are empty, a header-only file is written.
    """
    print("[INFO] Cleaning and merging data...")
    required = ['review', 'rating', 'date', 'bank']
    columns = required + ['source']

    # Skip empty frames so a missing/empty input cannot disturb the column
    # set or dtypes of the other one.
    frames = [frame for frame in (scraped, additional) if not frame.empty]
    if frames:
        combined = pd.concat(frames, ignore_index=True)
    else:
        combined = pd.DataFrame(columns=columns)

    # Guarantees every expected column exists (filled with NaN when absent),
    # so the steps below never fail with KeyError.
    combined = combined.reindex(columns=columns)

    # Normalize date. Each value is parsed on its own because the two sources
    # can use different formats (ISO timestamps vs. plain dates); parsing the
    # whole column at once may coerce the minority format to NaT.
    combined['date'] = combined['date'].map(_to_date)

    # Drop incomplete rows BEFORE de-duplicating, so an incomplete first copy
    # cannot displace a complete later copy of the same review.
    combined = combined.dropna(subset=required)

    # NOTE(review): de-duplicating on text alone also collapses identical
    # short reviews (e.g. "good") left for different banks - confirm intended.
    combined = combined.drop_duplicates(subset='review')

    combined.to_csv(output, index=False)
    print(f"[INFO] Final cleaned dataset saved to {output} ({len(combined)} reviews)")

def main():
    """
    Run the full pipeline: scrape the configured apps, save the raw scrape,
    pick up any external review files, then clean and merge everything.
    """
    print("[INFO] Starting scraping and merge process...")

    # Scrape Google Play and keep an untouched copy of the raw result.
    raw_reviews = scrape_app_reviews(BANK_APPS, reviews_per_app=400)
    raw_frame = pd.DataFrame(raw_reviews)
    save_reviews_to_csv(raw_reviews, "scraped_reviews.csv")

    # External CSVs are optional; an empty frame comes back when none exist.
    extra_frame = load_additional_reviews()

    # Produce the final cleaned dataset.
    clean_and_merge(raw_frame, extra_frame)

    print("[INFO] All done.")

# Run the pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


[INFO] Starting scraping and merge process...
[INFO] Scraping CBE (com.combanketh.mobilebanking)
[INFO] Scraping BOA (com.boa.boaMobileBanking)
[INFO] Scraping Dashen (com.dashen.dashensuperapp)
[INFO] Saved 1200 scraped reviews to scraped_reviews.csv
[INFO] Looking for additional review files in 'external_reviews'
[INFO] No additional review folder found.
[INFO] Cleaning and merging data...
[INFO] Final cleaned dataset saved to all_reviews_cleaned.csv (988 reviews)
[INFO] All done.
