In [1]:
import pandas as pd 
import numpy as numpy
import matplotlib.pyplot as plt 
import seaborn as sns

import datetime
import logging

import pathlib as Path
import os
import sys

In [7]:
# Make the project's `src` package importable regardless of which directory
# the notebook kernel was started in: walk upward from the current working
# directory until a folder containing `src/` is found and put it on sys.path.
# NOTE(review): assumes `src/` lives in the working directory or one of its
# ancestors — confirm against the repository layout.
_search_dir = os.getcwd()
while True:
    if os.path.isdir(os.path.join(_search_dir, 'src')):
        if _search_dir not in sys.path:
            sys.path.insert(0, _search_dir)
        break
    _parent_dir = os.path.dirname(_search_dir)
    if _parent_dir == _search_dir:
        # Reached the filesystem root without finding `src/`; let the import
        # below raise its normal ModuleNotFoundError.
        break
    _search_dir = _parent_dir

from src.scrape_reviews import scrape_app_reviews
from src.clean_reviews import load_raw_data, remove_duplicates, preprocess_reviews

ModuleNotFoundError: No module named 'src'

"""
Bank App Reviews Collection and Analysis

This script collects and analyzes reviews from three Ethiopian banking apps:
- Commercial Bank of Ethiopia (CBE)
- Bank of Abyssinia (BOA)
- Dashen Bank

It requests 400 reviews per bank, cleans and de-duplicates them, and then plots and summarizes the results.
"""

In [None]:
# Configure logging: INFO level, timestamped "time - level - message" records.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Set plot style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*";
# the old bare "seaborn" name was deprecated and no longer resolves on newer
# releases. Pick whichever name this installation actually provides so the
# cell runs on both old and new Matplotlib versions.
_SEABORN_STYLE_NAMES = ('seaborn-v0_8', 'seaborn')
_seaborn_style = next(
    (name for name in _SEABORN_STYLE_NAMES if name in plt.style.available),
    None,
)
if _seaborn_style is not None:
    plt.style.use(_seaborn_style)
else:
    logger.warning(
        "No seaborn style sheet found in Matplotlib; keeping the default style."
    )
sns.set_palette('husl')

In [None]:
# Target banks: short bank label -> app ID handed to `scrape_app_reviews`.
# NOTE(review): these IDs are not validated anywhere in this notebook —
# confirm they match the apps' actual store listings.
BANK_APPS = dict(
    CBE='com.cbe.mobilebanking',
    BOA='com.boa.mobilebanking',
    Dashen='com.dashenbank.mobilebanking',
)

def collect_reviews(bank_apps=None, count=400):
    """Collect reviews for all target banks, skipping any bank whose scrape fails.

    Args:
        bank_apps: Optional mapping of bank label -> app ID. When omitted,
            the module-level ``BANK_APPS`` mapping is used.
        count: Number of reviews requested per bank; forwarded to
            ``scrape_app_reviews`` as its ``count`` argument. Defaults to 400.

    Returns:
        list: The reviews returned by ``scrape_app_reviews`` for every bank
        that succeeded, concatenated in the iteration order of ``bank_apps``.
        The list is empty when every bank failed.
    """
    if bank_apps is None:
        bank_apps = BANK_APPS

    all_reviews = []

    for bank_name, app_id in bank_apps.items():
        try:
            print(f"\nCollecting reviews for {bank_name}...")
            reviews = scrape_app_reviews(app_id, bank_name, count=count)
            all_reviews.extend(reviews)
            print(f"✓ Collected {len(reviews)} reviews for {bank_name}")
        except Exception as e:
            # Deliberately best-effort: a failure for one bank is reported
            # and must not prevent the remaining banks from being scraped.
            print(f"✗ Failed to collect reviews for {bank_name}: {e}")

    return all_reviews

In [None]:

def analyze_and_visualize(df):
    """Plot review volume, rating spread and monthly review counts per bank.

    Three figures are shown, in order: a count of reviews per bank, a box
    plot of ratings per bank, and one line per bank of reviews per month.

    Args:
        df: DataFrame with at least the columns ``bank``, ``rating`` and
            ``date``. ``date`` may hold datetimes or anything
            ``pd.to_datetime`` can parse. The frame is NOT modified.

    Returns:
        None. The figures are rendered via ``plt.show()``.
    """
    # Reviews per bank
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df, x='bank')
    plt.title('Number of Reviews per Bank')
    plt.xlabel('Bank')
    plt.ylabel('Number of Reviews')
    plt.xticks(rotation=45)
    plt.show()

    # Rating distribution
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=df, x='bank', y='rating')
    plt.title('Rating Distribution by Bank')
    plt.xlabel('Bank')
    plt.ylabel('Rating')
    plt.xticks(rotation=45)
    plt.show()

    # Reviews over time.
    # Work on a private copy: writing the parsed dates / month column back
    # into `df` would silently change the caller's data (and anything the
    # caller later saves or prints from it).
    timeline = df[['bank', 'date']].copy()
    timeline['date'] = pd.to_datetime(timeline['date'])
    timeline['month'] = timeline['date'].dt.to_period('M')

    plt.figure(figsize=(15, 6))
    for bank in timeline['bank'].unique():
        monthly_counts = timeline.loc[timeline['bank'] == bank].groupby('month').size()
        # Plot against real timestamps rather than month strings: string
        # labels form a categorical axis ordered by first appearance, which
        # scrambles chronology when banks cover different sets of months.
        plt.plot(
            monthly_counts.index.to_timestamp(),
            monthly_counts.values,
            label=bank,
            marker='o',
        )

    plt.title('Reviews Over Time')
    plt.xlabel('Month')
    plt.ylabel('Number of Reviews')
    plt.xticks(rotation=45)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:

def main():
    """Run the full pipeline: scrape, clean, plot, save to CSV and summarize.

    Steps:
        1. Print the target banks and collect their reviews.
        2. Build a DataFrame with columns ``bank``, ``review``, ``rating``,
           ``date`` (taken from the ``app_name``, ``content``, ``score`` and
           ``at`` fields of each review) plus a constant ``source`` column.
        3. Run ``preprocess_reviews`` and ``remove_duplicates``.
        4. Normalize ``date`` to ``YYYY-MM-DD`` strings.
        5. Show the plots, write a timestamped CSV under ``data/processed``
           and print per-bank summary statistics.

    Returns:
        None. Returns early, without writing a file, when no reviews were
        collected.
    """
    # Imported locally on purpose: the notebook's top-level imports bind the
    # *modules* (`import datetime`, `import pathlib as Path`), so calling
    # `datetime.now()` or `Path(...)` through those names fails at runtime.
    from datetime import datetime
    from pathlib import Path

    print("Target Banks:")
    for bank, app_id in BANK_APPS.items():
        print(f"- {bank}: {app_id}")

    # Collect reviews
    all_reviews = collect_reviews()
    if not all_reviews:
        # Without any records the column selection below would raise KeyError.
        print("No reviews were collected; nothing to process.")
        return

    # Convert to DataFrame, keep the needed fields under friendlier names,
    # and tag every row with its source.
    df = (
        pd.DataFrame(all_reviews)[['app_name', 'content', 'score', 'at']]
        .rename(columns={
            'app_name': 'bank',
            'content': 'review',
            'score': 'rating',
            'at': 'date',
        })
        .assign(source='Google Play Store')
    )

    # Process the data
    df = preprocess_reviews(df)

    # Remove duplicates
    df = remove_duplicates(df)

    # Normalize dates to YYYY-MM-DD format
    df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')

    print(f"Total reviews after processing: {len(df)}")
    print(df.head())

    # Perform analysis and visualization on a copy so that plotting can never
    # alter the frame that is saved and summarized below.
    analyze_and_visualize(df.copy())

    # Create output directory
    output_dir = Path("data/processed")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save to CSV
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = output_dir / f"bank_reviews_{timestamp}.csv"
    df.to_csv(output_file, index=False)
    print(f"Saved {len(df)} reviews to {output_file}")

    # Print summary statistics
    print("\nReview Collection Summary:")
    print("-" * 30)
    for bank in df['bank'].unique():
        bank_reviews = df[df['bank'] == bank]
        print(f"{bank}:")
        print(f"  Reviews: {len(bank_reviews)}")
        print(f"  Average rating: {bank_reviews['rating'].mean():.2f}")
        # Dates are YYYY-MM-DD strings here, so min/max are chronological.
        print(f"  Date range: {bank_reviews['date'].min()} to {bank_reviews['date'].max()}")
        print("-" * 30)

# Entry point: run the full pipeline only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()