# Task 1: Data Preparation and Cleaning 

In [32]:
import sys
import os
import pandas as pd

In [33]:
from pathlib import Path
from importlib import reload
# Resolve the directory one level above the current working directory and put
# it at the front of sys.path (unless it is already listed) so that the
# `src` package can be imported by the cells below.
project_root = Path("..").resolve()
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

## Web Scraping

In [2]:
# Import the scraper module and reload it so that edits made to the source
# file since the last import are picked up without restarting the kernel.
import src.play_store_scraper
reload(src.play_store_scraper)
# Bind the class only AFTER the reload: a name obtained with
# `from ... import` before reload() keeps pointing at the old class object,
# so importing it first would leave the notebook using stale code.
from src.play_store_scraper import PlayStoreScraper

<module 'src.play_store_scraper' from 'D:\\Research & Project\\10academy\\week 2\\challenge\\CX-analytics-for-fintech-apps\\src\\play_store_scraper.py'>

In [3]:
# Map each app ID handed to the scraper to the short bank name that labels
# its reviews.
banks = {
    "com.combanketh.mobilebanking": "CBE",
    "com.dashen.dashensuperapp": "Dashen",
    "com.boa.boaMobileBanking": "BOA",
}

In [4]:
# For every configured app: build a scraper, fetch its reviews and pass them
# to the scraper's CSV writer together with the bank's short name.
REVIEWS_PER_APP = 1000  # or 4000 as needed
for app_id, bank_name in banks.items():
    print(f"Scraping for: {bank_name}")
    scraper = PlayStoreScraper(app_id)
    reviews = scraper.get_reviews(num_reviews=REVIEWS_PER_APP)
    scraper.save_reviews_to_csv(reviews, bank_name)
    print(f"Done scraping for: {bank_name}\n")

Scraping for: CBE
Done scraping for: CBE

Scraping for: Dashen
Done scraping for: Dashen

Scraping for: BOA
Done scraping for: BOA



## Preprocessing Steps

#### Loading the Data

In [34]:
# In this task, we will prepare and clean the dataset for further analysis.
# First, load the data-loading module and reload it so that edits made to the
# source file since the last import are picked up without a kernel restart.
import src.data_loader
reload(src.data_loader)
# Bind the class only AFTER the reload: a name obtained with
# `from ... import` before reload() keeps pointing at the old class object.
from src.data_loader import DataLoader

<module 'src.data_loader' from 'D:\\Research & Project\\10academy\\week 2\\challenge\\CX-analytics-for-fintech-apps\\src\\data_loader.py'>

In [35]:
# Load every raw review CSV into a dictionary of DataFrames keyed by bank name.
# Define path to your raw data folder
data_dir = '../data/raw'
loader = DataLoader(data_dir)
# List all CSV files in the raw data directory.
# os.listdir returns entries in arbitrary order, so sort them: loading becomes
# reproducible and, when several snapshots exist for the same bank, the
# lexicographically last filename wins (the newest one, assuming the
# YYYYMMDD_HHMMSS suffix seen in the scraped filenames).
csv_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.csv'))
# Dictionary of DataFrames: bank name -> reviews
bank_reviews = {}

# Load each CSV and store in the dictionary
for filename in csv_files:
    try:
        df = loader.load_data(filename)
        # Extract bank name from filename (e.g., "CBE_reviews_20250607_153045.csv")
        bank_name = filename.split('_reviews_')[0]
        if bank_name in bank_reviews:
            # Make the overwrite visible instead of silently dropping data.
            print(f"Replacing earlier {bank_name} data with {filename}.")
        bank_reviews[bank_name] = df
        print(f"Loaded {filename} with {len(df)} records.")
    except Exception as e:
        # Best-effort load: report the failing file and continue with the rest.
        print(f"Failed to load {filename}: {e}")

# Example: Display the first few rows of CBE data
bank_reviews['CBE'].head(5)

Loaded BOA_reviews_20250607_230525.csv with 1000 records.
Loaded CBE_reviews_20250607_230522.csv with 1000 records.
Loaded Dashen_reviews_20250607_230524.csv with 448 records.


Unnamed: 0,review_text,rating,date,bank_name,source
0,"""Why don’t your ATMs support account-to-accoun...",4,2025-06-06,CBE,Google Play
1,what is this app problem???,1,2025-06-05,CBE,Google Play
2,the app is proactive and a good connections.,5,2025-06-05,CBE,Google Play
3,I cannot send to cbebirr app. through this app.,3,2025-06-05,CBE,Google Play
4,good,4,2025-06-05,CBE,Google Play


In [36]:
# Print, for every loaded bank, its descriptive statistics followed by the
# number of reviews and the column names.
for bank_name, df in bank_reviews.items():
    print(
        f"\n{bank_name} Data Description:",
        df.describe(include='all'),  # include='all' to get stats for all columns
        f"Total reviews: {len(df)}",
        f"Columns: {list(df.columns)}\n",
        sep="\n",
    )


BOA Data Description:
       review_text       rating        date bank_name       source
count         1000  1000.000000        1000      1000         1000
unique         856          NaN         367         1            1
top           Good          NaN  2024-05-02       BOA  Google Play
freq            43          NaN          23      1000         1000
mean           NaN     3.020000         NaN       NaN          NaN
std            NaN     1.859855         NaN       NaN          NaN
min            NaN     1.000000         NaN       NaN          NaN
25%            NaN     1.000000         NaN       NaN          NaN
50%            NaN     3.000000         NaN       NaN          NaN
75%            NaN     5.000000         NaN       NaN          NaN
max            NaN     5.000000         NaN       NaN          NaN
Total reviews: 1000
Columns: ['review_text', 'rating', 'date', 'bank_name', 'source']


CBE Data Description:
       review_text       rating        date bank_name       sou

#### Removing Duplicates and Handling Missing Values

In [37]:
# Load the preprocessing module and reload it so that edits made to the
# source file since the last import are picked up without a kernel restart.
import src.preprocessing
reload(src.preprocessing)
# Bind the class only AFTER the reload: a name obtained with
# `from ... import` before reload() keeps pointing at the old class object.
from src.preprocessing import Preprocessor

<module 'src.preprocessing' from 'D:\\Research & Project\\10academy\\week 2\\challenge\\CX-analytics-for-fintech-apps\\src\\preprocessing.py'>