# Task 1: Data Preparation and Cleaning 

In [1]:
import sys
import os
import pandas as pd
from glob import glob

In [2]:
from pathlib import Path
from importlib import reload
# Put the parent of the working directory (the project root) at the front of
# sys.path so the local `src` package imported by later cells can be resolved.
project_root = Path("..").resolve()
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

## Web Scraping

In [3]:
# Import the scraper module and reload it so edits made to the file since the
# kernel started are picked up, THEN bind the class name.
# importlib.reload() does not rebind names that were obtained earlier with
# `from module import name`, so importing the class before the reload would
# leave PlayStoreScraper pointing at the stale, pre-reload class object.
import src.play_store_scraper
reload(src.play_store_scraper)
from src.play_store_scraper import PlayStoreScraper

<module 'src.play_store_scraper' from 'D:\\Research & Project\\10academy\\week 2\\challenge\\CX-analytics-for-fintech-apps\\src\\play_store_scraper.py'>

In [4]:
# Google Play app IDs mapped to the short bank label used for each app.
# Insertion order (CBE, Dashen, BOA) is the order the apps are processed in.
banks = dict(
    [
        ("com.combanketh.mobilebanking", "CBE"),
        ("com.dashen.dashensuperapp", "Dashen"),
        ("com.boa.boaMobileBanking", "BOA"),
    ]
)

In [5]:
# Scrape reviews for every configured app and save them to CSV, one bank at a time.
num_reviews = 1000  # number of reviews requested per app
for app_id in banks:
    bank_name = banks[app_id]
    print(f"Scraping for: {bank_name}")
    # A fresh scraper is created per app ID; it both fetches and persists the reviews.
    scraper = PlayStoreScraper(app_id)
    reviews = scraper.get_reviews(num_reviews=num_reviews)
    scraper.save_reviews_to_csv(reviews, bank_name)
    print(f"Done scraping for: {bank_name}\n")

Scraping for: CBE
Done scraping for: CBE

Scraping for: Dashen
Done scraping for: Dashen

Scraping for: BOA
Done scraping for: BOA



## Preprocessing Steps

#### Loading the Data

In [6]:
# In this task, we will prepare and clean the dataset for further analysis.
# First, load the data-loading module. Reload it before binding the class name:
# importlib.reload() does not rebind names obtained earlier through
# `from module import name`, so importing DataLoader before the reload would
# leave it pointing at the stale, pre-reload class object.
import src.data_loader
reload(src.data_loader)
from src.data_loader import DataLoader

<module 'src.data_loader' from 'D:\\Research & Project\\10academy\\week 2\\challenge\\CX-analytics-for-fintech-apps\\src\\data_loader.py'>

In [7]:
# Load every raw review CSV into a dictionary of DataFrames keyed by bank name.

# Path to the raw data folder, and a loader bound to it
raw_data_dir = '../data/raw'
loader = DataLoader(raw_data_dir)

# os.listdir() returns entries in arbitrary order. Sort the names so runs are
# reproducible and, when several timestamped exports exist for the same bank
# (e.g. "CBE_reviews_20250607_153045.csv"), the most recent one is processed
# last and is the one kept in the dictionary.
csv_files = sorted(f for f in os.listdir(raw_data_dir) if f.endswith('.csv'))

bank_reviews = {}
for filename in csv_files:
    try:
        df = loader.load_data(filename)
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining files.
        print(f"Failed to load {filename}: {e}")
    else:
        # Bank name is the part of the filename before "_reviews_"
        bank_name = filename.split('_reviews_')[0]
        if bank_name in bank_reviews:
            # Make the overwrite visible instead of silently dropping the older export.
            print(f"Replacing previously loaded {bank_name} data with {filename}.")
        bank_reviews[bank_name] = df
        print(f"Loaded {filename} with {len(df)} records.")

# Example: display the first few rows of CBE data
bank_reviews['CBE'].head(5)

Loaded BOA_reviews_20250608_014031.csv with 1000 records.
Loaded CBE_reviews_20250608_014013.csv with 1000 records.
Loaded Dashen_reviews_20250608_014014.csv with 448 records.


Unnamed: 0,review_text,rating,date,bank_name,source
0,"""Why don’t your ATMs support account-to-accoun...",4,2025-06-06,CBE,Google Play
1,what is this app problem???,1,2025-06-05,CBE,Google Play
2,the app is proactive and a good connections.,5,2025-06-05,CBE,Google Play
3,I cannot send to cbebirr app. through this app.,3,2025-06-05,CBE,Google Play
4,good,4,2025-06-05,CBE,Google Play


In [8]:
# Print summary statistics for each bank's raw reviews.
for bank_name in bank_reviews:
    df = bank_reviews[bank_name]
    # include='all' reports the non-numeric columns as well as the numeric ones
    stats = df.describe(include='all')
    print(f"\n{bank_name} Data Description:")
    print(stats)
    print(f"Total reviews: {len(df)}")
    print(f"Columns: {list(df.columns)}\n")


BOA Data Description:
       review_text       rating        date bank_name       source
count         1000  1000.000000        1000      1000         1000
unique         856          NaN         367         1            1
top           Good          NaN  2024-05-02       BOA  Google Play
freq            43          NaN          23      1000         1000
mean           NaN     3.020000         NaN       NaN          NaN
std            NaN     1.859855         NaN       NaN          NaN
min            NaN     1.000000         NaN       NaN          NaN
25%            NaN     1.000000         NaN       NaN          NaN
50%            NaN     3.000000         NaN       NaN          NaN
75%            NaN     5.000000         NaN       NaN          NaN
max            NaN     5.000000         NaN       NaN          NaN
Total reviews: 1000
Columns: ['review_text', 'rating', 'date', 'bank_name', 'source']


CBE Data Description:
       review_text       rating        date bank_name       sou

#### Removing Duplicates and Handling Missing Values

In [9]:
# Load the preprocessing module and reload it so the latest version of the file
# is used, THEN bind the class name.
# importlib.reload() does not rebind names obtained earlier through
# `from module import name`, so importing Preprocessor before the reload would
# leave it pointing at the stale, pre-reload class object.
import src.preprocessing
reload(src.preprocessing)
from src.preprocessing import Preprocessor

<module 'src.preprocessing' from 'D:\\Research & Project\\10academy\\week 2\\challenge\\CX-analytics-for-fintech-apps\\src\\preprocessing.py'>

In [10]:

# Output directory for cleaned data; create it (and any missing parents) if needed.
# The variable stays a plain string because later cells pass it to os.path.join.
processed_data_dir = '../data/processed'
Path(processed_data_dir).mkdir(parents=True, exist_ok=True)

In [12]:
# preprocess each bank's reviews
# store in single df 

!python -m spacy download en_core_web_sm # Ensure spaCy model is downloaded to remove non-English reviews

# To hold all preprocessed DataFrames
all_preprocessed_dfs = []
for bank_name, df in bank_reviews.items():
    print(f"Preprocessing data for {bank_name}...")
    preprocessor = Preprocessor(df)
    preprocessed_df = (
        preprocessor.remove_duplicates()
        .handle_missing_values()
        .remove_non_english_reviews()
        .get_processed_data()
    )

    

    # Save the preprocessed data
    output_file = os.path.join(processed_data_dir, f"{bank_name}_preprocessed.csv")
    preprocessed_df.to_csv(output_file, index=False)
    print(f"Saved preprocessed data for {bank_name} to {output_file}\n")
    
    # Append to the list of all preprocessed DataFrames
    all_preprocessed_dfs.append(preprocessed_df)
# Combine all preprocessed DataFrames into a single DataFrame
combined_df = pd.concat(all_preprocessed_dfs, ignore_index=True)
# Save the combined DataFrame
combined_output_file = os.path.join(processed_data_dir, "combined_preprocessed.csv")
combined_df.to_csv(combined_output_file, index=False)
print(f"Saved combined preprocessed data to {combined_output_file}\n")


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Invalid requirement: '#': Expected package name at the start of dependency specifier
    #
    ^


Preprocessing data for BOA...
Saved preprocessed data for BOA to ../data/processed\BOA_preprocessed.csv

Preprocessing data for CBE...
Saved preprocessed data for CBE to ../data/processed\CBE_preprocessed.csv

Preprocessing data for Dashen...
Saved preprocessed data for Dashen to ../data/processed\Dashen_preprocessed.csv

Saved combined preprocessed data to ../data/processed\combined_preprocessed.csv



In [13]:
# Show the combined, preprocessed reviews of all banks (the rendered output is truncated in the middle).
combined_df

Unnamed: 0,review_text,rating,date,bank_name,source
0,"Hello, I’m facing a problem with the BOA Mobil...",1,2025-06-03,BOA,Google Play
1,this is worest app 24/7 loading,1,2025-06-01,BOA,Google Play
2,This App is not interest for Android phone Ple...,1,2025-06-01,BOA,Google Play
3,BoA system is confartable,5,2025-06-01,BOA,Google Play
4,very nice Abyssinia bank is choice all,5,2025-05-31,BOA,Google Play
...,...,...,...,...,...
1425,App That makes Difference!,5,2025-01-14,Dashen,Google Play
1426,"Waw Great and innovated,user friendly, always ...",5,2025-01-13,Dashen,Google Play
1427,It's Best waww 🙏,5,2025-01-13,Dashen,Google Play
1428,Always one step ahead,5,2025-01-13,Dashen,Google Play


In [17]:
# Summarise the preprocessed reviews bank by bank, in the order the banks were defined.
for bank_name in banks.values():
    # Boolean mask selecting the rows that belong to the current bank
    is_current_bank = combined_df['bank_name'] == bank_name
    bank_df = combined_df.loc[is_current_bank]
    print(f"\n{bank_name} Preprocessed Data Description:")
    print(bank_df.describe(include='all'))
    print(f"Total reviews: {len(bank_df)}")
    print(f"Columns: {list(bank_df.columns)}\n")




CBE Preprocessed Data Description:
                            review_text      rating        date bank_name  \
count                               502  502.000000         502       502   
unique                              502         NaN         129         1   
top     wow . what i can say thank you.         NaN  2025-03-31       CBE   
freq                                  1         NaN          25       502   
mean                                NaN    3.735060         NaN       NaN   
std                                 NaN    1.624269         NaN       NaN   
min                                 NaN    1.000000         NaN       NaN   
25%                                 NaN    2.000000         NaN       NaN   
50%                                 NaN    5.000000         NaN       NaN   
75%                                 NaN    5.000000         NaN       NaN   
max                                 NaN    5.000000         NaN       NaN   

             source  
count           5