In [45]:
import os
import emoji
import re
import string
import spacy
import pandas as pd
from transformers import pipeline

In [2]:
# Get the absolute path to the data directory
abs_dir = os.path.abspath(os.path.dirname(os.getcwd()))

In [3]:
df = pd.read_csv(os.path.join(abs_dir, 'data/merged_dataset.csv'))

In [5]:
pd.set_option('display.max_colwidth', None)

df.sample(10)

Unnamed: 0,review_text,rating,upvote,date,bank_name,source
3459,"Very nice, but requires paid network.",3,3,2024-03-04,Commercial Bank of Ethiopia,Google Play
5401,"The app does not work well from abroad, there have been continuous upgrades and updates that are very interruptive and non-responsive. Staff do not have the technical capabilities to support whereas the support number and online support are never available.",1,75,2024-02-12,Bank of Abyssinia,Google Play
1813,"This app is the best, easy and more clear to use including its features to understand so i liked it 100% because the app made the life of customers easy. Thanks cbe and the app.",5,705,2024-12-27,Commercial Bank of Ethiopia,Google Play
1351,Its good but we need an option of choosing a specific period to get a historique of transactions. For example when i want to check my previous transactions i only get few transactions. Thank you.,2,37,2025-01-30,Commercial Bank of Ethiopia,Google Play
285,the worst app in the market only good ui,1,2,2025-03-22,Dashen Bank,Google Play
5331,An excellent app.,5,1,2024-03-01,Bank of Abyssinia,Google Play
414,Always one step forward,5,3,2025-01-15,Dashen Bank,Google Play
4019,Nice app,5,1,2024-01-17,Commercial Bank of Ethiopia,Google Play
3356,👍,5,1,2024-03-13,Commercial Bank of Ethiopia,Google Play
4176,The best way to get rid of my life thank you so much ❤️,5,1,2023-12-22,Commercial Bank of Ethiopia,Google Play


In [6]:
# check types 
df.dtypes

review_text    object
rating          int64
upvote          int64
date           object
bank_name      object
source         object
dtype: object

In [7]:
# change date format from object to 'YYYY-MM-DD' format
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df.dtypes

review_text            object
rating                  int64
upvote                  int64
date           datetime64[ns]
bank_name              object
source                 object
dtype: object

In [8]:
df.describe()

Unnamed: 0,rating,upvote,date
count,5493.0,5493.0,5493
mean,3.940834,8.674495,2024-08-31 16:32:14.680502272
min,1.0,0.0,2023-11-06 00:00:00
25%,3.0,0.0,2024-03-25 00:00:00
50%,5.0,1.0,2024-08-27 00:00:00
75%,5.0,2.0,2025-01-31 00:00:00
max,5.0,3025.0,2025-06-07 00:00:00
std,1.596825,78.129692,


In [9]:
# check for null values in the DataFrame
df.isnull().sum()

review_text    0
rating         0
upvote         0
date           0
bank_name      0
source         0
dtype: int64

In [12]:
# Check for duplicated data
dublicated_data = df.duplicated(subset=['review_text', 'rating']).sum()
print(f'data duplicated: {dublicated_data}')
df[df.duplicated()]

data duplicated: 1306


Unnamed: 0,review_text,rating,upvote,date,bank_name,source
214,best mobile banking application,5,2,2025-04-21,Dashen Bank,Google Play
464,good,5,0,2025-06-04,Commercial Bank of Ethiopia,Google Play
523,good,5,0,2025-05-23,Commercial Bank of Ethiopia,Google Play
525,good,5,0,2025-05-23,Commercial Bank of Ethiopia,Google Play
535,ok,5,0,2025-05-22,Commercial Bank of Ethiopia,Google Play
...,...,...,...,...,...,...
4131,Good,5,1,2023-12-30,Commercial Bank of Ethiopia,Google Play
4480,good,4,0,2025-04-30,Bank of Abyssinia,Google Play
5093,best,5,1,2024-05-02,Bank of Abyssinia,Google Play
5213,Good,5,1,2024-04-22,Bank of Abyssinia,Google Play


In [13]:
# remove duplicated data
df.drop_duplicates(subset=['review_text', 'rating'], inplace=True)

In [16]:
df.shape

(4187, 6)

In [18]:
# How to transfor emojis to text as a preprocessing step
def preprocess_text(text):
    # Convert emojis to text
    text = emoji.demojize(text, language='en')
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation (excluding emoji placeholders like :smile:)
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text


In [23]:
df['review_text'] = df['review_text'].apply(preprocess_text)

In [43]:
# Load the sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

sentiment_results = df['review_text'].astype(str).apply(lambda x: sentiment_pipeline(x)[0])

df[['sentiment_label', 'sentiment_score']] = df['review_text'].astype(str).apply(
    lambda x: pd.Series({
        'sentiment_label': sentiment_pipeline(x)[0]['label'],
        'sentiment_score': sentiment_pipeline(x)[0]['score']
    })
)

Device set to use cpu


In [44]:
df


Unnamed: 0,review_text,rating,upvote,date,bank_name,source,sentiment_label,sentiment_score
0,i like this mobile banking app very much overall user interface and navigation is awesome but it lacks instant response when someone deposit or withdraw money,2,0,2025-06-07,Dashen Bank,Google Play,NEGATIVE,0.998102
1,love,3,0,2025-06-06,Dashen Bank,Google Play,POSITIVE,0.999874
2,መቸሸጠ,5,0,2025-06-03,Dashen Bank,Google Play,NEGATIVE,0.697056
3,wow,5,0,2025-06-03,Dashen Bank,Google Play,POSITIVE,0.999592
4,gadaa,5,0,2025-06-01,Dashen Bank,Google Play,NEGATIVE,0.721146
...,...,...,...,...,...,...,...,...
5485,nice one,5,5,2024-01-14,Bank of Abyssinia,Google Play,POSITIVE,0.999848
5486,በጣም ደስ የሚል ለውጥ አናመሰግናለን ethiopia,5,10,2024-01-14,Bank of Abyssinia,Google Play,NEGATIVE,0.547949
5487,best application thumbs up thank you abyssinia bank,5,12,2024-01-12,Bank of Abyssinia,Google Play,POSITIVE,0.999482
5490,absolutely it s fantastic apps this new apps it s fast and good apps,5,27,2024-01-11,Bank of Abyssinia,Google Play,POSITIVE,0.999886


In [46]:
nlp = spacy.load("en_core_web_sm")

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.