In [1]:
import pandas as pd
import os

In [2]:
# Resolve the parent of the current working directory as an absolute path.
# Data files are later addressed relative to this location (<abs_dir>/data/...).
working_dir = os.getcwd()
abs_dir = os.path.abspath(os.path.dirname(working_dir))

In [3]:
# Build the path to the reviews CSV under abs_dir, then load it into a DataFrame.
reviews_path = os.path.join(abs_dir, 'data/combanketh_reviews_20250608_162756.csv')
df = pd.read_csv(reviews_path)

In [4]:
# Remove the column-width limit so long review texts are shown in full.
pd.options.display.max_colwidth = None

# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,review_text,rating,upvote,date,bank_name,source
0,really am happy to this app it is Siple to use everything,5,0,2025-06-07,Commercial Bank of Ethiopia,Google Play
1,I liked this app. But the User interface is very basic and not attractive at all.,2,0,2025-06-07,Commercial Bank of Ethiopia,Google Play
2,"""Why don’t your ATMs support account-to-account transfers like other countries( Kenya, Nigeria , South africa)""",4,0,2025-06-06,Commercial Bank of Ethiopia,Google Play
3,what is this app problem???,1,0,2025-06-05,Commercial Bank of Ethiopia,Google Play
4,the app is proactive and a good connections.,5,0,2025-06-05,Commercial Bank of Ethiopia,Google Play


In [5]:
# Inspect the dtype pandas assigned to each column of the loaded DataFrame
df.dtypes

review_text    object
rating          int64
upvote          int64
date           object
bank_name      object
source         object
dtype: object

In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
df.describe()

Unnamed: 0,rating,upvote
count,4000.0,4000.0
mean,4.1105,8.48825
std,1.474562,82.261194
min,1.0,0.0
25%,4.0,0.0
50%,5.0,1.0
75%,5.0,1.0
max,5.0,3025.0


In [7]:
# Count missing values per column
df.isna().sum()

review_text    0
rating         0
upvote         0
date           0
bank_name      0
source         0
dtype: int64

### Remove Duplicate Entries

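A check for fully identical rows (every column equal) finds 81 duplicates. When duplicates are instead defined by `review_text` and `rating` alone, 1,162 rows belong to a duplicate group; this count uses `keep=False`, so it includes the first occurrence in each group. This indicates that many reviews repeat the same text and rating while differing in at least one other column, so `review_text` and `rating` are the most influential fields for identifying duplicates and are used as the de-duplication key.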

In [8]:
# Count every row whose (review_text, rating) pair appears more than once.
# keep=False marks all members of a duplicate group, the first one included.
dublicated_data = (
    df.duplicated(subset=['review_text', 'rating'], keep=False)
    .sum()
)
print(f'data duplicated: {dublicated_data}')
# Display the rows that exactly repeat an earlier row across all columns.
df.loc[df.duplicated()]

data duplicated: 1162


Unnamed: 0,review_text,rating,upvote,date,bank_name,source
15,good,5,0,2025-06-04,Commercial Bank of Ethiopia,Google Play
74,good,5,0,2025-05-23,Commercial Bank of Ethiopia,Google Play
76,good,5,0,2025-05-23,Commercial Bank of Ethiopia,Google Play
86,ok,5,0,2025-05-22,Commercial Bank of Ethiopia,Google Play
103,best,5,0,2025-05-21,Commercial Bank of Ethiopia,Google Play
...,...,...,...,...,...,...
3381,Good,5,1,2024-02-11,Commercial Bank of Ethiopia,Google Play
3396,Good,5,1,2024-02-11,Commercial Bank of Ethiopia,Google Play
3409,Good,5,1,2024-02-09,Commercial Bank of Ethiopia,Google Play
3641,Good,5,1,2024-01-05,Commercial Bank of Ethiopia,Google Play


In [9]:
# Drop rows that repeat an earlier (review_text, rating) pair, keeping the
# first occurrence of each pair. inplace=True mutates df directly.
df.drop_duplicates(subset=['review_text', 'rating'], keep='first', inplace=True)

In [10]:
# Parse the 'YYYY-MM-DD' date strings into datetime values, then re-check the dtypes
df['date'] = pd.to_datetime(
    df['date'],
    format='%Y-%m-%d',
)
df.dtypes

review_text            object
rating                  int64
upvote                  int64
date           datetime64[ns]
bank_name              object
source                 object
dtype: object

In [11]:
# (rows, columns) of the DataFrame at this point in the notebook
df.shape

(2988, 6)

In [None]:
from transformers import pipeline

# Load the sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Run the model on each review. The pipeline returns a one-element list per
# input string, so [0] extracts the result dict with 'label' and 'score' keys.
sentiment_result = df['review_text'].astype(str).apply(lambda x: sentiment_pipeline(x)[0])

# Store the raw results as a column: the label/score extraction below reads
# df['sentiment_result'], which raises KeyError if the results only live in a
# local variable.
df['sentiment_result'] = sentiment_result
df['sentiment_label'] = df['sentiment_result'].apply(lambda x: x['label'])
df['sentiment_score'] = df['sentiment_result'].apply(lambda x: x['score'])


Device set to use cpu


In [14]:
# Remove the three sentiment columns from df in place.
df.drop(
    columns=['sentiment_label', 'sentiment_score', 'sentiment_result'],
    inplace=True,
)


In [15]:
# Preview the first five rows of the current DataFrame (head() defaults to n=5)
df.head()

Unnamed: 0,review_text,rating,upvote,date,bank_name,source
0,really am happy to this app it is Siple to use everything,5,0,2025-06-07,Commercial Bank of Ethiopia,Google Play
1,I liked this app. But the User interface is very basic and not attractive at all.,2,0,2025-06-07,Commercial Bank of Ethiopia,Google Play
2,"""Why don’t your ATMs support account-to-account transfers like other countries( Kenya, Nigeria , South africa)""",4,0,2025-06-06,Commercial Bank of Ethiopia,Google Play
3,what is this app problem???,1,0,2025-06-05,Commercial Bank of Ethiopia,Google Play
4,the app is proactive and a good connections.,5,0,2025-06-05,Commercial Bank of Ethiopia,Google Play
