In [1]:
import pandas as pd
import os

In [2]:
# Resolve the parent of the current working directory (one level above the
# folder this notebook runs from). Data files are located relative to it.
abs_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

In [3]:
# Load the reviews CSV from the data/ folder under abs_dir.
df = pd.read_csv(
    os.path.join(abs_dir, 'data/boa_reviews_20250608_162800.csv')
)

In [4]:
# Never truncate cell contents, so long review texts are displayed in full.
pd.options.display.max_colwidth = None

# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,review_text,rating,upvote,date,bank_name,source
0,it's not working,3,0,2025-06-05,Bank of Abyssinia,Google Play
1,"Hello, I’m facing a problem with the BOA Mobile app. Every time I enter my phone number and password, the app crashes and shows an error that says “BoaMobile closed because this app has a bug.” I tried updating, reinstalling, and clearing cache, but nothing worked. Please fix this bug in the next update. I really need access to my account. Thank you.",1,0,2025-06-03,Bank of Abyssinia,Google Play
2,exceptional,5,0,2025-06-03,Bank of Abyssinia,Google Play
3,BoA Mobile good bank,5,0,2025-06-02,Bank of Abyssinia,Google Play
4,this is worest app 24/7 loading,1,0,2025-06-01,Bank of Abyssinia,Google Play


In [5]:
# Inspect the dtype pandas assigned to each column when reading the CSV.
df.dtypes

review_text    object
rating          int64
upvote          int64
date           object
bank_name      object
source         object
dtype: object

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,rating,upvote
count,1044.0,1044.0
mean,3.063218,10.039272
std,1.860893,77.112271
min,1.0,0.0
25%,1.0,0.0
50%,3.0,1.0
75%,5.0,2.0
max,5.0,1810.0


In [7]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

review_text    0
rating         0
upvote         0
date           0
bank_name      0
source         0
dtype: int64

### Remove Duplicate Entries

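Checking for fully identical rows (all columns equal) finds only 8 duplicate rows. However, when duplicates are defined by `review_text` and `rating` alone, the number rises to 170 rows (counting every row that belongs to a duplicate group). The gap means that many reviews repeat the same text and rating while differing in at least one other column, so `review_text` and `rating` are the most influential factors in identifying duplicates and are used as the deduplication key.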

In [12]:
# Check for duplicated data.
# A row counts as a duplicate when its (review_text, rating) pair occurs more
# than once; keep=False flags every member of such a group, not only the repeats.
duplicate_mask = df.duplicated(subset=['review_text', 'rating'], keep=False)

# NOTE: the variable name (including its spelling) is kept unchanged in case
# later cells reference it.
dublicated_data = duplicate_mask.sum()
print(f'data duplicated: {dublicated_data}')

# Display the same rows that were counted above (the original cell displayed
# full-row duplicates instead, which did not match the printed count).
# Sorting puts the members of each duplicate group next to each other, and
# head() keeps the output short.
df[duplicate_mask].sort_values(['review_text', 'rating']).head(10)

data duplicated: 0


Unnamed: 0,review_text,rating,upvote,date,bank_name,source


In [11]:
# Remove duplicated reviews, keeping the first occurrence of each
# (review_text, rating) pair. The result is assigned back instead of using
# inplace=True, so the cell reads as an explicit transformation.
df = df.drop_duplicates(subset=['review_text', 'rating'], keep='first')

# Sanity check: no (review_text, rating) pair may remain more than once.
assert not df.duplicated(subset=['review_text', 'rating']).any()

In [13]:
# Parse the 'date' column, read as YYYY-MM-DD text, into datetime values.
df['date'] = pd.to_datetime(
    df['date'],
    format='%Y-%m-%d',
)
# Re-inspect the column dtypes after the conversion.
df.dtypes

review_text            object
rating                  int64
upvote                  int64
date           datetime64[ns]
bank_name              object
source                 object
dtype: object

In [14]:
# Dimensions of the DataFrame as (rows, columns).
df.shape

(911, 6)