In [1]:
import pandas as pd
import os

In [2]:
# Resolve the parent of the current working directory as an absolute path.
# Later cells join the relative 'data/...' path onto this directory.
working_dir = os.getcwd()
abs_dir = os.path.abspath(os.path.dirname(working_dir))

In [3]:
# Build the path to the Dashen reviews CSV under abs_dir, then load it into a DataFrame
reviews_csv_path = os.path.join(abs_dir, 'data/dashen_reviews_20250608_162802.csv')
df = pd.read_csv(reviews_csv_path)

In [4]:
# Lift the column-width limit so long review texts are shown in full
pd.options.display.max_colwidth = None

# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,review_text,rating,upvote,date,bank_name,source
0,I like this mobile banking app very much. Overall user interface and navigation is awesome. But it lacks instant response when someone deposit or withdraw money.,2,0,2025-06-07,Dashen Bank,Google Play
1,love,3,0,2025-06-06,Dashen Bank,Google Play
2,መቸሸጠ,5,0,2025-06-03,Dashen Bank,Google Play
3,wow,5,0,2025-06-03,Dashen Bank,Google Play
4,gadaa,5,0,2025-06-01,Dashen Bank,Google Play


In [5]:
# Inspect the dtype pandas inferred for each column after loading
df.dtypes

review_text    object
rating          int64
upvote          int64
date           object
bank_name      object
source         object
dtype: object

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,rating,upvote
count,449.0,449.0
mean,4.469933,7.160356
std,1.206238,24.024655
min,1.0,0.0
25%,5.0,2.0
50%,5.0,2.0
75%,5.0,4.0
max,5.0,226.0


In [7]:
# Count the missing values in each column of the DataFrame
df.isna().sum()

review_text    0
rating         0
upvote         0
date           0
bank_name      0
source         0
dtype: int64

In [11]:
# Check for duplicated data.
# A review counts as duplicated when another row shares the same
# (review_text, rating) pair. keep=False flags every member of such a group,
# not only the repeats, so the count below includes the first occurrences too.
duplicate_mask = df.duplicated(subset=['review_text', 'rating'], keep=False)

# Variable name (spelling included) is kept in case other cells reference it.
dublicated_data = duplicate_mask.sum()
print(f'data duplicated: {dublicated_data}')

# Rows that are identical across *all* columns, reported separately for comparison.
print(f'exact duplicate rows: {df.duplicated().sum()}')

# Display exactly the rows counted above (same mask as the printed number),
# sorted so members of the same duplicate group appear next to each other.
df[duplicate_mask].sort_values(['review_text', 'rating'])

data duplicated: 0


Unnamed: 0,review_text,rating,upvote,date,bank_name,source


### Remove Duplicate Entries

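Checking for exact duplicates across all columns finds only 2 rows. When duplicates are instead defined by `review_text` and `rating` alone, 57 rows are flagged (with `keep=False`, every member of a duplicate group is counted, not just the repeats). The additional matches are the same review text and rating appearing again with a different value in at least one of the remaining columns, so review text and rating are the columns I use to identify and remove duplicates. Note: the saved output of the check cell above shows 0 because that cell was last executed after the duplicates had already been removed.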

In [10]:
# Remove duplicated reviews, keeping the first occurrence of each
# (review_text, rating) pair.
# The result is reassigned to `df` rather than using inplace=True: later cells
# still see the deduplicated frame under the same name, while the step stays
# chainable and does not mutate an object another name might still reference.
rows_before = len(df)
df = df.drop_duplicates(subset=['review_text', 'rating'], keep='first')
print(f'removed {rows_before - len(df)} duplicated rows, {len(df)} remain')

# Fail loudly if any (review_text, rating) pair still occurs more than once.
assert not df.duplicated(subset=['review_text', 'rating']).any()

In [12]:
# Parse the 'date' strings (written as YYYY-MM-DD) into pandas datetime values,
# then list the dtypes again to confirm the column was converted.
parsed_dates = pd.to_datetime(df['date'], format='%Y-%m-%d')
df['date'] = parsed_dates
df.dtypes

review_text            object
rating                  int64
upvote                  int64
date           datetime64[ns]
bank_name              object
source                 object
dtype: object

In [14]:
# (rows, columns) of the DataFrame at this point in the notebook
df.shape

(411, 6)