## 1. Data Loading and Overview

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
# Suppress every warning category for the remainder of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation notices are hidden
# as well; consider narrowing it to specific categories.
warnings.filterwarnings(action='ignore')

# Plot appearance: apply the matplotlib style sheet first, then the seaborn
# palette (both touch the colour cycle, so the order is kept as-is).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# pandas display options: no cap on the number of columns shown, and cell
# text truncated at 100 characters.
for option_name, option_value in {
    'display.max_columns': None,
    'display.max_colwidth': 100,
}.items():
    pd.set_option(option_name, option_value)

print("✓ Libraries imported successfully")

✓ Libraries imported successfully


In [2]:
# Read the analyzed-reviews CSV from the processed-data folder
# (the path is relative to the notebook's working directory).
df = pd.read_csv('../data/processed/analyzed_reviews.csv')

# Quick overview: row count, column names, and the dtypes pandas inferred
n_reviews = df.shape[0]
column_names = list(df.columns)
print(f"Dataset loaded: {n_reviews} reviews")
print(f"Columns: {column_names}")
print(f"\nData types:\n{df.dtypes}")

Dataset loaded: 827 reviews
Columns: ['review', 'rating', 'date', 'bank', 'source', 'sentiment_score', 'sentiment_label', 'pos_score', 'neu_score', 'neg_score', 'rating_adjusted', 'preprocessed_text', 'themes', 'themes_str']

Data types:
review                object
rating                 int64
date                  object
bank                  object
source                object
sentiment_score      float64
sentiment_label       object
pos_score            float64
neu_score            float64
neg_score            float64
rating_adjusted         bool
preprocessed_text     object
themes                object
themes_str            object
dtype: object


In [3]:
# Data completeness check: row totals, per-bank volume, and null counts
banner = "=" * 70
print(banner, "DATA COMPLETENESS CHECK", banner, sep="\n")

print(f"\nTotal reviews: {df.shape[0]}")

# Number of reviews for each bank, sorted by bank name
print("\nReviews per bank:")
print(df['bank'].value_counts().sort_index())

# Null count per column and its share of all rows (percent, 2 decimals)
print("\nMissing values:")
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100).round(2)
missing_df = missing.to_frame('Missing Count').assign(Percentage=missing_pct)

# Only list the columns that actually contain nulls
print(missing_df.loc[missing_df['Missing Count'].gt(0)])

print("\n✓ Data verification complete")

DATA COMPLETENESS CHECK

Total reviews: 827

Reviews per bank:
bank
Bank of Abyssinia              290
Commercial Bank of Ethiopia    227
Dashen Bank                    310
Name: count, dtype: int64

Missing values:
                   Missing Count  Percentage
preprocessed_text              2        0.24
themes_str                   126       15.24

✓ Data verification complete


In [4]:
# Statistical summary: rating and sentiment distributions, the review date
# range, and per-bank averages.
print("=" * 70)
print("STATISTICAL SUMMARY")
print("=" * 70)

print("\n1. Rating Statistics:")
print(df['rating'].describe())

print("\n2. Rating Distribution:")
print(df['rating'].value_counts().sort_index())

print("\n3. Sentiment Distribution:")
print(df['sentiment_label'].value_counts())

# The `date` column is read from CSV as text (object dtype), so min()/max()
# on the raw column compare strings lexicographically. That only matches
# chronological order while every value is zero-padded ISO text. Parse into a
# local datetime Series for the range instead; `df` itself is left untouched.
review_dates = pd.to_datetime(df['date'], errors='coerce')
# Values that were present but could not be parsed (excludes original nulls)
unparseable_dates = int(review_dates.isna().sum() - df['date'].isna().sum())
earliest, latest = review_dates.min(), review_dates.max()

print("\n4. Date Range:")
print(f"   Earliest: {earliest.date() if pd.notna(earliest) else 'n/a'}")
print(f"   Latest: {latest.date() if pd.notna(latest) else 'n/a'}")
if unparseable_dates:
    # Surface coerced values rather than silently shrinking the range
    print(f"   ({unparseable_dates} unparseable date value(s) excluded)")

# Mean rating and review count per bank, rounded to 2 decimals
print("\n5. Average Rating by Bank:")
bank_ratings = df.groupby('bank')['rating'].agg(['mean', 'count']).round(2)
print(bank_ratings)

# Mean and standard deviation of the sentiment score per bank (4 decimals)
print("\n6. Average Sentiment Score by Bank:")
bank_sentiment = df.groupby('bank')['sentiment_score'].agg(['mean', 'std']).round(4)
print(bank_sentiment)

STATISTICAL SUMMARY

1. Rating Statistics:
count    827.000000
mean       3.432890
std        1.788118
min        1.000000
25%        1.000000
50%        5.000000
75%        5.000000
max        5.000000
Name: rating, dtype: float64

2. Rating Distribution:
rating
1    251
2     43
3     53
4     57
5    423
Name: count, dtype: int64

3. Sentiment Distribution:
sentiment_label
Positive    510
Negative    304
Neutral      13
Name: count, dtype: int64

4. Date Range:
   Earliest: 2024-08-01
   Latest: 2025-11-26

5. Average Rating by Bank:
                             mean  count
bank                                    
Bank of Abyssinia            2.64    290
Commercial Bank of Ethiopia  3.77    227
Dashen Bank                  3.93    310

6. Average Sentiment Score by Bank:
                               mean     std
bank                                       
Bank of Abyssinia           -0.0047  0.4758
Commercial Bank of Ethiopia  0.2597  0.4310
Dashen Bank                  0.3698  0.