In [4]:
# Cell 2: Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from textblob import TextBlob
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix

# Plot styling: apply seaborn's "whitegrid" style first, then override the
# default figure size (width, height) so it is set after the seaborn call.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (8, 4)})


In [5]:
# Cell 3: Make sure the VADER lexicon is available for sentiment analysis.
# The download is attempted only when the lexicon is not already installed
# locally, so re-running this cell does not trigger a new download.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

# Shared analyzer instance used by the sentiment cells below.
sia = SentimentIntensityAnalyzer()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...


In [13]:
import pandas as pd

# Load preprocessed CSV (path is relative to the notebook's working directory)
df = pd.read_csv("data/processed/reviews_processed.csv")

# Quick check: first rows, then column dtypes / non-null counts.
print(df.head())
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" to the output).
df.info()


                              review_id  \
0  fc67d12c-92e2-45aa-a9e0-011f58a583bc   
1  11306fb9-5571-4950-8d32-604c5402242f   
2  809c46d2-730e-446a-9061-2a45e978ad9d   
3  f28a3a3c-eb94-4aab-88d2-89bcecebcc7b   
4  4ed89e8c-16dc-4763-94ca-04d05cf799a5   

                                         review_text  rating review_date  \
0                                               goof       5  2025-11-28   
1                                              good!       5  2025-11-28   
2                                           good jop       5  2025-11-27   
3                   bad exprience...it is so crushed       1  2025-11-27   
4  not user friendly at all it requires a huge co...       1  2025-11-26   

   review_year  review_month  bank_code       bank_name            user_name  \
0         2025            11  Abyssinia  Abyssinia Bank  Hailegebrail Tegegn   
1         2025            11  Abyssinia  Abyssinia Bank            Tsegay ab   
2         2025            11  Abyssinia  Aby

In [32]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

# Compound VADER score per review. Values are passed through str() so that
# non-string entries (e.g. NaN from an empty CSV field) do not raise inside
# the analyzer; this matches the later sentiment cell in this notebook.
df['sentiment_score'] = df['review_text'].apply(lambda x: sia.polarity_scores(str(x))['compound'])
# NOTE(review): a score of exactly 0 (no lexicon hits) falls into the
# 'NEGATIVE' branch here; there is no neutral class.
df['sentiment'] = df['sentiment_score'].apply(lambda x: 'POSITIVE' if x > 0 else 'NEGATIVE')
# Numeric encoding of the label (+1 / -1) used for averaging.
df['sentiment_numeric'] = df['sentiment'].map({'POSITIVE': 1, 'NEGATIVE': -1})


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Step 2: Sentiment Analysis

Option A: Using NLTK VADER (a HuggingFace Transformers pipeline is not used in the cells below)

In [33]:
# ---------- SENTIMENT ANALYSIS ----------
sia = SentimentIntensityAnalyzer()


def _vader_compound(text):
    """Return the VADER 'compound' score for one review (coerced to str)."""
    return sia.polarity_scores(str(text))['compound']


def _polarity_label(score):
    """Map a compound score to a label: strictly positive -> 'POSITIVE', else 'NEGATIVE'."""
    if score > 0:
        return 'POSITIVE'
    return 'NEGATIVE'


# Score, label, and numeric encoding (+1 / -1) of the label for every review.
df['sentiment_score'] = df['review_text'].apply(_vader_compound)
df['sentiment'] = df['sentiment_score'].apply(_polarity_label)
df['sentiment_numeric'] = df['sentiment'].map({'POSITIVE': 1, 'NEGATIVE': -1})

# KPI check: mean of the +1 / -1 encoding, grouped by bank and by star rating.
bank_sentiment = df.groupby('bank_name')['sentiment_numeric'].mean()
rating_sentiment = df.groupby('rating')['sentiment_numeric'].mean()

print("Average sentiment per bank:\n", bank_sentiment)
print("\nAverage sentiment per rating:\n", rating_sentiment)

Average sentiment per bank:
 bank_name
Abyssinia Bank                -0.065
Commercial Bank of Ethiopia    0.155
Name: sentiment_numeric, dtype: float64

Average sentiment per rating:
 rating
1   -0.643979
2   -0.259259
3    0.022222
4    0.269841
5    0.312236
Name: sentiment_numeric, dtype: float64


In [34]:
# Spot-check the first ten reviews next to their assigned label and score.
preview_columns = ['review_text', 'sentiment', 'sentiment_score']
print(df[preview_columns].head(10))


                                         review_text sentiment  \
0                                               goof  NEGATIVE   
1                                              good!  POSITIVE   
2                                           good jop  POSITIVE   
3                   bad exprience...it is so crushed  NEGATIVE   
4  not user friendly at all it requires a huge co...  NEGATIVE   
5                                         ምንም የማይ ሰራ  NEGATIVE   
6                                          very good  POSITIVE   
7           most of the time is not working properly  NEGATIVE   
8                                       good service  POSITIVE   
9                                     not use for me  NEGATIVE   

   sentiment_score  
0           0.0000  
1           0.4926  
2           0.4404  
3          -0.7973  
4          -0.4268  
5           0.0000  
6           0.4927  
7           0.0000  
8           0.4404  
9           0.0000  


In [35]:
# Mean raw compound score (not the +1 / -1 encoding), per bank and per rating.
# Note: this rebinds bank_sentiment and rating_sentiment.
compound_scores = df['sentiment_score']

# Average sentiment score per bank
bank_sentiment = compound_scores.groupby(df['bank_name']).mean()
print(bank_sentiment)

# Average sentiment per rating
rating_sentiment = compound_scores.groupby(df['rating']).mean()
print(rating_sentiment)


bank_name
Abyssinia Bank                 0.136077
Commercial Bank of Ethiopia    0.263350
Name: sentiment_score, dtype: float64
rating
1   -0.160578
2    0.066426
3    0.142180
4    0.316687
5    0.342402
Name: sentiment_score, dtype: float64


In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Extract top keywords per bank.
# themes maps each bank name to the vocabulary (unigrams and bigrams, English
# stop words removed) that TfidfVectorizer keeps when limited to 50 features.
themes = {}
# groupby skips rows whose bank_name is missing; sort=False keeps banks in
# order of first appearance, like unique() would.
for bank, bank_reviews in df.groupby('bank_name', sort=False)['review_text']:
    # TfidfVectorizer rejects NaN / non-string documents, so drop missing
    # reviews and coerce the rest to str before fitting.
    documents = bank_reviews.dropna().astype(str)
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=50)
    # Only the learned vocabulary is needed, so fit() is enough; the TF-IDF
    # matrix itself is never used.
    vectorizer.fit(documents)
    themes[bank] = vectorizer.get_feature_names_out()

# Check keywords per bank (printed in vocabulary order, not ranked by weight)
for bank, kws in themes.items():
    print(f"\n{bank} keywords:")
    print(kws)



Abyssinia Bank keywords:
['app' 'application' 'apps' 'bad' 'bank' 'banking' 'banking app' 'best'
 'best app' 'better' 'boa' 'crashes' 'developer' 'developer options'
 'doesn' 'doesn work' 'don' 'easy' 'ethiopia' 'experience' 'fast' 'fix'
 'good' 'good app' 'great' 'just' 'like' 'loading' 'long' 'make' 'mobile'
 'mobile banking' 'money' 'need' 'nice' 'open' 'options' 'phone' 'problem'
 'service' 'slow' 'time' 'times' 'transfer' 'update' 'use' 'work'
 'working' 'worst' 'worst app']

Commercial Bank of Ethiopia keywords:
['account' 'app' 'application' 'apps' 'bank' 'banking' 'best' 'best app'
 'better' 'branch' 'cbe' 'does' 'doesn' 'easy' 'ethiopia' 'excellent'
 'fast' 'fix' 'friendly' 'good' 'good app' 'great' 'history' 'like' 'love'
 'mobile' 'mobile banking' 'money' 'new' 'nice' 'ok' 'option' 'pin'
 'problem' 'send' 'service' 'telebirr' 'time' 'transaction' 'transfer'
 'txn' 'update' 'use' 'useful' 'user' 'work' 'working' 'works' 'በጣም' 'ነው']


In [37]:
# Example theme mapping (you can expand)
# Keys are theme labels; values are lowercase phrases searched for as
# substrings of the lowercased review text.
theme_mapping = {
    "Account Access Issues": ["login error", "password", "account blocked"],
    "Transaction Performance": ["slow transfer", "payment failed", "delay"],
    "UI & Experience": ["easy to use", "interface", "navigation"],
    "Customer Support": ["help desk", "support", "call center"],
}

# Assign themes to reviews
def assign_theme(review):
    """Return the list of themes whose keywords occur in ``review``.

    Matching is a case-insensitive substring search against the phrases in
    ``theme_mapping``. Each theme is listed at most once, in the order the
    themes appear in ``theme_mapping``.

    Parameters
    ----------
    review : str
        Review text. Non-string values (e.g. None or NaN from a missing
        field) are treated as having no matching theme.

    Returns
    -------
    list of str
        Matching theme names, or ``["Other"]`` when nothing matches.
    """
    if not isinstance(review, str):
        return ["Other"]

    # Lowercase once instead of once per keyword comparison.
    text = review.lower()
    matched = [
        theme
        for theme, keywords in theme_mapping.items()
        if any(kw in text for kw in keywords)
    ]
    return matched if matched else ["Other"]

# Tag every review with its list of matching themes (["Other"] when none match).
review_texts = df['review_text']
df['themes'] = review_texts.apply(assign_theme)


In [38]:
# Save the annotated reviews; index=False leaves the row index out of the file.
# The path is relative, so the file lands in the current working directory.
output_csv = "bank_reviews_analysis.csv"
df.to_csv(output_csv, index=False)
