In [1]:
#import dependencies
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import numpy as np
import re
import os
import spacy
from transformers import pipeline
import torch

from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from pathlib import Path

#sentiment analysis
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#keyword Extraction
from sklearn.feature_extraction.text import TfidfVectorizer


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Fetch the NLTK data packages (NLTK skips packages that are already up to date)
for package in ("punkt", "stopwords", "wordnet"):
    nltk.download(package)
# Kept as a bare final expression so the cell still displays the last download's status.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [3]:
# CONFIGURATION & SETUP
# ---------------------------
# spaCy English pipeline; preprocess_text() uses its tokens' stop-word/alphabetic flags and lemmas.
nlp = spacy.load("en_core_web_sm")
# Hugging Face sentiment pipeline; compute_sentiment() reads the 'label' and 'score' of its first result.
sentiment_model = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
# Input CSV read by main(), which uses the columns 'review', 'bank' and 'rating'.
DATA_PATH = "../data/clean_reviews.csv"  # NOTE(review): an earlier note listed ['review_id', 'bank', 'rating', 'review_text'], but main() reads 'review' - confirm the real header
# Output CSV written by main().
OUTPUT_PATH = "../data/task2_results.csv"

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


In [4]:
# ---------------------------
# TEXT PREPROCESSING FUNCTION
# ---------------------------
def preprocess_text(text):
    """Normalise a review into a space-separated string of lemmas.

    The text is lower-cased, every run of whitespace is collapsed to a single
    space, and the result is parsed with the module-level spaCy pipeline
    ``nlp``. Only purely alphabetic, non-stop-word tokens are kept, each
    replaced by its lemma.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The retained lemmas joined by single spaces (empty string if none).
    """
    normalized = re.sub(r'\s+', ' ', text.lower())

    lemmas = []
    for tok in nlp(normalized):
        # Drop punctuation/numbers (non-alphabetic) and stop words.
        if tok.is_alpha and not tok.is_stop:
            lemmas.append(tok.lemma_)

    return " ".join(lemmas)

In [5]:
# ---------------------------
# SENTIMENT ANALYSIS
# ---------------------------
def compute_sentiment(texts):
    """Score every review with the module-level ``sentiment_model`` pipeline.

    Parameters
    ----------
    texts : iterable
        One entry per review (e.g. a list or pandas Series of strings).
        A bare string is treated as a single review rather than being
        iterated character by character.

    Returns
    -------
    iterator
        Yields exactly two tuples, ``labels`` then ``scores``, so the result
        can be unpacked as ``labels, scores = compute_sentiment(texts)``.
        This also holds for empty input, which yields two empty tuples.
        ``labels`` holds the lower-cased model label, or ``'neutral'`` when
        the entry is not a non-blank string or the model call fails.
        ``scores`` holds the model score, negated unless the label is
        ``'POSITIVE'``, or ``0.0`` for the neutral fallback.
    """
    import warnings

    if isinstance(texts, str):
        texts = [texts]

    labels, scores = [], []
    for text in texts:
        label, score = 'neutral', 0.0
        # Missing, non-string and blank entries keep the neutral fallback
        # without a model call.
        if isinstance(text, str) and text.strip():
            try:
                # Only the first 512 characters are scored to bound input length.
                result = sentiment_model(text[:512])[0]
                score = result['score'] if result['label'] == 'POSITIVE' else -result['score']
                label = result['label'].lower()
            except Exception as exc:
                # Best effort: one failing review must not abort the batch,
                # but the failure is surfaced instead of silently dropped.
                warnings.warn(f"sentiment model failed, using neutral fallback: {exc!r}", RuntimeWarning)
                label, score = 'neutral', 0.0
        labels.append(label)
        scores.append(score)

    # Same shape as zip(*results), but well-defined for empty input too.
    return iter((tuple(labels), tuple(scores)))

In [6]:
# ---------------------------
# TF-IDF KEYWORD EXTRACTION
# ---------------------------
def extract_keywords(texts, top_n=10):
    """Return the highest-weighted TF-IDF terms for each document.

    A ``TfidfVectorizer`` over unigrams and bigrams (capped at 1000 features)
    is fitted on ``texts``. For each document, the ``top_n`` features with the
    largest TF-IDF weight are returned, highest first, skipping features whose
    weight is zero.

    Parameters
    ----------
    texts : iterable of str
        Documents to analyse (e.g. a list or pandas Series).
    top_n : int, default 10
        Maximum number of keywords per document. Values <= 0 yield no
        keywords.

    Returns
    -------
    list of list of str
        One keyword list per input document, in input order. Every list is
        empty when there are no documents to fit on or the vectorizer finds
        no vocabulary at all.

    Raises
    ------
    ValueError
        Propagated from the vectorizer for any failure other than an empty
        vocabulary.
    """
    docs = list(texts)
    # A slice of [-0:] would select the whole row, so non-positive top_n is
    # handled up front. An empty corpus cannot be fitted either.
    if not docs or top_n <= 0:
        return [[] for _ in docs]

    vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=1000)
    try:
        tfidf_matrix = vectorizer.fit_transform(docs)
    except ValueError as err:
        # Documents reduced to nothing by preprocessing have no keywords;
        # any other vectorizer failure is still an error.
        if "empty vocabulary" not in str(err):
            raise
        return [[] for _ in docs]
    feature_names = vectorizer.get_feature_names_out()

    keywords = []
    for i in range(tfidf_matrix.shape[0]):
        row = tfidf_matrix.getrow(i).toarray().flatten()
        # argsort is ascending: take the last top_n indices and reverse them.
        top_indices = row.argsort()[-top_n:][::-1]
        # Plain str rather than numpy string scalars keeps printed and
        # serialised output clean.
        top_features = [str(feature_names[idx]) for idx in top_indices if row[idx] > 0]
        keywords.append(top_features)
    return keywords

In [7]:
# ---------------------------
# THEME CLUSTERING (RULE-BASED)
# ---------------------------
def assign_themes(keywords):
    """Map each document's keywords to rule-based themes.

    Parameters
    ----------
    keywords : iterable of list of str
        One keyword list per document. Pass the whole collection, not a
        single document's list.

    Returns
    -------
    list of list of str
        For every document, the themes with at least one trigger word that
        occurs as a substring of one of its keywords, listed in the order the
        themes are declared below. Documents matching no theme get
        ``["Other"]``.
    """
    theme_dict = {
        "Account Access Issues": ["login", "password", "authentication", "signin", "access"],
        "Transaction Performance": ["transfer", "delay", "transaction", "payment", "fail"],
        "User Interface & Experience": ["interface", "design", "ui", "navigation", "friendly"],
        "Customer Support": ["support", "help", "service", "response", "agent"],
        "Feature Requests": ["feature", "add", "request", "need", "option"]
    }

    def get_themes(kws):
        # A bare string is one keyword, not a sequence of characters.
        if isinstance(kws, str):
            kws = [kws]
        # Materialise once so generators can be scanned once per theme.
        kws = list(kws)
        # NOTE(review): matching is by substring, so short triggers also hit
        # unrelated words (e.g. "ui" inside "quick", "add" inside "address").
        # Themes are collected in declaration order rather than through a set,
        # whose iteration order for strings can differ between runs.
        assigned = [
            theme
            for theme, triggers in theme_dict.items()
            if any(trigger in kw for kw in kws for trigger in triggers)
        ]
        return assigned if assigned else ["Other"]

    return [get_themes(kws) for kws in keywords]

In [None]:
# ---------------------------
# MAIN ANALYSIS PIPELINE
# ---------------------------
def main():
    """Run the review-analysis pipeline end to end and write the results CSV.

    Steps: read ``DATA_PATH``, drop rows without a review, lemmatise the
    reviews, score sentiment on the raw review text, extract TF-IDF keywords
    from the cleaned text, map keywords to themes, and write the columns
    ``review, bank, rating, sentiment_label, sentiment_score, themes`` to
    ``OUTPUT_PATH``.

    Raises
    ------
    KeyError
        If the input CSV lacks one of the columns ``review``, ``bank`` or
        ``rating``.
    ValueError
        If no row has a non-null review.
    """
    df = pd.read_csv(DATA_PATH)

    # Fail fast, before the expensive NLP steps, if the schema is wrong.
    required = ["review", "bank", "rating"]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{DATA_PATH} is missing required column(s): {missing}")

    df = df.dropna(subset=["review"]).copy()
    if df.empty:
        raise ValueError(f"{DATA_PATH} contains no non-null reviews to analyse")
    # Non-text cells (e.g. numeric reviews) would break the string operations below.
    df["review"] = df["review"].astype(str)

    # Preprocessing
    df["clean_text"] = df["review"].apply(preprocess_text)

    # Sentiment: compute_sentiment expects the whole collection of reviews.
    # Calling it through Series.apply hands it one string at a time, which it
    # then iterates character by character.
    labels, scores = compute_sentiment(df["review"].tolist())
    df["sentiment_label"] = list(labels)
    df["sentiment_score"] = list(scores)

    # Keywords and Themes: assign_themes likewise takes the full list of
    # per-review keyword lists, not a single review's list.
    keywords = extract_keywords(df["clean_text"].tolist())
    df["keywords"] = keywords
    df["themes"] = assign_themes(keywords)

    # Save
    df_out = df[["review", "bank", "rating", "sentiment_label", "sentiment_score", "themes"]]
    df_out.to_csv(OUTPUT_PATH, index=False)
    print(f"✅ Task 2 complete. Output saved to: {OUTPUT_PATH}")

if __name__ == "__main__":
    main()