In [None]:
pip install datasets

In [None]:
# IMPORT REQUIRED LIBRARIES
from datasets import load_dataset
import pandas as pd

# LOADING THE DATASET FOR UNSUPERVISED ASPECT BASED ANALYSIS
# Only the first 5000 rows of the train split are requested. The "content"
# column is kept and exposed to the rest of the notebook as "review".
dataset = load_dataset("amazon_polarity", split="train[:5000]")
df_filtered = pd.DataFrame(dataset)[["content"]].rename(columns={"content": "review"})

# SAVING THE DATASET SO THAT IT COULD BE USED IN ANOTHER FILE FOR DISTILBERT
# The path lives in one variable so the confirmation message always names the
# file that was actually written.
reviews_csv_path = "test_5000.csv"
df_filtered.to_csv(reviews_csv_path, index=False)
print(f"Saved to '{reviews_csv_path}'")


Saved to 'test_only_5000.csv'


In [None]:
# Preview the first rows of the single-column review frame built above.
df_filtered.head()

Unnamed: 0,review
0,This sound track was beautiful! It paints the ...
1,I'm reading a lot of reviews saying that this ...
2,This soundtrack is my favorite music of all ti...
3,I truly like this soundtrack and I enjoy video...
4,"If you've played the game, you know how divine..."


In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (

In [None]:
# IMPORT REQUIRED LIBRARIES
import pandas as pd
import spacy
import torch
from transformers import pipeline
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter
from nltk.corpus import opinion_lexicon
import nltk

# Fetch the NLTK opinion lexicon corpus and load the small English spaCy model
# used for tokenisation, POS tagging and dependency parsing.
nltk.download('opinion_lexicon')
nlp = spacy.load("en_core_web_sm")

# Use the first GPU when CUDA is available, otherwise fall back to CPU (-1) so
# the pipeline can still be constructed on machines without a GPU.
sentiment_device = 0 if torch.cuda.is_available() else -1
sentiment_model = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=sentiment_device)

[nltk_data] Downloading package opinion_lexicon to /root/nltk_data...
[nltk_data]   Unzipping corpora/opinion_lexicon.zip.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cuda:0


**ASPECT EXTRACTION AND CLASSIFICATION**

In [None]:
#This function uses dependency parsing to link candidate opinion words (adjectives, adverbs and verbs) to nouns that lie within a few dependency hops, and returns a bounded number of aspect–opinion pairs per review.

# NOTE(review): not referenced by any of the functions in this cell; kept because other cells may use it.
detected_opinion_log = []
def extract_aspects_with_opinions(text, max_pairs_per_review=5):
    """Extract (aspect, opinion) lemma pairs from a review.

    The review is parsed with the module-level spaCy pipeline ``nlp`` and its
    dependency tree is turned into an undirected graph. Every ADJ/ADV/VERB
    token is paired with each NOUN token that is one to three dependency
    edges away.

    Opinion candidates are chosen by part of speech alone; the NLTK opinion
    lexicon is not consulted here.

    Args:
        text: Raw review text.
        max_pairs_per_review: Maximum number of pairs to return.

    Returns:
        A list of ``(aspect_lemma, opinion_lemma)`` tuples, both lowercased,
        in order of first discovery (document order of the opinion token,
        then of the noun). If no pair is found, every noun is returned paired
        with the placeholder ``"(no_opinion)"``. The list is truncated to
        ``max_pairs_per_review`` entries.
    """
    doc = nlp(text)

    # Nodes are token positions, not token strings: keying by text would merge
    # every repetition of a word into one node and invent paths between
    # unrelated parts of the review.
    graph = nx.Graph()
    graph.add_nodes_from(token.i for token in doc)
    for token in doc:
        for child in token.children:
            graph.add_edge(token.i, child.i)

    nouns = [t for t in doc if t.pos_ == "NOUN"]

    # A dict is used as an insertion-ordered set so that the truncation below
    # keeps the same pairs on every run (a plain set's order is arbitrary).
    aspect_opinion_pairs = {}
    for token in doc:
        if token.pos_ not in {"ADJ", "ADV", "VERB"}:
            continue
        # One bounded search per opinion token; cutoff=3 edges corresponds to a
        # path of at most four tokens.
        hops = nx.single_source_shortest_path_length(graph, token.i, cutoff=3)
        for noun in nouns:
            if 1 <= hops.get(noun.i, 0) <= 3:
                aspect_opinion_pairs[(noun.lemma_.lower(), token.lemma_.lower())] = None

    if not aspect_opinion_pairs:
        for noun in nouns:
            aspect_opinion_pairs[(noun.lemma_.lower(), "(no_opinion)")] = None

    return list(aspect_opinion_pairs)[:max_pairs_per_review]
# Given a review and its aspect–opinion pairs, this function builds one context string per pair and labels each context with the sentiment model, returning the label alongside its aspect.

def get_aspect_sentiments_batched(text, aspect_opinion_pairs):
    """Classify the sentiment attached to each aspect of a review.

    Args:
        text: Raw review text.
        aspect_opinion_pairs: Iterable of ``(aspect, opinion)`` tuples.

    Returns:
        A list of ``(aspect, sentiment)`` tuples, one per context produced by
        ``build_contexts``. The sentiment is the predicted ``"label"`` passed
        through ``str.capitalize``; ``"Neutral"`` is used when a prediction
        has no label.
    """
    context_triples = build_contexts(text, aspect_opinion_pairs)
    predictions = batch_sentiment_analysis(context_triples)
    return [
        (aspect, prediction.get("label", "Neutral").capitalize())
        for (aspect, _opinion, _context), prediction in zip(context_triples, predictions)
    ]
#This method rebuilds the dependency graph of the review and creates a local context string for each pair by tracing the shortest dependency path between the aspect and its opinion word. These context strings are later used as input for sentiment classification.
def build_contexts(text, aspect_opinion_pairs):
    """Build a short context string for every aspect–opinion pair.

    The pairs hold lowercased lemmas, so each lemma is first mapped back to
    the positions of the tokens that carry it. Looking the lemmas up directly
    among the surface forms would miss any token whose text differs from its
    lemma (inflected or capitalised words).

    Args:
        text: Raw review text.
        aspect_opinion_pairs: Iterable of ``(aspect, opinion)`` lemma tuples;
            ``opinion`` may be the placeholder ``"(no_opinion)"``.

    Returns:
        A list of ``(aspect, opinion, context)`` tuples in input order.
        ``context`` is the surface text of the tokens on the shortest
        dependency path from an aspect token to an opinion token, joined by
        spaces. It falls back to ``aspect`` alone when the opinion is the
        placeholder or no connecting path exists.
    """
    doc = nlp(text)

    # Nodes are token positions so that repeated words stay distinct.
    graph = nx.Graph()
    graph.add_nodes_from(token.i for token in doc)
    lemma_positions = {}
    for token in doc:
        lemma_positions.setdefault(token.lemma_.lower(), []).append(token.i)
        for child in token.children:
            graph.add_edge(token.i, child.i)

    contexts = []
    for aspect, opinion in aspect_opinion_pairs:
        context = aspect
        if opinion != "(no_opinion)":
            best_path = None
            # A lemma can occur several times; keep the shortest connection.
            for source in lemma_positions.get(aspect, ()):
                for target in lemma_positions.get(opinion, ()):
                    try:
                        path = nx.shortest_path(graph, source=source, target=target)
                    except nx.NetworkXNoPath:
                        # e.g. the two tokens sit in different sentences
                        continue
                    if best_path is None or len(path) < len(best_path):
                        best_path = path
            if best_path is not None:
                context = " ".join(doc[i].text for i in best_path)
        contexts.append((aspect, opinion, context))
    return contexts
#This function runs sentiment prediction on the context strings in slices of batch_size. If prediction fails for a slice, a warning is emitted and every item in that slice gets a default neutral label.

def batch_sentiment_analysis(contexts, batch_size=64):
    """Predict a sentiment label for every context string.

    Args:
        contexts: Iterable of ``(aspect, opinion, context)`` tuples; only the
            third element is sent to the model.
        batch_size: Number of context strings passed to ``sentiment_model``
            per call.

    Returns:
        A list with one prediction per context, in input order. Entries are
        whatever ``sentiment_model`` returns for its inputs, or
        ``{"label": "NEUTRAL"}`` for every item of a slice whose prediction
        raised.
    """
    import warnings

    all_contexts = [c for (_, _, c) in contexts]
    results = []
    for i in range(0, len(all_contexts), batch_size):
        batch = all_contexts[i:i+batch_size]
        try:
            batch_results = sentiment_model(batch)
        except Exception as exc:
            # Catch Exception rather than everything so Ctrl-C / kernel
            # interrupts still stop the run, and report the failure instead of
            # hiding it behind neutral labels.
            warnings.warn(
                f"Sentiment prediction failed for {len(batch)} context(s); "
                f"defaulting to NEUTRAL: {exc!r}"
            )
            # One dict per item so callers never share a mutable default.
            batch_results = [{"label": "NEUTRAL"} for _ in batch]
        results.extend(batch_results)
    return results
# Run extraction and sentiment classification over every review and collect
# one record per (aspect, opinion) pair; the records are then written to CSV.
aspect_data = []
for review in tqdm(df_filtered["review"], total=len(df_filtered)):
    aspect_opinion_pairs = extract_aspects_with_opinions(review)
    sentiments = get_aspect_sentiments_batched(review, aspect_opinion_pairs)
    aspect_data.extend(
        {
            "Review": review,
            "Aspect": aspect,
            "Opinion_Word": opinion,
            "Sentiment": sentiment,
        }
        for (aspect, opinion), (_, sentiment) in zip(aspect_opinion_pairs, sentiments)
    )

aspect_df = pd.DataFrame(aspect_data)
aspect_df.to_csv("amazon_aspect_based_sentiment_distilbert.csv", index=False)
print("\n Enhanced aspect-based sentiment dataset saved as 'amazon_aspect_based_sentiment_distilbert.csv'")

100%|██████████| 5000/5000 [06:11<00:00, 13.46it/s]



 Enhanced aspect-based sentiment dataset saved as 'amazon_aspect_based_sentiment_distilbert.csv'


In [None]:
# Preview the first rows of the per-pair aspect sentiment frame built above.
aspect_df.head()

Unnamed: 0,Review,Aspect,Opinion_Word,Sentiment
0,This sound track was beautiful! It paints the ...,grate,soulful,Positive
1,This sound track was beautiful! It paints the ...,step,fresher,Positive
2,This sound track was beautiful! It paints the ...,guitar,take,Positive
3,This sound track was beautiful! It paints the ...,guitar,soulful,Positive
4,This sound track was beautiful! It paints the ...,game,paint,Positive


In [None]:
# Groups the aspect-based sentiment results by review, combining all detected aspects, opinion words, and their sentiments into single comma-separated entries. The merged output is saved as a CSV for easier comparison and analysis.
import pandas as pd

# Read the file from the same relative location it was written to.
# keep_default_na=False + dtype=str: lemmas such as "null", "nan" or "none" are
# ordinary words here, but the default parser would turn them into float NaN
# and make the string join below raise TypeError.
aspect_df = pd.read_csv(
    "amazon_aspect_based_sentiment_distilbert.csv",
    keep_default_na=False,
    dtype=str,
)
merged_df = aspect_df.groupby('Review').agg({
    'Aspect': ", ".join,
    'Opinion_Word': ", ".join,
    'Sentiment': ", ".join
}).reset_index()
merged_df.to_csv("merged_aspect_sentiment_output.csv", index=False)

print("\n Merged output saved as 'merged_aspect_sentiment_output.csv'")
print(merged_df.head())


 Merged output saved as 'merged_aspect_sentiment_output.csv'
                                              Review  \
0  "A World Within a World" says it all. This is ...   
1  "Castle" is a wonderful book for children who ...   
2  "Charles Manson in Nuell Emmons's Words" anyon...   
3  "Dumb Witness" is a slower moving, more relaxi...   
4  "Easy A" explains it best. If you can't relate...   

                                   Aspect  \
0     fan, fan, silverchair, lyric, world   
1    child, chivalry, span, re, attention   
2          word, lingo, word, lingo, word   
3  type, tragedy, tension, drama, tragedy   
4           book, book, book, robot, book   

                           Opinion_Word  \
0          come, make, come, happy, say   
1      have, affirm, have, affirm, have   
2       interview, see, know, know, see   
3     relaxing, just, just, well, write   
4  relate, explain, well, relate, great   

                                          Sentiment  
0  Positive, Posi

In [None]:
# Merges sentence-level and aspect-based sentiment outputs on the same review text to enable side-by-side comparison. The final CSV shows the overall sentiment alongside the aspect sentiments for each review.
import pandas as pd

# The merged aspect file is read from the relative location it was written to.
aspect_df = pd.read_csv("merged_aspect_sentiment_output.csv").rename(columns={"Review": "Sentence"})
# NOTE(review): this predictions file is not produced by the visible cells —
# presumably uploaded from the sentence-level notebook; confirm the path.
sentence_df = pd.read_csv("/content/amazon_test_predictions (1).csv").rename(columns={"review": "Sentence"})

# Inner join: only reviews present in both files are compared.
merged_df = pd.merge(sentence_df, aspect_df, on="Sentence", how="inner")

# rename() returns a new frame; renaming the column slice in place is what
# triggered pandas' SettingWithCopyWarning.
final_comparison_df = merged_df[[
    "Sentence",
    "Predicted_Sentiment",
    "Aspect",
    "Sentiment"
]].rename(columns={
    "Predicted_Sentiment": "Sentence_Level_Sentiment",
    "Aspect": "Aspects",
    "Sentiment": "Aspect_Sentiments"
})

# One variable for the path so the message names the file actually written.
comparison_csv_path = "sentence_vs_aspect_sentiment_comparison_final.csv"
final_comparison_df.to_csv(comparison_csv_path, index=False)
print(f"\n Final comparison saved as '{comparison_csv_path}'")
print(final_comparison_df.head())



 Final comparison saved as 'sentence_vs_aspect_sentiment_comparison.csv'
                                            Sentence Sentence_Level_Sentiment  \
0  This sound track was beautiful! It paints the ...                 Positive   
1  I'm reading a lot of reviews saying that this ...                 Positive   
2  This soundtrack is my favorite music of all ti...                 Positive   
3  I truly like this soundtrack and I enjoy video...                 Positive   
4  If you've played the game, you know how divine...                 Positive   

                                  Aspects  \
0       grate, step, guitar, guitar, game   
1      money, tag, review, soundtrack, cd   
2  time, fate, remeniscent, sadness, work   
3    death, game, music, soundtrack, game   
4          song, paper, game, doubt, game   

                                  Aspect_Sentiments  
0  Positive, Positive, Positive, Positive, Positive  
1  Negative, Positive, Positive, Positive, Positive  
2  Neg

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_comparison_df.rename(columns={
