In [1]:
import os
import sys

# Make the repository root importable from this notebook.
project_root = os.path.abspath("..")  # or path to repo root
# Guard the append so re-running this cell does not stack duplicate entries
# on sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from google import genai
import json
import time
from tqdm import tqdm
import json
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# Configure visualization settings
# NOTE(review): sns.set() runs after plt.style.use('ggplot'), so it presumably
# overrides at least part of the ggplot style — confirm which look is intended.
plt.style.use('ggplot')
sns.set(style="whitegrid")
# Silences every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

# Display settings
%matplotlib inline
# Default figure size for all plots created after this cell.
plt.rcParams['figure.figsize'] = (12, 8)
# Show every column, at most 100 rows, and floats with two decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
# Read the Gemini API key from the environment instead of embedding it in the
# notebook: anything saved in this file is readable by everyone who can see it.
# SECURITY: a key was previously hardcoded in this cell and must be treated as
# leaked — revoke/rotate it in the Google AI console.
api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError(
        "No API key found: set the GEMINI_API_KEY (or GOOGLE_API_KEY) "
        "environment variable before running this notebook."
    )
client = genai.Client(api_key=api_key)

In [5]:
# Load the labelled ABSA dataset and preview its first rows.
# NOTE(review): this path climbs two directories ("../../data/...") while the
# cell that writes absa_with_label.csv uses "../data/..." — confirm both
# resolve to the same file.
file_path = "../../data/processed/absa_with_label.csv"

df = pd.read_csv(file_path)

df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,app_id,app_name,review_text,review_score,review_votes,review_word_count,review_unique_word_count,unique_ratio,cluster,gameplay,graphics,story,performance,audio,controls,price,multiplayer
0,0,1501,15100,Assassin's Creed,Good game but there are some bugs Ultimately i...,1,1,28,26,0.93,60,1,1,0,-1,0,0,0,0
1,1,2586,264280,99 Levels To Hell,Do you want a nicely crafted rogue like game w...,-1,1,36,35,0.97,103,-1,0,0,0,0,-1,0,0
2,2,2653,262240,Suguri,SUGURI 7 Stages 1 Life No Continues No Power U...,1,1,26,26,1.0,106,0,0,0,0,0,0,0,0
3,3,1055,318530,Wings of Vi,Exigence of high skill controls problems frust...,-1,1,82,61,0.74,42,-1,0,0,0,0,-1,0,0
4,4,705,4850,Cossacks: Back to War,ballsacks back to war is a great game! you can...,-1,1,24,23,0.96,28,1,0,0,0,0,0,0,0


In [16]:
# Review aspects that each receive a sentiment label.
ASPECTS = [
    "gameplay", "graphics", "story", "performance",
    "audio", "controls", "price", "multiplayer",
]

# Gemini model identifier, written with the "models/" prefix.
MODEL_NAME = "models/gemini-2.5-flash"

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Vectorise the review text with TF-IDF (at most 5000 terms, English stop
# words removed). Missing reviews are replaced by empty strings first because
# TfidfVectorizer refuses NaN documents.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(df['review_text'].fillna("").astype(str))

# Cluster the reviews so the sample below draws from every cluster.
k = 200
km = KMeans(n_clusters=k, random_state=42)
df['cluster'] = km.fit_predict(X)

# Draw at most 25 reviews from each cluster (smaller clusters are taken whole).
sampled = (
    df.groupby("cluster")
      .apply(lambda x: x.sample(min(25, len(x)), random_state=42))
      .reset_index(drop=True)
)

# Shuffle and cap the sample at 5000 rows. Any cluster with fewer than 25
# reviews leaves the pool short of 5000, and DataFrame.sample raises
# ValueError when asked for more rows than exist, so the request is clamped
# to the pool size.
sampled = sampled.sample(min(5000, len(sampled)), random_state=42)


(5000, 9)
(5000, 9)


In [20]:
# Persist the unlabelled sample for the labelling step.
# NOTE(review): to_csv is called without index=False, so the DataFrame index is
# written as an extra unnamed column that reappears as an "Unnamed: ..." column
# when the file is read back.
sampled.to_csv("../data/processed/absa_train_no_label.csv")

In [5]:
# Reload the unlabelled training sample from disk; this rebinds `df` to it.
file_path = "../data/processed/absa_train_no_label.csv"

df = pd.read_csv(file_path)

In [6]:
# Aspects the model is asked to score for every review.
ASPECTS = [
    "gameplay",
    "graphics",
    "story",
    "performance",
    "audio",
    "controls",
    "price",
    "multiplayer",
]

# Instruction text placed in front of each review. Because this is an f-string,
# {ASPECTS} is rendered as the Python list repr and the doubled braces produce
# the literal { } of the JSON example.
prompt_template = f"""
You are an Aspect-Based Sentiment Analysis model for Steam game reviews.

Given a review, analyze sentiment for each aspect:

{ASPECTS}

For each aspect, choose ONLY one of:
- 1  (positive)
- 0  (neutral / not mentioned)
- -1 (negative)

Return JSON ONLY in this EXACT format:

{{
  "gameplay": 0,
  "graphics": 0,
  "story": 0,
  "performance": 0,
  "audio": 0,
  "controls": 0,
  "price": 0,
  "multiplayer": 0
}}

Review:
"""

In [7]:
def _neutral_labels():
    """Return the fallback result: every aspect marked 0 (neutral / not mentioned)."""
    return {aspect: 0 for aspect in ASPECTS}


def _coerce_label(value):
    """Map a raw model value onto -1, 0 or 1; anything else becomes 0."""
    try:
        label = int(value)
    except (TypeError, ValueError):
        return 0
    return label if label in (-1, 0, 1) else 0


def call_llm(review_text, retries=5, delay=2, model="gemini-2.5-flash"):
    """Ask the Gemini model for aspect sentiment labels of one review.

    Args:
        review_text: The review to analyse. None/NaN are treated as an empty
            review; other non-string values are converted with ``str``.
        retries: Maximum number of API attempts.
        delay: Seconds to wait between failed attempts.
        model: Model name passed to ``client.models.generate_content``.

    Returns:
        dict mapping every name in ``ASPECTS`` to -1, 0 or 1. When the review
        is empty, or every attempt fails, all aspects are 0 — note that this
        fallback cannot be told apart from a genuinely neutral review.
    """
    # Coerce before building the prompt: string concatenation with a float NaN
    # (a missing CSV cell) would raise outside the retry loop.
    if not isinstance(review_text, str):
        is_missing = review_text is None or (
            isinstance(review_text, float) and review_text != review_text
        )
        review_text = "" if is_missing else str(review_text)

    # Nothing to analyse, so skip the API call.
    if not review_text.strip():
        return _neutral_labels()

    prompt = prompt_template + review_text
    last_error = None

    for attempt in range(retries):
        try:
            res = client.models.generate_content(
                model=model,
                contents=prompt,
                config={"response_mime_type": "application/json"},
            )
            data = json.loads(res.text)
            # Valid JSON that is not an object (e.g. a list) is treated like
            # any other failed attempt and retried.
            if not isinstance(data, dict):
                raise ValueError(
                    f"expected a JSON object, got {type(data).__name__}"
                )
            return {
                aspect: _coerce_label(data.get(aspect, 0)) for aspect in ASPECTS
            }
        except Exception as e:  # best effort: any failure triggers a retry
            last_error = e
            # No point sleeping after the final attempt.
            if attempt < retries - 1:
                time.sleep(delay)

    # Reached when all attempts failed, and also when retries <= 0 (the
    # original loop fell through and returned None in that case).
    print("FAILED:", review_text[:80], "ERR:", last_error)
    return _neutral_labels()

def label_dataframe(df, text_column="review_text", max_workers=10):
    """Label every row of ``df`` with aspect sentiments using a thread pool.

    Each value of ``text_column`` is sent to ``call_llm`` on a worker thread.
    One column per name in ``ASPECTS`` is then added to ``df``.

    Args:
        df: DataFrame holding the reviews. It is modified in place.
        text_column: Name of the column containing the review text.
        max_workers: Number of worker threads.

    Returns:
        The same ``df`` object, with the aspect columns added.
    """
    texts = df[text_column].tolist()

    # Results are stored by row *position*. Keying them by index label and
    # sorting (as before) attaches labels to the wrong rows whenever the index
    # is not already in ascending order, e.g. after a shuffle.
    labels = [{} for _ in texts]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(call_llm, text): position
            for position, text in enumerate(texts)
        }

        # tqdm progress bar
        for future in tqdm(as_completed(futures), total=len(texts), desc="Labelling"):
            position = futures[future]
            try:
                entry = future.result()
            except Exception as e:
                # One failed row must not discard all the finished work.
                print("FAILED row", position, "ERR:", e)
                entry = None
            # Anything that is not a dict is stored as {}, which the lookup
            # below turns into all-zero labels.
            labels[position] = entry if isinstance(entry, dict) else {}

    # expand into new columns
    for aspect in ASPECTS:
        df[aspect] = [entry.get(aspect, 0) for entry in labels]

    return df

In [8]:
# Label every review through the Gemini API using 15 worker threads, then
# preview the result. label_dataframe adds the aspect columns to `df` itself
# and returns it, so `df_labeled` and `df` refer to the same DataFrame.
df_labeled = label_dataframe(df, "review_text", max_workers=15)
df_labeled.head()

Labelling: 100%|██████████| 5000/5000 [20:58<00:00,  3.97it/s]


Unnamed: 0.1,Unnamed: 0,app_id,app_name,review_text,review_score,review_votes,review_word_count,review_unique_word_count,unique_ratio,cluster,gameplay,graphics,story,performance,audio,controls,price,multiplayer
0,1501,15100,Assassin's Creed,Good game but there are some bugs Ultimately i...,1,1,28,26,0.93,60,1,1,0,-1,0,0,0,0
1,2586,264280,99 Levels To Hell,Do you want a nicely crafted rogue like game w...,-1,1,36,35,0.97,103,-1,0,0,0,0,-1,0,0
2,2653,262240,Suguri,SUGURI 7 Stages 1 Life No Continues No Power U...,1,1,26,26,1.0,106,0,0,0,0,0,0,0,0
3,1055,318530,Wings of Vi,Exigence of high skill controls problems frust...,-1,1,82,61,0.74,42,-1,0,0,0,0,-1,0,0
4,705,4850,Cossacks: Back to War,ballsacks back to war is a great game! you can...,-1,1,24,23,0.96,28,1,0,0,0,0,0,0,0


In [9]:
# Display the frame without the metadata columns.
# NOTE(review): the result of drop() is not assigned, so df_labeled itself
# keeps every column — confirm whether the saved file was meant to be trimmed.
df_labeled.drop(columns=["review_score","review_votes","Unnamed: 0","app_id","app_name","review_word_count","review_unique_word_count","unique_ratio","cluster"])

Unnamed: 0,review_text,gameplay,graphics,story,performance,audio,controls,price,multiplayer
0,Good game but there are some bugs Ultimately i...,1,1,0,-1,0,0,0,0
1,Do you want a nicely crafted rogue like game w...,-1,0,0,0,0,-1,0,0
2,SUGURI 7 Stages 1 Life No Continues No Power U...,0,0,0,0,0,0,0,0
3,Exigence of high skill controls problems frust...,-1,0,0,0,0,-1,0,0
4,ballsacks back to war is a great game! you can...,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
4995,This game was reviewed using a code sent by th...,0,0,0,0,0,0,0,0
4996,If you like FTL you will love this game Fast p...,1,1,0,0,0,0,0,0
4997,Game sucks only buy for trading cards you prob...,-1,0,0,0,0,0,0,0
4998,when i was young my grandpa was an heavy smoke...,-1,0,0,0,0,0,0,0


In [10]:
# Save the labelled dataset. index=False keeps the row index out of the file;
# without it every save/reload cycle adds another "Unnamed: 0"-style column
# to the data.
df_labeled.to_csv("../data/processed/absa_with_label.csv", index=False)