In [None]:
# Start from a clean checkout: delete any earlier clone, then fetch the pipeline repository.
!rm -rf /content/Steam-Review-NLP-Pipeline
!git clone https://github.com/chankiel/Steam-Review-NLP-Pipeline.git /content/Steam-Review-NLP-Pipeline

Cloning into '/content/Steam-Review-NLP-Pipeline'...
remote: Enumerating objects: 150, done.[K
remote: Counting objects: 100% (150/150), done.[K
remote: Compressing objects: 100% (110/110), done.[K
remote: Total 150 (delta 56), reused 116 (delta 25), pack-reused 0 (from 0)[K
Receiving objects: 100% (150/150), 1.59 MiB | 11.82 MiB/s, done.
Resolving deltas: 100% (56/56), done.


In [None]:
!pip install -q transformers sentencepiece


In [1]:
import gdown
import sys, os
import pandas as pd
import torch

In [2]:
# Source file (Google Drive share link):
# https://drive.google.com/file/d/1yA3RR861M-rWCvVTZ-i_HhvhRNYn2xpF/view?usp=sharing
# The "uc?id=" form below points at the same file id and is what gets passed to gdown.
url = "https://drive.google.com/uc?id=1yA3RR861M-rWCvVTZ-i_HhvhRNYn2xpF"
# Local destination of the downloaded CSV.
output = "/content/summarizer.csv"

# quiet=False keeps the progress output visible; as the last expression of the
# cell, the call's return value is what the notebook displays.
gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1yA3RR861M-rWCvVTZ-i_HhvhRNYn2xpF
To: /content/summarizer.csv
100%|██████████| 45.9M/45.9M [00:00<00:00, 154MB/s]


'/content/summarizer.csv'

In [3]:
# Put the cloned repository's `src` directory at the front of the import path
# and remember where its `preprocess` sub-directory lives.
src_root = "/content/Steam-Review-NLP-Pipeline/src"
preprocess_dir = f"{src_root}/preprocess"
sys.path.insert(0, src_root)


In [5]:
# Load the downloaded review dump and fail fast when any column used by the
# later cells is absent.
INPUT_CSV = "/content/summarizer.csv"
required_cols = {"app_id", "app_name", "review_text"}

df = pd.read_csv(INPUT_CSV)

missing = required_cols.difference(df.columns)
if len(missing) > 0:
    raise ValueError(f"Missing columns: {missing}")


In [6]:
# Give unnamed games a placeholder name so they still form a group
# (pandas groupby skips NaN keys by default).
df["app_name"] = df["app_name"].fillna("Unknown")

MAX_REVIEWS_PER_GAME = 20  # cap per game so the joined text does not get too long

# Drop rows without review text before capping and joining. Otherwise
# astype(str) below would turn every missing review into the literal string
# "nan", which would both consume one of the per-game slots and end up in the
# text handed to the summarizer.
df_reviews = df.dropna(subset=["review_text"])

# Keep the first MAX_REVIEWS_PER_GAME reviews of each game, in file order.
df_small = df_reviews.groupby(["app_id", "app_name"]).head(MAX_REVIEWS_PER_GAME)

# One row per game; its reviews are concatenated with " ||| " as the delimiter.
grouped = (
    df_small.groupby(["app_id", "app_name"])["review_text"]
      .apply(lambda x: " ||| ".join(x.astype(str)))
      .reset_index()
      .rename(columns={"review_text": "all_reviews"})
)

# Persist the grouped table so it can be inspected or reused without regrouping.
final_grouped = grouped[["app_id", "app_name", "all_reviews"]].copy()
final_grouped.to_csv("/content/grouped_summary.csv", index=False)

In [7]:
def build_compact_reviews(all_reviews: str, tokenizer, max_tokens: int) -> str:
    """
    Pack as many whole reviews as fit into a token budget.

    Args:
        all_reviews: reviews joined as "rev1 ||| rev2 ||| rev3 ...".
        tokenizer: object providing ``encode(text, add_special_tokens=False)``
            (returns a sequence of token ids) and
            ``decode(ids, skip_special_tokens=True)`` (returns text).
        max_tokens: upper bound on the tokens of the selected reviews,
            including the separators inserted between them. Values below
            zero are treated as zero.

    Returns:
        The selected reviews joined with " [REVIEW] ". Reviews are visited in
        their original order; a review that does not fit in the remaining
        budget is skipped so that shorter later reviews can still be
        included. If no single review fits, the first review truncated to the
        budget is returned. If the input contains no non-empty review, the
        input is returned unchanged.
    """
    separator = " [REVIEW] "
    budget = max(max_tokens, 0)

    reviews = [r.strip() for r in all_reviews.split(" ||| ") if r.strip()]
    if not reviews:
        return all_reviews  # nothing to select from

    # The separator is part of the final text, so it is charged to the budget too.
    sep_len = len(tokenizer.encode(separator, add_special_tokens=False))

    compact = []
    used_tokens = 0

    for r in reviews:
        if used_tokens >= budget:
            break  # budget exhausted; tokenizing further reviews is pointless

        cost = len(tokenizer.encode(r, add_special_tokens=False))
        if compact:
            cost += sep_len  # a separator precedes every review but the first

        if used_tokens + cost > budget:
            # Skip instead of stopping: one long review must not hide the
            # shorter ones that follow it.
            continue

        compact.append(r)
        used_tokens += cost

    if not compact:
        # Every review is longer than the budget. Returning the full input
        # here would ignore the budget entirely, so keep the head of the
        # first review instead.
        first_ids = tokenizer.encode(reviews[0], add_special_tokens=False)
        return tokenizer.decode(first_ids[:budget], skip_special_tokens=True)

    return separator.join(compact)


In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm.auto import tqdm

# ==========================
# 1. CONFIG
# ==========================
# smaller model = faster (you can switch back to flan-t5-base later)
MODEL_NAME = "google/flan-t5-small"

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Load tokenizer and model once for the whole notebook; eval() puts the model
# in inference mode.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
model.eval()

# max tokens for input (we truncate if too long)
# NOTE(review): the recorded run of this cell shows a tokenizer warning that an
# 830-token sequence exceeds the model's specified maximum of 512, so 3000 is
# above what the tokenizer reports for this model - confirm this is intended.
MAX_INPUT_TOKENS = 3000   # smaller = faster
MAX_OUTPUT_TOKENS = 300   # shorter summary = faster

# ==========================
# 2. CHUNK SELECTION (PARTIAL RUN)
# ==========================
# The recorded run reports 8067 games in `grouped`. Process only a slice per run.
# Rows START_IDX (inclusive) to END_IDX (exclusive) are summarized; the values
# below select the first 50 rows.
START_IDX = 0
END_IDX = 50   # move START_IDX / END_IDX forward for the next runs

# Copy the slice so that adding columns to it later does not touch `grouped`.
subset = grouped.iloc[START_IDX:END_IDX].copy()
print(f"Summarizing rows [{START_IDX}:{END_IDX}) out of {len(grouped)}")

# ==========================
# 3. SIMPLE ONE-PASS SUMMARIZER
# ==========================
def summarize_game_reviews(all_reviews: str, app_name: str) -> str:
    """
    Summarize the reviews of one game with a single prompt (no multi-chunk pass).

    Args:
        all_reviews: the game's reviews joined as "rev1 ||| rev2 ||| ...".
        app_name: name of the game, interpolated into the prompt.

    Returns:
        The decoded model output with surrounding whitespace removed.

    Uses the module-level `tokenizer`, `model`, `device`, `MAX_INPUT_TOKENS`,
    `MAX_OUTPUT_TOKENS` and `build_compact_reviews`.
    """
    # Fixed text placed before the reviews.
    instructions = (
        f"You are summarizing community opinions about the game '{app_name}'. "
        f"Below is a collection of user reviews. "
        f"Your goal is to write a balanced, neutral summary that captures the common themes across multiple reviews.\n\n"

        f"Instructions:\n"
        f"- Focus on overall player sentiment, not a single review.\n"
        f"- Highlight gameplay, controls, pacing, difficulty, graphics, performance, audio, and overall enjoyment *only if mentioned*.\n"
        f"- If the reviews contradict each other, acknowledge both sides briefly.\n"
        f"- If reviews are very short or low-quality, provide the most reasonable interpretation.\n"
        f"- Do NOT copy or paraphrase any single review.\n"
        f"- Do NOT include slang, insults, or emotional rants.\n"
        f"- Do NOT invent details.\n"
        f"- Keep the tone calm, factual, and third-person.\n"
        f"- Write 2–4 sentences.\n\n"

        f"User Reviews:\n"
    )
    # Fixed text placed after the reviews; it cues the model to start the summary.
    closing = "\n\nSummary:"

    # Measure the fixed parts (special tokens included) instead of guessing a
    # margin, so the reviews get exactly what is left of the input budget and
    # the tokenizer's truncation does not have to cut off the closing cue.
    overhead_tokens = len(tokenizer.encode(instructions + closing))
    review_budget = max(MAX_INPUT_TOKENS - overhead_tokens, 0)

    compact_reviews = build_compact_reviews(
        all_reviews,
        tokenizer,
        max_tokens=review_budget,
    )

    # Build the prompt from the budgeted selection, not from the raw
    # `all_reviews`; otherwise the selection above would have no effect.
    prompt = f"{instructions}{compact_reviews}{closing}"

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,  # safety net in case the selection still overshoots
        max_length=MAX_INPUT_TOKENS,
    ).to(device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_length=MAX_OUTPUT_TOKENS,
            min_length=24,
            num_beams=2,        # fewer beams = faster
            do_sample=False,    # deterministic decoding
            no_repeat_ngram_size=3,
        )

    summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return summary.strip()

# ==========================
# 4. RUN OVER SUBSET
# ==========================
# Summaries are collected in row order so they line up with `subset` when
# assigned as a column below.
summaries = []

for _, row in tqdm(subset.iterrows(), total=len(subset), desc="Summarizing games"):
    summary = summarize_game_reviews(row["all_reviews"], row["app_name"])
    summaries.append(summary)

subset["summary"] = summaries

# ==========================
# 5. SAVE PARTIAL RESULT
# ==========================
# The slice bounds are part of the file name, so each partial run writes its own file.
OUTPUT_CSV = f"/content/steam_game_summaries_prompting_{START_IDX}_{END_IDX}.csv"
# Only the identifying columns and the summary are written; the raw review text is left out.
final_subset = subset[["app_id", "app_name", "summary"]].copy()
final_subset.to_csv(OUTPUT_CSV, index=False)
print(f"Saved summaries for rows [{START_IDX}:{END_IDX}) to {OUTPUT_CSV}")


Using device: cuda
Summarizing rows [0:50) out of 8067


Summarizing games:   0%|          | 0/50 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (830 > 512). Running this sequence through the model will result in indexing errors


Saved summaries for rows [0:50) to /content/steam_game_summaries_prompting_0_50.csv
