In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel
from tqdm.auto import tqdm
from pathlib import Path
# Pick the torch device string once; every model and tokenized batch below is moved to it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


In [5]:
# Location of the processed review table, relative to this notebook's working directory.
CSV_PATH = Path("../../data/processed/mockup_data.csv")

# Read the whole table, list its columns, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer=CSV_PATH)
print(df.columns)
df.head(5)


Index(['app_id', 'app_name', 'review_text', 'review_score', 'review_votes'], dtype='object')


Unnamed: 0,app_id,app_name,review_text,review_score,review_votes
0,10,Counter-Strike,Classic FPS with tight gunplay and fast-paced ...,1,1
1,10,Counter-Strike,Too many hackers lately but still enjoyable wi...,0,1
2,10,Counter-Strike,The maps are well-designed but the learning cu...,1,0
3,10,Counter-Strike,I keep coming back to this game after years.,1,1
4,10,Counter-Strike,Needs better matchmaking but gameplay is unmat...,1,0


In [6]:
# Make sure app_name is not null (groupby drops rows whose key is NaN by default)
df["app_name"] = df["app_name"].fillna("Unknown")

MAX_REVIEWS_PER_GAME = 20
GAME_KEYS = ["app_id", "app_name"]

# Drop rows that have no review text before capping and joining. Otherwise the
# astype(str) in the join below turns every missing review into the literal
# text "nan" inside all_reviews, and those rows also use up the per-game quota.
df_reviews = df.dropna(subset=["review_text"])

# Keep at most MAX_REVIEWS_PER_GAME reviews per game, in their original row order.
df_small = df_reviews.groupby(GAME_KEYS).head(MAX_REVIEWS_PER_GAME)

# One row per game: all kept reviews joined into a single " ||| "-separated string.
grouped = (
    df_small.groupby(GAME_KEYS)["review_text"]
      .apply(lambda texts: " ||| ".join(texts.astype(str)))
      .reset_index()
      .rename(columns={"review_text": "all_reviews"})
)

final_grouped = grouped[["app_id", "app_name", "all_reviews"]].copy()
print("Grouped shape:", final_grouped.shape)
final_grouped.head()


Grouped shape: (10, 3)


Unnamed: 0,app_id,app_name,all_reviews
0,10,Counter-Strike,Classic FPS with tight gunplay and fast-paced ...
1,20,Team Fortress 2,Fun team-based gameplay with unique classes. |...
2,30,Skyrim,Huge open world with endless quests. ||| Comba...
3,40,Minecraft,Creative and relaxing building experience. |||...
4,50,Stardew Valley,Amazing farming and relationship mechanics. ||...


In [7]:
# Checkpoint name shared by the base model here and the LoRA base model loaded later.
BASE_MODEL_NAME = "google/flan-t5-small"

# Tokenizer max_length for prompts, and max_new_tokens for generation, in the summarize_* functions.
MAX_INPUT_TOKENS = 512
MAX_NEW_TOKENS = 160

print("Loading base FLAN-T5-small...")
base_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

# Load the weights, move them to the selected device, then switch to eval mode.
base_model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_NAME)
base_model = base_model.to(device)
base_model.eval()


Loading base FLAN-T5-small...


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [8]:
def build_prompt(all_reviews: str, app_name: str, max_chars: int = 3000) -> str:
    """Build the instruction prompt used to summarize one game's reviews.

    Args:
        all_reviews: The game's reviews already joined into a single string.
        app_name: Game title, interpolated into the first sentence of the prompt.
        max_chars: Maximum number of characters of ``all_reviews`` to embed.
            Longer input is cut to this length and followed by a
            "[Reviews truncated]" marker. Defaults to 3000.

    Returns:
        The full prompt text, ending with the "Summary:" cue.

    Raises:
        ValueError: If ``max_chars`` is negative.

    NOTE(review): this cap is character-based only. The summarize_* functions
    in this notebook additionally tokenize with truncation=True and
    max_length=MAX_INPUT_TOKENS, which presumably cuts from the end of the
    prompt, so for long review text the trailing "Summary:" cue may not reach
    the model. Consider passing a smaller ``max_chars`` and confirm against
    the tokenizer's truncation settings.
    """
    if max_chars < 0:
        raise ValueError(f"max_chars must be non-negative, got {max_chars}")

    if len(all_reviews) > max_chars:
        all_reviews_trimmed = all_reviews[:max_chars] + "\n\n[Reviews truncated]"
    else:
        all_reviews_trimmed = all_reviews

    # Adjacent literals are concatenated into one string; only the pieces that
    # interpolate a value need to be f-strings.
    prompt = (
        f"You are summarizing community opinions about the game '{app_name}'. "
        "Below is a collection of user reviews. "
        "Your goal is to write a balanced, neutral summary that captures the common themes across multiple reviews.\n\n"

        "Instructions:\n"
        "- Focus on overall player sentiment, not a single review.\n"
        "- Highlight gameplay, controls, pacing, difficulty, graphics, performance, audio, and overall enjoyment only if mentioned.\n"
        "- If the reviews contradict each other, briefly acknowledge both sides.\n"
        "- If reviews are very short or low-quality, provide the most reasonable interpretation.\n"
        "- Do NOT copy or paraphrase any single review.\n"
        "- Do NOT include slang, insults, or emotional rants.\n"
        "- Do NOT invent details.\n"
        "- Keep the tone calm, factual, and third-person.\n"
        "- Write 2–4 sentences.\n\n"

        f"User Reviews:\n{all_reviews_trimmed}\n\n"
        "Summary:"
    )

    return prompt


In [9]:
def summarize_with_base(all_reviews: str, app_name: str) -> str:
    """Generate a summary of one game's joined reviews with the base model.

    The prompt from build_prompt is tokenized with base_tokenizer (truncated to
    MAX_INPUT_TOKENS), moved to `device`, and passed to base_model.generate
    with gradients disabled. The first returned sequence is decoded without
    special tokens and stripped of surrounding whitespace.
    """
    encoded = base_tokenizer(
        build_prompt(all_reviews, app_name),
        max_length=MAX_INPUT_TOKENS,
        truncation=True,
        return_tensors="pt",
    ).to(device)

    # Deterministic decoding: 4-beam search, no sampling, no repeated 3-grams.
    generation_kwargs = dict(
        max_new_tokens=MAX_NEW_TOKENS,
        num_beams=4,
        do_sample=False,
        no_repeat_ngram_size=3,
    )

    with torch.no_grad():
        generated = base_model.generate(**encoded, **generation_kwargs)

    summary = base_tokenizer.decode(generated[0], skip_special_tokens=True)
    return summary.strip()


In [9]:
# Directory holding the LoRA adapter (also used below as the tokenizer source).
ADAPTER_DIR = Path("../../models/checkpoints/t5_lora_steam")

# Report the resolved location, and fail early if it is missing.
adapter_dir_found = ADAPTER_DIR.exists()
print("LoRA adapter dir:", ADAPTER_DIR.resolve(), "exists:", adapter_dir_found)
if not adapter_dir_found:
    raise FileNotFoundError(f"LoRA adapter directory not found at: {ADAPTER_DIR.resolve()}")


LoRA adapter dir: D:\Projects\NLP Steam Review\Steam-Review-NLP-Pipeline\models\checkpoints\t5_lora_steam exists: True


In [11]:
print("Loading base model for LoRA...")
# A second, separate instance of the base checkpoint; presumably so that
# wrapping it with the adapter leaves base_model untouched — TODO confirm.
ft_base_model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_NAME)
ft_base_model = ft_base_model.to(device)

print("Loading tokenizer + LoRA adapter...")
# Both the tokenizer and the adapter are read from ADAPTER_DIR.
ft_tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR)
ft_model = PeftModel.from_pretrained(ft_base_model, ADAPTER_DIR)
ft_model = ft_model.to(device)
ft_model.eval()


Loading base model for LoRA...
Loading tokenizer + LoRA adapter...


PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 512)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 512)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=512, out_features=384, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=512, out_features=16, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=16, out_features=384, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
             

In [12]:
def summarize_with_finetuned(all_reviews: str, app_name: str) -> str:
    """Generate a summary of one game's joined reviews with the LoRA model.

    Uses the same prompt (build_prompt) and the same generation settings as
    summarize_with_base, but tokenizes with ft_tokenizer and generates with
    ft_model. Returns the first decoded sequence, without special tokens and
    stripped of surrounding whitespace.
    """
    prompt_text = build_prompt(all_reviews, app_name)

    model_inputs = ft_tokenizer(
        prompt_text,
        max_length=MAX_INPUT_TOKENS,
        truncation=True,
        return_tensors="pt",
    )
    model_inputs = model_inputs.to(device)

    # Gradients are not needed for generation.
    with torch.no_grad():
        sequences = ft_model.generate(
            **model_inputs,
            no_repeat_ngram_size=3,
            do_sample=False,
            num_beams=4,
            max_new_tokens=MAX_NEW_TOKENS,
        )

    first_sequence = sequences[0]
    decoded = ft_tokenizer.decode(first_sequence, skip_special_tokens=True)
    return decoded.strip()


In [13]:
from tqdm.auto import tqdm

# Half-open positional window [START_IDX, END_IDX) over the rows of final_grouped.
START_IDX = 0
END_IDX = 10 # or e.g. 50, 100, etc.

# Work on a copy so the summary columns added to `subset` do not touch final_grouped.
row_window = slice(START_IDX, END_IDX)
subset = final_grouped.iloc[row_window].copy()
print(f"Processing rows [{START_IDX}:{END_IDX}) → {len(subset)} games")


Processing rows [0:10) → 10 games


In [14]:
# Summaries collected in row order of `subset`, one entry per game for each model.
base_summaries = []
ft_summaries = []

# Walk the two needed columns in lockstep; for each game the base model runs first, then the LoRA model.
game_inputs = zip(subset["app_name"], subset["all_reviews"])
for raw_name, raw_reviews in tqdm(game_inputs, total=len(subset), desc="Summarizing with base & LoRA"):
    app_name = str(raw_name)
    all_reviews = str(raw_reviews)

    base_summary = summarize_with_base(all_reviews, app_name)
    base_summaries.append(base_summary)

    ft_summary = summarize_with_finetuned(all_reviews, app_name)
    ft_summaries.append(ft_summary)

# Attach both result lists as new columns and preview them.
subset["summary_base"] = base_summaries
subset["summary_finetuned"] = ft_summaries
subset.head()


Summarizing with base & LoRA:   0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,app_id,app_name,all_reviews,summary_base,summary_finetuned
0,10,Counter-Strike,The best :) ||| Best version of Counter Strike...,– It was the first FPS game I ever played and ...,– It was the first FPS game I ever played and ...
1,20,Team Fortress Classic,"She's dead, Jim. ||| Great game. This is a cla...","– Team Fortress Classic is a classic game, and...","– Team Fortress Classic is a classic game, and..."
2,30,Day of Defeat,Great game. It's a shame people try to cheat y...,"– Thsi isn't a bad game, but it's a good game....",– Thsi is really Counter-strike world war 2 wi...
3,40,Deathmatch Classic,"Quake 1 multiplayer sure is fun, and so is Dea...",– Deathmatch Classic isn't exactly the same as...,– Deathmatch Classic is the standard Half-Life...
4,50,Half-Life: Opposing Force,"Like Half Life, but with more weapons and alie...","Good story: New areas, weapons, enemies","Good story: New areas, weapons, enemies"


In [None]:
# Copy both summary columns back onto the rows of final_grouped that `subset`
# was taken from. The rows are addressed by subset.index rather than by a
# START_IDX:END_IDX-1 label slice: .loc is label-based, so the slice form only
# hits the summarized rows when final_grouped has a default 0..n-1 index.
# Rows outside `subset` are left without a summary value.
for summary_col in ("summary_base", "summary_finetuned"):
    final_grouped.loc[subset.index, summary_col] = subset[summary_col].to_numpy()

# Write the full grouped table (including games without summaries) to disk.
OUTPUT_CSV = Path("../../data/processed/final_summary_both_models.csv")
OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
final_grouped.to_csv(OUTPUT_CSV, index=False)

print("Saved summaries for both models to:", OUTPUT_CSV)


Saved summaries for both models to: ..\data\processed\final_summary_both_models.csv
