In [25]:
import pandas as pd
from transformers import AutoTokenizer
import torch

In [11]:
!pip install -q transformers datasets peft accelerate sentencepiece evaluate scikit-learn



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
DATA_PATH = "../data/processed/summary_labeled.csv"  # change if needed

df = pd.read_csv(DATA_PATH)
df = df.dropna(subset=["summary"])

print(df.shape)
df.head()


(50, 3)


Unnamed: 0,app_id,app_name,summary
0,10,Counter-Strike,Counter-Strike is widely regarded as a classic...
1,20,Team Fortress Classic,Team Fortress Classic is regarded by many as a...
2,30,Day of Defeat,Reviewers generally find Day of Defeat to be a...
3,40,Deathmatch Classic,Deathmatch Classic is widely regarded as a fai...
4,50,Half-Life: Opposing Force,Half-Life: Opposing Force receives mixed revie...


In [14]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
test_ds  = Dataset.from_pandas(test_df.reset_index(drop=True))

dataset = DatasetDict({
    "train": train_ds,
    "test": test_ds,
})

dataset


DatasetDict({
    train: Dataset({
        features: ['app_id', 'app_name', 'summary'],
        num_rows: 45
    })
    test: Dataset({
        features: ['app_id', 'app_name', 'summary'],
        num_rows: 5
    })
})

In [16]:
MODEL_NAME = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


tokenizer_config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

In [18]:
MAX_INPUT = 512
MAX_TARGET = 160

def preprocess_batch(batch):
    inputs = ["summarize reviews: " + text for text in batch["summary"]]
    targets = batch["summary"]

    model_inputs = tokenizer(
        inputs,
        truncation=True,
        max_length=MAX_INPUT,
        padding="max_length",
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            truncation=True,
            max_length=MAX_TARGET,
            padding="max_length",
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized = dataset.map(
    preprocess_batch,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

tokenized


Map:   0%|          | 0/45 [00:00<?, ? examples/s]



Map:   0%|          | 0/5 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 45
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5
    })
})

In [19]:
from transformers import AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model, TaskType

base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q", "v"],
)

model = get_peft_model(base_model, config)
model.print_trainable_parameters()


config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 688,128 || all params: 77,649,280 || trainable%: 0.8862


In [21]:
import evaluate

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    labels[labels == -100] = tokenizer.pad_token_id

    preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    refs  = tokenizer.batch_decode(labels, skip_special_tokens=True)

    preds = [p.strip() for p in preds]
    refs  = [r.strip() for r in refs]

    result = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
    return {k: round(v * 100, 2) for k, v in result.items()}


In [29]:
import transformers
print(transformers.__version__)
print("Transformers version:", transformers.__version__)
print("Transformers module path:", transformers.__file__)



4.57.1
Transformers version: 4.57.1
Transformers module path: d:\Projects\NLP Steam Review\Steam-Review-NLP-Pipeline\venv\Lib\site-packages\transformers\__init__.py


In [30]:
import sys
print(sys.executable)


d:\Projects\NLP Steam Review\Steam-Review-NLP-Pipeline\venv\Scripts\python.exe


In [31]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="models/t5_lora_steam",
    learning_rate=5e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_steps=50,
    save_total_limit=1,
    fp16=torch.cuda.is_available(),
    # NOTE: no evaluation_strategy, no save_strategy, no predict_with_generate here
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],  # optional, for manual eval later
    tokenizer=tokenizer,
    data_collator=data_collator,
    # no compute_metrics — keep it simple for now
)


  trainer = Seq2SeqTrainer(


In [32]:
trainer.train()




Step,Training Loss


'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: b8bc61e2-96a0-40af-b6d4-33aaf868f7c5)')' thrown while requesting HEAD https://huggingface.co/google/flan-t5-small/resolve/main/config.json
Retrying in 1s [Retry 1/5].


TrainOutput(global_step=36, training_loss=2.0004725986056857, metrics={'train_runtime': 117.5928, 'train_samples_per_second': 1.148, 'train_steps_per_second': 0.306, 'total_flos': 25380598579200.0, 'train_loss': 2.0004725986056857, 'epoch': 3.0})

In [35]:
SAVE_DIR = "../models/t5_lora_steam"

model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

print("Saved LoRA adapter + tokenizer to:", SAVE_DIR)


Saved LoRA adapter + tokenizer to: ../models/t5_lora_steam


In [10]:
def build_lora_prompt(all_reviews: str, app_name: str):
    prompt = (
        f"You are summarizing community opinions about the game '{app_name}'. "
        f"Below is a collection of user reviews. "
        f"Your goal is to write a balanced, neutral summary that captures the common themes across multiple reviews.\n\n"

        f"Instructions:\n"
        f"- Focus on overall player sentiment, not a single review.\n"
        f"- Highlight gameplay, controls, pacing, difficulty, graphics, performance, audio, and overall enjoyment *only if mentioned*.\n"
        f"- If the reviews contradict each other, acknowledge both sides briefly.\n"
        f"- If reviews are very short or low-quality, provide the most reasonable interpretation.\n"
        f"- Do NOT copy or paraphrase any single review.\n"
        f"- Do NOT include slang, insults, or emotional rants.\n"
        f"- Do NOT invent details.\n"
        f"- Keep the tone calm, factual, and third-person.\n"
        f"- Write 2–4 sentences.\n\n"

        f"User Reviews:\n{all_reviews}\n\n"
        f"Summary:"
    )

    return prompt


In [15]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel
import pandas as pd
from tqdm import tqdm

# paths
BASE_MODEL = "google/flan-t5-small"
ADAPTER_DIR = "../models/t5_lora_steam"   # where LoRA weights were saved
INPUT_CSV = "../data/processed/grouped_summary.csv"  # your original grouped review dataset
OUTPUT_CSV = "../data/processed/final_summary_finetuned2.csv"

# Select range
START_IDX = 0
END_IDX = len(df)   # Change to len(df) to do all

device = "cuda" if torch.cuda.is_available() else "cpu"

# load base model + LoRA adapter
print("Loading base model...")
base = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL).to(device)

print("Loading tokenizer + LoRA adapter...")
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR)
model = PeftModel.from_pretrained(base, ADAPTER_DIR).to(device)

model.eval()

# load review data
df = pd.read_csv(INPUT_CSV)
subset = df.iloc[START_IDX:END_IDX].copy()

def lora_summarize(all_reviews: str, app_name: str):
    # Build prompt
    prompt = build_lora_prompt(all_reviews, app_name)

    # Tokenize
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512
    ).to(device)

    # Generate
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=160,
            num_beams=4,
            do_sample=False,
            no_repeat_ngram_size=3,
        )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()


print("Generating fine-tuned summaries...")
summaries = []

for idx, row in tqdm(subset.iterrows(), total=len(subset)):
    all_reviews = str(row["all_reviews"])
    app_name = str(row["app_name"])
    summaries.append(lora_summarize(all_reviews, app_name))
    # Save progress every 100 rows
    if (idx + 1) % 400 == 0:
        subset_partial = subset.iloc[:idx+1].copy()
        subset_partial["summary_finetuned"] = summaries
        final_subset = subset_partial[["app_id", "app_name", "summary_finetuned"]].copy()

        out_path = f"../data/processed/fine_lora_partial_{idx+1}.csv"
        final_subset.to_csv(out_path, index=False)
        print(f"Saved partial results at row {idx+1} → {out_path}")
       

subset["summary_finetuned"] = summaries

subset.to_csv(OUTPUT_CSV, index=False)
print("Saved fine-tuned summaries to:", OUTPUT_CSV)


Loading base model...
Loading tokenizer + LoRA adapter...
Generating fine-tuned summaries...


  5%|▍         | 400/8067 [26:28<6:51:45,  3.22s/it] 

Saved partial results at row 400 → ../data/processed/fine_lora_partial_400.csv


 10%|▉         | 800/8067 [48:34<6:15:00,  3.10s/it]

Saved partial results at row 800 → ../data/processed/fine_lora_partial_800.csv


 15%|█▍        | 1200/8067 [1:09:44<6:45:32,  3.54s/it]

Saved partial results at row 1200 → ../data/processed/fine_lora_partial_1200.csv


 20%|█▉        | 1600/8067 [1:32:37<5:38:55,  3.14s/it]

Saved partial results at row 1600 → ../data/processed/fine_lora_partial_1600.csv


 25%|██▍       | 2000/8067 [1:54:54<5:55:10,  3.51s/it]

Saved partial results at row 2000 → ../data/processed/fine_lora_partial_2000.csv


 30%|██▉       | 2400/8067 [2:16:57<5:00:56,  3.19s/it]

Saved partial results at row 2400 → ../data/processed/fine_lora_partial_2400.csv


 35%|███▍      | 2800/8067 [2:39:13<4:50:46,  3.31s/it]

Saved partial results at row 2800 → ../data/processed/fine_lora_partial_2800.csv


 40%|███▉      | 3200/8067 [3:01:44<4:43:39,  3.50s/it]

Saved partial results at row 3200 → ../data/processed/fine_lora_partial_3200.csv


 45%|████▍     | 3600/8067 [3:23:52<3:26:26,  2.77s/it]

Saved partial results at row 3600 → ../data/processed/fine_lora_partial_3600.csv


 50%|████▉     | 4000/8067 [3:46:14<4:04:10,  3.60s/it]

Saved partial results at row 4000 → ../data/processed/fine_lora_partial_4000.csv


 55%|█████▍    | 4400/8067 [4:07:55<3:35:17,  3.52s/it]

Saved partial results at row 4400 → ../data/processed/fine_lora_partial_4400.csv


 60%|█████▉    | 4800/8067 [4:29:42<3:24:08,  3.75s/it]

Saved partial results at row 4800 → ../data/processed/fine_lora_partial_4800.csv


 64%|██████▍   | 5200/8067 [4:51:06<2:45:23,  3.46s/it]

Saved partial results at row 5200 → ../data/processed/fine_lora_partial_5200.csv


 69%|██████▉   | 5600/8067 [5:13:00<2:23:34,  3.49s/it]

Saved partial results at row 5600 → ../data/processed/fine_lora_partial_5600.csv


 74%|███████▍  | 6000/8067 [5:34:31<1:51:57,  3.25s/it]

Saved partial results at row 6000 → ../data/processed/fine_lora_partial_6000.csv


 79%|███████▉  | 6400/8067 [5:56:06<1:22:20,  2.96s/it]

Saved partial results at row 6400 → ../data/processed/fine_lora_partial_6400.csv


 84%|████████▍ | 6800/8067 [6:17:47<1:03:25,  3.00s/it]

Saved partial results at row 6800 → ../data/processed/fine_lora_partial_6800.csv


 89%|████████▉ | 7200/8067 [6:37:59<34:14,  2.37s/it]  

Saved partial results at row 7200 → ../data/processed/fine_lora_partial_7200.csv


 94%|█████████▍| 7600/8067 [6:57:44<21:42,  2.79s/it]

Saved partial results at row 7600 → ../data/processed/fine_lora_partial_7600.csv


 99%|█████████▉| 8000/8067 [7:16:54<03:06,  2.78s/it]

Saved partial results at row 8000 → ../data/processed/fine_lora_partial_8000.csv


100%|██████████| 8067/8067 [7:19:55<00:00,  3.27s/it]


Saved fine-tuned summaries to: ../data/processed/final_summary_finetuned2.csv


In [16]:
final_subset = subset[["app_id", "app_name", "summary_finetuned"]].copy()
final_subset.to_csv(OUTPUT_CSV, index=False)
print("Saved fine-tuned summaries to:", OUTPUT_CSV)

Saved fine-tuned summaries to: ../data/processed/final_summary_finetuned2.csv
