In [1]:
!rm -rf /content/Steam-Review-NLP-Pipeline
!git clone https://github.com/chankiel/Steam-Review-NLP-Pipeline.git /content/Steam-Review-NLP-Pipeline

Cloning into '/content/Steam-Review-NLP-Pipeline'...
remote: Enumerating objects: 132, done.[K
remote: Counting objects: 100% (132/132), done.[K
remote: Compressing objects: 100% (95/95), done.[K
remote: Total 132 (delta 40), reused 111 (delta 22), pack-reused 0 (from 0)[K
Receiving objects: 100% (132/132), 1.58 MiB | 22.53 MiB/s, done.
Resolving deltas: 100% (40/40), done.


In [2]:
import gdown
# https://drive.google.com/file/d/1yA3RR861M-rWCvVTZ-i_HhvhRNYn2xpF/view?usp=sharing
url = "https://drive.google.com/uc?id=1yA3RR861M-rWCvVTZ-i_HhvhRNYn2xpF"
output = "/content/summarizer.csv"

gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1yA3RR861M-rWCvVTZ-i_HhvhRNYn2xpF
To: /content/summarizer.csv
100%|██████████| 45.9M/45.9M [00:00<00:00, 193MB/s]


'/content/summarizer.csv'

In [3]:
import sys, os

src_root = "/content/Steam-Review-NLP-Pipeline/src"
sys.path.insert(0, src_root)   # put it at the front

print("First 5 sys.path entries:")
print("\n".join(sys.path[:5]))

print("\nDoes src_root exist?", os.path.exists(src_root))
print("Contents of src_root:", os.listdir(src_root))

First 5 sys.path entries:
/content/Steam-Review-NLP-Pipeline/src
/content
/env/python
/usr/lib/python312.zip
/usr/lib/python3.12

Does src_root exist? True
Contents of src_root: ['summarizer', 'training', 'inference', 'api', 'common', 'preprocess', 'utils']


In [4]:
preprocess_dir = "/content/Steam-Review-NLP-Pipeline/src/preprocess"

import os
print("Does preprocess dir exist?", os.path.exists(preprocess_dir))
if os.path.exists(preprocess_dir):
    print("Contents of preprocess dir:", os.listdir(preprocess_dir))

Does preprocess dir exist? True
Contents of preprocess dir: ['sampling.py', '__init__.py', 'filtering.py', '__pycache__', 'sample_review.py', 'group_summarizer.py', 'cleaning.py']


In [5]:
import pandas as pd
import torch
torch.cuda.empty_cache()
from preprocess.group_summarizer import group_reviews
from utils.batching import batch_iter

from summarizer.pegasus_summarizer import PegasusSummarizer
# from summarizer.textrank_summarizer import TextRankSummarizer
# from summarizer.lstm_summarizer import LSTMSummarizer


In [6]:
INPUT_CSV = "/content/summarizer.csv"
df = pd.read_csv(INPUT_CSV)
required_cols = {"app_id", "app_name", "review_text"}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"Missing columns: {missing}")


In [23]:
grouped = group_reviews(df)
grouped = grouped.reset_index(drop=True)

len(grouped), grouped.head()

target_app_id = 20
app_reviews = df[df["app_id"] == target_app_id]["review_text"].dropna()

for i, review in enumerate(app_reviews, start=1):
    print(f"{i}. {review}\n")


1. tryed playing this game when i boot it up im only getting a black screen

2. crappy rip off of TF2 don t play it just joking play the game

3. The best part was the seizures on the server I played on 4 5 10 needs more seizures

4. my friend keeps annying me with it so i dont really like it its more meant as a joke

5. the only thing this game suceeds at for me is giving me a full blown headache from the fps and choppy animation

6. This game don t have on TF2 Shotty graphics broken characters and NO HATS! Go back to TF2 while you still have your faith in Valve

7. Learn about the history of TF2 or get a nostalgia trip I suck at the game so I can t reccomend it

8. I honestly don t think this game is even worth the 5 00 that it is There is such little content to go along with the fact that pretty much nobody plays it anymore There are also some critical balance issues with some aspects of the game that are either horribly underpowred and horribly overpowered One prime example of this

In [8]:
def get_summarizer(model_name: str):
    model_name = model_name.lower()
    if model_name == "pegasus":
        return PegasusSummarizer()
    # if model_name == "textrank":
    #     return TextRankSummarizer()
    # if model_name == "lstm":
    #     return LSTMSummarizer()
    raise ValueError(f"Unknown model: {model_name}")

In [None]:
from tqdm.notebook import tqdm
import math

def summarize_chunk(
    grouped_df,
    summarizer,
    start_idx: int,
    end_idx: int,
    batch_size: int,
    output_path: str,
    desc: str = "Summarizing chunk",
):
    """
    Summarize a slice of `grouped_df` from row [start_idx:end_idx]
    and save a CSV with columns: app_id, app_name, summary_review.

    - grouped_df: the full grouped dataframe (with clean_text)
    - summarizer: e.g. get_summarizer("pegasus")
    - start_idx, end_idx: row range (iloc style)
    - batch_size: small for Pegasus (1–2)
    - output_path: where to save the CSV
    - desc: label for tqdm progress bar
    """
    # take only the chunk we want
    chunk = grouped_df.iloc[start_idx:end_idx].copy()
    if len(chunk) == 0:
        print(f"No rows in range [{start_idx}, {end_idx}). Skipping.")
        return

    print(f"Chunk rows: {start_idx} to {end_idx-1} (total {len(chunk)})")
    
    indices = chunk.index.tolist()
    total_batches = math.ceil(len(indices) / batch_size)
    summaries = []
    batch_num = 0

    for idx_batch in tqdm(
        batch_iter(indices, batch_size),
        total=total_batches,
        desc=desc,
    ):
        batch_num += 1
        print(f"=== Batch {batch_num}/{total_batches} ===")

        texts = chunk.iloc[idx_batch]["clean_text"].tolist()
        batch_summaries = summarizer.summarize_batch(texts)
        summaries.extend(batch_summaries)

        # free VRAM between batches (important for Pegasus)
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # build result df for this chunk
    result_df = chunk[["app_id", "app_name"]].copy()
    result_df["summary_review"] = summaries

    result_df.to_csv(output_path, index=False)
    print(f"Saved chunk to: {output_path}")
    return result_df


In [13]:
import os
# e.g. 1 or 2 for Pegasus (to avoid OOM)
BATCH_SIZE_MODEL = 8

# Make sure `grouped` already exists (from your previous cells)
print("Total grouped games:", len(grouped))

os.makedirs("/content/Steam-Review-NLP-Pipeline/data/processed", exist_ok=True)

pegasus_summarizer = get_summarizer("pegasus")


Total grouped games: 8067


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
chunk_start = 0
chunk_end = 200   # up to but not including

output_path_1 = "/content/Steam-Review-NLP-Pipeline/data/processed/summaries_pegasus_1.csv"

df_chunk_1 = summarize_chunk(
    grouped_df=grouped,
    summarizer=pegasus_summarizer,
    start_idx=chunk_start,
    end_idx=chunk_end,
    batch_size=BATCH_SIZE_MODEL,
    output_path=output_path_1,
    desc="Summarizing chunk 1",
)


Chunk rows: 0 to 199 (total 200)


Summarizing chunk 1:   0%|          | 0/25 [00:00<?, ?it/s]

=== Batch 1/25 ===
[Pegasus] Running generation on 8 items...
=== Batch 2/25 ===
[Pegasus] Running generation on 8 items...
=== Batch 3/25 ===
[Pegasus] Running generation on 8 items...
=== Batch 4/25 ===
[Pegasus] Running generation on 8 items...
=== Batch 5/25 ===
[Pegasus] Running generation on 8 items...
=== Batch 6/25 ===
[Pegasus] Running generation on 8 items...
=== Batch 7/25 ===
[Pegasus] Running generation on 8 items...
=== Batch 8/25 ===
[Pegasus] Running generation on 8 items...
=== Batch 9/25 ===
[Pegasus] Running generation on 8 items...
=== Batch 10/25 ===
[Pegasus] Running generation on 8 items...
=== Batch 11/25 ===
[Pegasus] Running generation on 8 items...
=== Batch 12/25 ===
[Pegasus] Running generation on 8 items...
=== Batch 13/25 ===
[Pegasus] Running generation on 8 items...
=== Batch 14/25 ===
[Pegasus] Running generation on 8 items...
=== Batch 15/25 ===
[Pegasus] Running generation on 8 items...
=== Batch 16/25 ===
[Pegasus] Running generation on 8 items...
=

False
<module 'posixpath' (frozen)>
