In [18]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm.notebook import tqdm
import torch

In [7]:
# Pick the compute device: the GPU when torch reports CUDA as available,
# the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load the T5 checkpoint stored at `model_path`, move it to the chosen
# device, and load the tokenizer from the same directory.
model_path = "/content/drive/MyDrive/t5_AirlineReviews_summary_model"
model = T5ForConditionalGeneration.from_pretrained(model_path)
model = model.to(device)
tokenizer = T5Tokenizer.from_pretrained(model_path)

def _review_text(review):
    """Return `review` as a string, mapping None / float NaN to ''."""
    if review is None:
        return ""
    # NaN is the only float that is not equal to itself; pandas uses it for
    # missing cells in object columns.
    if isinstance(review, float) and review != review:
        return ""
    return review if isinstance(review, str) else str(review)


def summarize_batch(reviews, max_input_length=512, max_output_length=64):
    """Summarize a batch of reviews with the module-level `model`/`tokenizer`.

    Args:
        reviews: Iterable of review texts. ``None`` and float NaN entries are
            treated as empty text; any other non-string value is converted
            with ``str()`` so that building the prompt cannot raise TypeError.
        max_input_length: ``max_length`` passed to the tokenizer; longer
            prompts are truncated.
        max_output_length: ``max_length`` passed to ``model.generate``.

    Returns:
        A list of decoded summary strings, one per input review, in input
        order. An empty input returns ``[]`` without calling the model.
    """
    # Materialize first so any iterable (list, Series, generator) is accepted
    # and the emptiness check below is unambiguous.
    reviews = list(reviews)
    if not reviews:
        return []

    input_texts = ["summarize: " + _review_text(review) for review in reviews]

    # Tokenize the whole batch at once, padding to the longest prompt.
    inputs = tokenizer(
        input_texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_input_length
    ).to(device)

    # Generate summaries for the batch; gradients are not needed for inference.
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_output_length,
            num_beams=4,
            early_stopping=True
        )

    # Decode the generated token ids back into text.
    summaries = [tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]
    return summaries

In [13]:
# Read the reviews CSV, then keep just the row identifier and the review
# text in `X`; the bare `X` on the last line displays the selection.
data = pd.read_csv(
    '../data/silver_20250322_Airline_Reviews_Corrected_WithSentimentImageLabels.csv'
)
X = data.loc[:, ['RowId', 'Review']]
X

Unnamed: 0,RowId,Review
0,0,Moroni to Moheli. Turned out to be a pretty ...
1,1,Moroni to Anjouan. It is a very small airline...
2,2,Anjouan to Dzaoudzi. A very small airline an...
3,3,Please do a favor yourself and do not fly wi...
4,4,Do not book a flight with this airline! My fr...
...,...,...
24349,24349,Bangkok to Tokyo. I’ve flown many low cost ai...
24350,24350,Avoid at all costs. I booked flights to go f...
24351,24351,Flight was leaving at 23.15 and after an hou...
24352,24352,Zipair is JAL’s budget airline. They don’t ha...


In [19]:
# Summarize every review in X['Review'] in batches, producing exactly one
# entry in `summaries` per row (either a summary or an error placeholder).
#
# The loop is driven by an explicit cursor rather than
# range(0, len(X), batch_size): a range is built once, so shrinking
# batch_size after an out-of-memory error would leave the stride unchanged
# and silently skip the rows between the shortened batch and the next start.
batch_size = 64
summaries = []

all_reviews = X['Review'].tolist()
total_reviews = len(all_reviews)
start = 0

with tqdm(total=total_reviews) as progress:
    while start < total_reviews:
        batch_end = min(start + batch_size, total_reviews)
        batch_reviews = all_reviews[start:batch_end]

        try:
            batch_summaries = summarize_batch(batch_reviews)
        except RuntimeError as e:
            print(f"Error processing batch {start} to {batch_end-1}: {e}")
            if "out of memory" in str(e):
                print("GPU memory exceeded. Reducing batch size or clearing memory...")
                torch.cuda.empty_cache()  # clean GPU memory
                if len(batch_reviews) > 1:
                    # Halve relative to the batch that actually failed and
                    # retry from the same offset, so no review is skipped.
                    batch_size = max(len(batch_reviews) // 2, 1)
                    continue
                # A single review still does not fit: record it and move on.
                batch_summaries = ["Error: OOM"] * len(batch_reviews)
            else:
                batch_summaries = ["Error generating summary"] * len(batch_reviews)

        summaries.extend(batch_summaries)
        progress.update(len(batch_reviews))
        print(f"Processed reviews {start} to {batch_end-1}")
        start = batch_end

# Every row must have received exactly one entry.
assert len(summaries) == total_reviews, (
    f"expected {total_reviews} summaries, got {len(summaries)}"
)

  0%|          | 0/2 [00:00<?, ?it/s]

Processed reviews 0 to 9
Processed reviews 10 to 19


In [None]:

# Add summaries to the dataframe
# Check the lengths first so a mismatch is reported with an actionable
# message instead of pandas' generic length error.
if len(summaries) != len(data):
    raise ValueError(
        f"Expected {len(data)} summaries but got {len(summaries)}; "
        "re-run the summarization cell before saving."
    )
data['Summary'] = summaries
data.to_csv('Gold_20250327_AirlineReviews_Sentiment_Image_Summary.csv', index=False)

# Preview goes last: only a cell's final expression is displayed.
data.head()