In [6]:
import pandas as pd
import torch
import re

# Load the .txt file
file_path = 'train.ft.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    raw_reviews = [next(file).strip() for _ in range(400)]  
    #reviews = file.readlines()

def clean_review(review):
    # Remove label (e.g., __label__2) and keep only the review text
    review_text = re.sub(r'__label__\d+', '', review).strip()
    review_text = re.sub(r'\s+', ' ', review_text)  # Remove extra spaces
    
    return review_text

# Clean the first 5 reviews
cleaned_reviews = [clean_review(review) for review in raw_reviews]

print(len(cleaned_reviews[0:25]))
cleaned_reviews[270:300]

25


['False advertising: If you search for Super Saver shipping this advertisement comes up, but this product is not shipped by Amazon is does not go out with the Super Saver shipping, and the cost of shipping is nearly 20% the cost of the pants.',
 'Don\'t Trust the Images: Dockers are Dockers, so that\'s not the problem. Just be sure you don\'t trust the images if you choose to buy these via Amazon. "Cafe," for example, should be "Coffee." Despite the fact that the image for "Cafe" shows a stone or very light gray color, "Cafe" is actually dark brown. So the product isn\'t defective, but IS misleading: I didn\'t want brown pants, but that\'s what I got.',
 'So disappointed: I have worn these pants for years; my size is the same as always. I couldn\'t get them on; they were so tight I couldn\'t get the button buttoned around the waist. What is going on? These pants seemed to be cut for a skinny 25 year old, not someone who is a little on the hefty side. Further, the shipping and handling 

In [24]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the T5 tokenizer for preprocessing input data
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Prepare the input data by adding a prefix for summarization
inputs = ["summarize: " + review for review in cleaned_reviews]  # Add 'summarize:' prefix to each review

# Tokenize the input data, handling truncation and padding
# This converts text into input IDs and attention masks suitable for the model
tokenized_inputs = tokenizer(inputs, truncation=True, padding=True, return_tensors='pt')

# Determine the split size for the training and validation sets (80% train, 20% validation)
train_size = int(0.8 * len(tokenized_inputs['input_ids']))

# Split the tokenized inputs into training and validation datasets
# For training inputs, take the first 80% of the data
train_inputs = {k: v[:train_size] for k, v in tokenized_inputs.items()}

# For validation inputs, take the remaining 20% of the data
val_inputs = {k: v[train_size:] for k, v in tokenized_inputs.items()}

# model = T5ForConditionalGeneration.from_pretrained('t5-small')

In [25]:
from transformers import T5ForConditionalGeneration
from torch.optim import AdamW

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)

# Optimizer and learning rate
optimizer = AdamW(model.parameters(), lr=5e-5)

In [26]:
from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(train_inputs['input_ids'], train_inputs['attention_mask'])
val_dataset = TensorDataset(val_inputs['input_ids'], val_inputs['attention_mask'])

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [27]:
def train_model(model, train_loader, val_loader, epochs=3):
    # Set the model to training mode, enabling features like dropout
    model.train()

    for epoch in range(epochs):
        total_loss = 0  # Initialize total loss for the epoch

        # Iterate over batches of training data from the DataLoader
        for batch in train_loader:
            # Move input tensors to the specified device
            input_ids, attention_mask = [x.to(device) for x in batch]

            # Zero the gradients before the backward pass (calculating the gradients of the loss function with respect to the model parameters (weights)
            #after performing a forward pass through the network)
            #if you don’t zero the gradients before the backward pass, the gradients from the previous batch will be added to the gradients of the current batch.
            optimizer.zero_grad()

            #The gradients of the loss with respect to each weight are calculated using backpropagation. These gradients essentially tell us in which direction 
            #(positive or negative) to adjust the weights to reduce the loss.
            #For example, if increasing a certain weight makes the loss larger, the gradient will be positive, and the optimizer will reduce 
            #the value of that weight.

            # Perform a forward pass: compute the model outputs
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)

            # Get the loss value from the model's output
            loss = outputs.loss #cross-entropy loss

            #Model Output: Let's say the model predicts a distribution over words like:

            #"The" (target word): 0.6
            #"dog": 0.3
            #"ran": 0.1 The correct word is "The", so the loss penalizes the fact that the model's confidence in predicting "The" was only 60% (instead of 100%).
            #Cross-Entropy Calculation: For this word, the loss would be:
            #−log(0.6)≈0.51
            #This value is added to the losses from other tokens in the sequence, and the average or sum of all token losses is the final loss for that sequence.
            #total loss is sum of losses

            # Backpropagate the loss to compute gradients
            loss.backward()

            # Update the model parameters using the optimizer
            optimizer.step()

            # Accumulate the total loss for this epoch
            total_loss += loss.item()

        # Calculate the average loss for the epoch
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1} - Loss: {avg_loss}")  # Print the average loss for the epoch

        # Validate the model on the validation dataset after each epoch
        validate_model(model, val_loader)

def validate_model(model, val_loader):
    # Set the model to evaluation mode, disabling dropout and other training-specific features
    model.eval()
    total_val_loss = 0  # Initialize total validation loss

    # Disable gradient calculation for validation
    with torch.no_grad():
        # Iterate over batches of validation data from the DataLoader
        for batch in val_loader:
            # Move input tensors to the specified device (e.g., GPU or CPU)
            input_ids, attention_mask = [x.to(device) for x in batch]

            # Perform a forward pass: compute the model outputs for validation
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)

            # Accumulate the total validation loss
            total_val_loss += outputs.loss.item()

    # Calculate the average validation loss
    avg_val_loss = total_val_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss}")  # Print the average validation loss


In [28]:
train_model(model, train_loader, val_loader, epochs=5)

Epoch 1 - Loss: 7.350502252578735
Validation Loss: 7.960514068603516
Epoch 2 - Loss: 5.589021921157837
Validation Loss: 6.08193826675415
Epoch 3 - Loss: 3.5920020937919617
Validation Loss: 4.239217281341553
Epoch 4 - Loss: 1.9119222164154053
Validation Loss: 2.6748788356781006
Epoch 5 - Loss: 1.046639308333397
Validation Loss: 1.8805192708969116


In [30]:
def generate_summary(review):
    inputs = tokenizer.encode("summarize: " + review, return_tensors='pt').to(device)
    summary_ids = model.generate(inputs, max_length=50, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Example
new_review = "One of the best game music soundtracks - for a game I didn't really play: Despite the fact that I have only played a small portion of the game, the music I heard (plus the connection to Chrono Trigger which was great as well) led me to purchase the soundtrack, and it remains one of my favorite albums. There is an incredible mix of fun, epic, and emotional songs. Those sad and beautiful tracks I especially like, as there's not too many of those kinds of songs in my other video game soundtracks. I must admit that one of the songs (Life-A Distant Promise) has brought tears to my eyes on many occasions.My one complaint about this soundtrack is that they use guitar fretting effects in many of the songs, which I find distracting. But even if those weren't included I would still consider the collection worth it."
print(generate_summary(new_review))


one complaint about this soundtrack is that they use guitar fretting effects in many of the songs, which I find distracting. but even if those weren't included I would still consider the collection worth it.


In [31]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
reference = "I have only played a small portion of the game but the music i heard made it worth purchasing and it remains one of my favorite albums."
summary = generate_summary(new_review)
scores = scorer.score(reference, summary)
print(scores)


{'rouge1': Score(precision=0.25, recall=0.3333333333333333, fmeasure=0.28571428571428575), 'rougeL': Score(precision=0.16666666666666666, recall=0.2222222222222222, fmeasure=0.1904761904761905)}


In [None]:
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

import torch.onnx

dummy_input = torch.randn(1, 512).to(device)
torch.onnx.export(quantized_model, dummy_input, "t5_model.onnx")

from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/summarize', methods=['POST'])
def summarize():
    data = request.get_json()
    review = data['review']
    summary = generate_summary(review)
    return jsonify({'summary': summary})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
