In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

In [None]:
df=pd.read_csv("/content/drive/MyDrive/IR_Assignment4/Reviews.csv")
df.head()

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


In [None]:
import pandas as pd
import re
import spacy

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

file_path = '/content/drive/MyDrive/IR_Assignment4/Reviews.csv'
df = pd.read_csv(file_path)

def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = [token.lemma_ for token in nlp(text) if not token.is_stop]
    return ' '.join(tokens)

df['Summary'] = df['Summary'].apply(preprocess_text)
df['Text'] = df['Text'].apply(preprocess_text)

output_file_path = '/content/drive/MyDrive/IR_Assignment4/preprocess.csv'
df.to_csv(output_file_path, index=False)


In [None]:
df=pd.read_csv('/content/drive/MyDrive/IR_Assignment4/preprocess.csv')
df.head()

In [None]:
import pandas as pd
import spacy

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

file_path = '/content/drive/MyDrive/IR_Assignment4/preprocess.csv'
df = pd.read_csv(file_path)

def tokenize_text(text):
    if pd.isna(text):
        return []
    tokens = [token.lemma_ for token in nlp(text) if not token.is_stop]
    return tokens

df['Summary_tokens'] = df['Summary'].apply(tokenize_text)
df['Text_tokens'] = df['Text'].apply(tokenize_text)

output_file_path = '/content/drive/MyDrive/IR_Assignment4/preprocess_tokens.csv'
df.to_csv(output_file_path, index=False)


In [None]:
df=pd.read_csv('/content/drive/MyDrive/IR_Assignment4/preprocess_tokens.csv')
df.head()

In [28]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from rouge import Rouge
import csv

In [10]:
pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [50]:
data_path = '/kaggle/input/preprocess/preprocess.csv'
review_data = pd.read_csv(data_path).head(10000)
selected_columns = ['Score', 'Text', 'Summary']
training_data, testing_data = train_test_split(review_data[selected_columns], test_size=0.25, random_state=42)


In [51]:
class ReviewSummarizationDataset(Dataset):
    def __init__(self, dataset_frame, text_encoder, max_seq_length):
        self.dataset_frame = dataset_frame
        self.text_encoder = text_encoder
        self.max_seq_length = max_seq_length
        # Set padding token explicitly
        self.text_encoder.pad_token = self.text_encoder.eos_token  # Set padding token to end of sequence token

    def __len__(self):
        return len(self.dataset_frame)

    def __getitem__(self, index):
        review_content = str(self.dataset_frame.iloc[index]['Text'])
        summary_content = str(self.dataset_frame.iloc[index]['Summary'])

        # Combine review content and summary content
        full_text = f"Review Content: {review_content}\nSummary: {summary_content}"

        # Tokenize the full text
        tokenized_inputs = self.text_encoder.encode_plus(
            full_text,
            add_special_tokens=True,
            max_length=self.max_seq_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        token_ids = tokenized_inputs['input_ids'].squeeze(0)
        mask = tokenized_inputs['attention_mask'].squeeze(0)

        # Convert score to tensor
        target_label = torch.tensor(self.dataset_frame.iloc[index]['Score'])

        return {
            'input_ids': token_ids,
            'attention_mask': mask,
            'label': target_label
        }


In [52]:
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')

training_dataset = ReviewSummarizationDataset(training_data, gpt2_tokenizer, max_seq_length=128)
training_dataloader = DataLoader(training_dataset, batch_size=10, shuffle=True)


In [53]:
learning_rate = 1e-5
num_epochs = 3
warmup_steps = int(0.1 * len(training_dataloader) * num_epochs)
optimizer = AdamW(gpt2_model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=len(training_dataloader) * num_epochs)




In [54]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt2_model.to(device)
gpt2_model.train()

for epoch in range(num_epochs):
    for batch in training_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = gpt2_model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        training_loss = outputs.loss
        training_loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {training_loss.item()}")


Epoch 1/3, Loss: 1.9871684312820435
Epoch 2/3, Loss: 2.309363842010498
Epoch 3/3, Loss: 2.6320555210113525


In [55]:
review_data_path = '/kaggle/input/preprocess/preprocess.csv'
output_file = 'rouge_scores.csv' 
num_rows_to_read = 200

In [56]:
def generate_summary(review_text):
    inputs = gpt2_tokenizer.encode_plus(
        review_text,
        return_tensors="pt",
        max_length=1024,
        truncation=True
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}
    fine_tuned_model = GPT2LMHeadModel.from_pretrained('fine_tuned_gpt2_2').to(device)
    summary_ids = fine_tuned_model.generate(inputs['input_ids'], max_length=1024, num_beams=4, early_stopping=True)
    generated_summary = gpt2_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return generated_summary

In [57]:
# Function to calculate ROUGE scores
def calculate_rouge(generated_summary, actual_summary):
    if not generated_summary:  # Check if the generated summary is empty
        return {'rouge-1': {'p': 0, 'r': 0, 'f': 0},  # Assign zero scores
                'rouge-2': {'p': 0, 'r': 0, 'f': 0},
                'rouge-l': {'p': 0, 'r': 0, 'f': 0}}
    rouge = Rouge()
    rouge_scores = rouge.get_scores(generated_summary, actual_summary)
    return rouge_scores[0]

In [None]:
rouge = Rouge()

In [62]:
with open(output_file, mode='w', newline='', encoding='utf-8') as output_csv:
    csv_writer = csv.writer(output_csv)
    csv_writer.writerow(['Text', 'Generated Summary', 'ROUGE-1 Precision', 'ROUGE-1 Recall', 'ROUGE-1 F1',
                         'ROUGE-2 Precision', 'ROUGE-2 Recall', 'ROUGE-2 F1',
                         'ROUGE-L Precision', 'ROUGE-L Recall', 'ROUGE-L F1'])

    # Open CSV file and iterate over rows, starting from start_row and stopping at end_row
    with open(csv_file, mode='r', newline='', encoding='utf-8') as file:
        csv_reader = csv.DictReader(file)
        for idx, row in enumerate(csv_reader):
            if idx + 1 < start_row:
                continue  # Skip rows until start_row is reached
            if idx + 1 > end_row:
                break  # Stop reading after reaching end_row

            review_text = row['Text']
            actual_summary = row['Summary']  # Adjust column name

            # Skip rows with empty actual summary
            if not actual_summary:
                print(f"Skipping row {idx + 1} due to empty Summary.")
                continue

            # Generate summary
            g_summary = generate_summary(review_text)
            split_summary = g_summary.split(review_text)
            generated_summary = split_summary[1].strip()

            # Calculate ROUGE scores
            rouge_scores = calculate_rouge(generated_summary, actual_summary)

            # Write results to output file
            csv_writer.writerow([review_text, generated_summary,
                                 rouge_scores['rouge-1']['p'], rouge_scores['rouge-1']['r'], rouge_scores['rouge-1']['f'],
                                 rouge_scores['rouge-2']['p'], rouge_scores['rouge-2']['r'], rouge_scores['rouge-2']['f'],
                                 rouge_scores['rouge-l']['p'], rouge_scores['rouge-l']['r'], rouge_scores['rouge-l']['f']])

print("ROUGE scores calculated and saved to", output_file)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

Skipping row 132 due to empty Summary.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

ROUGE scores calculated and saved to rouge_scores.csv
