In [3]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset


class SummaryDataset(Dataset):
    def __init__(self, tokenizer, file_path, block_size=128):
        self.block_size = block_size
        pth = file_path
        self.data = pd.read_csv(pth)
        self.dt1 = self.data['target_text']
        self.dt2 = self.data['input_text']
        self.data['input_text'].fillna(' ', inplace=True)
        self.tokenizer = tokenizer
        self.data['target_text'].fillna(' ', inplace=True)  
        self.data['target_text'] = self.dt1.astype(str)
        self.data['input_text'] = self.dt2.astype(str)

    def __len__(self):
        x = len(self.data)
        list1 = []
        list1.append(x)
        return list1[0]

    def __getitem__(self, idx):
        dt = self.data
        dt_get = dt.iloc[idx]
        entry = dt_get
        summary = entry['target_text']
        summ = summary
        text = entry['input_text']
        # text_encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.block_size, return_tensors='pt')
        # summary_encoding = self.tokenizer(summary, truncation=True, padding='max_length', max_length=self.block_size, return_tensors='pt')
        encodings = self.tokenizer(text, text_target=summary, max_length=self.block_size, truncation=True, padding='max_length',return_tensors='pt')
        enc_inp = encodings['input_ids']
        enc_att = encodings['attention_mask']
        enc_lbl = encodings['labels']
        return {
            'input_ids': enc_inp.squeeze(),
            'attention_mask': enc_att.squeeze(),
            'labels': enc_lbl.squeeze()
        }

def train(train_file_path, model_name, output_dir, overwrite_output_dir, per_device_train_batch_size, num_train_epochs, save_steps):
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    train_dataset = SummaryDataset(tokenizer, train_file_path)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    st = save_steps
    epch = num_train_epochs
    list_n=[st, epch,per_device_train_batch_size, overwrite_output_dir]
    
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=list_n[3],
        per_device_train_batch_size=list_n[2],
        num_train_epochs=list_n[1],
        save_steps=list_n[0],
        learning_rate = 0.0001
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset
    )
    trainer.train()
    trainer.save_model('final_model_aditya_new')

f_path = '/Users/adityaarya/Documents/ir_final/train.csv'
step = 1000
out_dir = './model_output_aditya_new'
epc = 8

train(
    train_file_path=f_path,
    model_name='gpt2',
    output_dir=out_dir,
    overwrite_output_dir=True,
    per_device_train_batch_size=8,
    num_train_epochs=epc,
    save_steps=step
)


 10%|█         | 500/5000 [03:53<35:13,  2.13it/s]

{'loss': 3.2479, 'grad_norm': 6.245762825012207, 'learning_rate': 9e-05, 'epoch': 0.8}


 20%|██        | 1000/5000 [08:00<33:34,  1.99it/s]

{'loss': 1.7892, 'grad_norm': 5.9073309898376465, 'learning_rate': 8e-05, 'epoch': 1.6}


 30%|███       | 1500/5000 [12:09<27:53,  2.09it/s]  

{'loss': 1.1689, 'grad_norm': 6.99597692489624, 'learning_rate': 7e-05, 'epoch': 2.4}


 40%|████      | 2000/5000 [16:08<22:28,  2.22it/s]

{'loss': 0.869, 'grad_norm': 4.955573081970215, 'learning_rate': 6e-05, 'epoch': 3.2}


 50%|█████     | 2500/5000 [19:54<18:45,  2.22it/s]

{'loss': 0.6284, 'grad_norm': 3.5745744705200195, 'learning_rate': 5e-05, 'epoch': 4.0}


 60%|██████    | 3000/5000 [23:42<15:33,  2.14it/s]

{'loss': 0.4568, 'grad_norm': 4.833413124084473, 'learning_rate': 4e-05, 'epoch': 4.8}


 70%|███████   | 3500/5000 [27:34<11:35,  2.16it/s]

{'loss': 0.314, 'grad_norm': 4.83131217956543, 'learning_rate': 3e-05, 'epoch': 5.6}


 80%|████████  | 4000/5000 [31:26<07:43,  2.16it/s]

{'loss': 0.2461, 'grad_norm': 2.3752689361572266, 'learning_rate': 2e-05, 'epoch': 6.4}


 90%|█████████ | 4500/5000 [35:20<03:51,  2.16it/s]

{'loss': 0.1682, 'grad_norm': 2.923539876937866, 'learning_rate': 1e-05, 'epoch': 7.2}


100%|██████████| 5000/5000 [39:08<00:00,  2.12it/s]

{'loss': 0.1267, 'grad_norm': 1.4117860794067383, 'learning_rate': 0.0, 'epoch': 8.0}


100%|██████████| 5000/5000 [39:10<00:00,  2.13it/s]


{'train_runtime': 2350.535, 'train_samples_per_second': 17.017, 'train_steps_per_second': 2.127, 'train_loss': 0.9015250480651855, 'epoch': 8.0}


In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model

def load_tokenizer():
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    return tokenizer

def generate_text(sequence, desired_word_count=15):
    model_path = "final_model_aditya_new"
    model = load_model(model_path)
    tokenizer = load_tokenizer()
    
    ids = tokenizer.encode(sequence, return_tensors='pt')
    
    max_model_length = 1024
    generation_length = max_model_length - ids.size(1)
    
    estimated_tokens_needed = desired_word_count * 4
    
    if estimated_tokens_needed > generation_length:
        estimated_tokens_needed = generation_length  
    
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=ids.size(1) + estimated_tokens_needed,  
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    generated_text = tokenizer.decode(final_outputs[0], skip_special_tokens=True)

    return generated_text

query = "great taffi great price wide assort yummi"
generated_text = generate_text(query, 10)
print(generated_text)


great taffi great price wide assort yummi wide assort yummi wide assort yumori wide assort yumori wide assort yumori wide assort yumori wide assort yumori wide assori wide


In [5]:
from tqdm import tqdm 
import builtins
import pandas as pd

d1 = pd.read_csv('/Users/adityaarya/Documents/ir_final/test.csv')

summaries_and_generated = []
list_reviews = []

for index, row in tqdm(d1.iterrows(), total=d1.shape[0], desc="Generating Text"):
    review = row['input_text'].split(': ')[1]
    review = review.split('summary:')[0]
    summary = row['target_text']
    generated_text = generate_text(review, 10)
    
    summaries_and_generated.append([summary, generated_text])
    list_reviews.append(review)

Generating Text: 100%|██████████| 1667/1667 [32:14<00:00,  1.16s/it]


In [6]:
from rouge import Rouge

def calculate_rouge_scores(predictions, references):
    predictions = [str(p) for p in predictions]
    references = [str(r) for r in references]

    rouge = Rouge()
    scores = rouge.get_scores(predictions, references, avg=True)
    return scores

for i in range(len(summaries_and_generated)):
    try:
        predicted_summaries = [str(summaries_and_generated[i][1])]  
        reference_summaries = [str(summaries_and_generated[i][0])]  

        scores = calculate_rouge_scores(predicted_summaries, reference_summaries)
        
        print(f"Pair {i+1}: {scores}")
    except Exception as e:
        print(f"Error processing pair {i+1}: {e}")


Pair 1: {'rouge-1': {'r': 0.2, 'p': 0.06666666666666667, 'f': 0.09999999625000015}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.2, 'p': 0.06666666666666667, 'f': 0.09999999625000015}}
Pair 2: {'rouge-1': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.0, 'p': 0.0, 'f': 0.0}}
Pair 3: {'rouge-1': {'r': 0.6666666666666666, 'p': 0.06896551724137931, 'f': 0.12499999830078126}, 'rouge-2': {'r': 0.5, 'p': 0.023809523809523808, 'f': 0.04545454458677687}, 'rouge-l': {'r': 0.6666666666666666, 'p': 0.06896551724137931, 'f': 0.12499999830078126}}
Pair 4: {'rouge-1': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.0, 'p': 0.0, 'f': 0.0}}
Pair 5: {'rouge-1': {'r': 1.0, 'p': 0.23076923076923078, 'f': 0.374999996953125}, 'rouge-2': {'r': 1.0, 'p': 0.125, 'f': 0.2222222202469136}, 'rouge-l': {'r': 1.0, 'p': 0.23076923076923078, 'f': 0.374999996953125}}
Pair 6: {'rouge-1': {'r': 1.0, 'p': 0.0512

In [7]:
from rouge import Rouge

def calculate_rouge_scores(predictions, references):
    predictions = [str(p) for p in predictions]
    references = [str(r) for r in references]

    rouge = Rouge()
    scores = rouge.get_scores(predictions, references, avg=True)
    return scores

max_score = 0
best_pair_index = -1

for i in range(len(summaries_and_generated)):
    try:
        predicted_summaries = [str(summaries_and_generated[i][1])]  
        reference_summaries = [str(summaries_and_generated[i][0])]  

        scores = calculate_rouge_scores(predicted_summaries, reference_summaries)
        
        current_score = scores['rouge-1']['f']
        if current_score > max_score:
            max_score = current_score
            best_pair_index = i

    except Exception as e:
        print(f"Error processing pair {i+1}: {e}")

if best_pair_index != -1:  
    print(f"Best ROUGE-1 F-score is {max_score} for Pair {best_pair_index + 1}")
else:
    print("No valid scores were processed.")


Best ROUGE-1 F-score is 0.6153846111242604 for Pair 294


In [8]:
import csv
with open('summaries_and_generated_after_hyper_2.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Reference Summary', 'Generated Summary'])
    writer.writerows(summaries_and_generated)

In [9]:
average_score = 0
current_score = 0
for i in range(len(summaries_and_generated)):
    try:
        predicted_summaries = [str(summaries_and_generated[i][1])]  
        reference_summaries = [str(summaries_and_generated[i][0])]  

        scores = calculate_rouge_scores(predicted_summaries, reference_summaries)
        
        current_score += scores['rouge-1']['f']

    except Exception as e:
        print(f"Error processing pair {i+1}: {e}")
average_score = current_score/1667
print("Average Rouge F1: ", average_score)

Average Rouge F1:  0.09353495498087325


In [10]:
query = input("Please enter the Review Text: ")
summary_1 = input("Please enter the summary: ")
generated_text_1 = generate_text(query)
print("Generated Query: ", generated_text_1)

rouge = Rouge()
score = rouge.get_scores(generated_text_1, summary_1, avg=True)
print(score)

Generated Query:  The Fender CD-60S Dreadnought Acoustic Guitar is a great instrument for beginners. It has a solid construction, produces a rich sound, and feels comfortable to play. However, some users have reported issues with the tuning stability. etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc etc
{'rouge-1': {'r': 0.5555555555555556, 'p': 0.14285714285714285, 'f': 0.22727272401859508}, 'rouge-2': {'r': 0.125, 'p': 0.02702702702702703, 'f': 0.04444444152098785}, 'rouge-l': {'r': 0.4444444444444444, 'p': 0.11428571428571428, 'f': 0.1818181785640496}}
