This IPYNB file provides a comprehensive exploration of the impact of sentiment fine-tuning on text summarization, utilizing the kindle_reviews.csv dataset. The process includes loading and splitting the data into training and testing sets, applying Hugging Face's sentiment classifier to categorize each review, and fine-tuning a T5 model based on these sentiment classifications. The notebook then generates summaries for the test dataset, offering insights into how sentiment-based fine-tuning alters the summarization process compared to traditional methods, with parameters adjustable to fit various runtime and analysis requirements.







In [1]:
!pip install transformers datasets rouge-score torch sentencepiece accelerate



In [8]:
import pandas as pd
import csv
from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline
from datasets import load_dataset, load_metric
from torch.utils.data import Dataset, DataLoader

df = pd.read_csv("kindle_reviews.csv", error_bad_lines=False, nrows=1000)
df.head()



  df = pd.read_csv("kindle_reviews.csv", error_bad_lines=False, nrows=1000)


Unnamed: 0.1,Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,B000F83SZQ,"[0, 0]",5,I enjoy vintage books and movies so I enjoyed ...,"05 5, 2014",A1F6404F1VG29J,Avidreader,Nice vintage story,1399248000
1,1,B000F83SZQ,"[2, 2]",4,This book is a reissue of an old one; the auth...,"01 6, 2014",AN0N05A9LIJEQ,critters,Different...,1388966400
2,2,B000F83SZQ,"[2, 2]",4,This was a fairly interesting read. It had ol...,"04 4, 2014",A795DMNCJILA6,dot,Oldie,1396569600
3,3,B000F83SZQ,"[1, 1]",5,I'd never read any of the Amy Brewster mysteri...,"02 19, 2014",A1FV0SX13TWVXQ,"Elaine H. Turley ""Montana Songbird""",I really liked it.,1392768000
4,4,B000F83SZQ,"[0, 1]",4,"If you like period pieces - clothing, lingo, y...","03 19, 2014",A3SPTOKDG7WBLN,Father Dowling Fan,Period Mystery,1395187200


In [9]:
# trying different sentiment analysis pipelines
financial_news_pipeline = pipeline(model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")
general_news_pipeline = pipeline(model="shashanksrinath/News_Sentiment_Analysis")
twitter_pipeline = pipeline(model="cardiffnlp/twitter-roberta-base-sentiment")
sentiment = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [13]:
error_count = 0
num_pos = 0
num_neg = 0
tot = 0
def get_sentiment(text):
    global error_count, num_pos, num_neg, tot
    try:
        result = sentiment(text)
        label = result[0]['label']
        tot += 1
        num_pos += 1 if label == 'POSITIVE' else 0
        num_neg += 1 if label == 'NEGATIVE' else 0
        return label
    except Exception as e:
        # print(f"Error: {str(e)}. Text: {text}.")
        error_count += 1
        return None

df['sentiment'] = df['reviewText'].apply(get_sentiment)
#df['summary_sentiment'] = df['summary'].apply(get_sentiment)

print(df['sentiment'])
print(f'{error_count} rows threw an exception.')
print(f"num positive = {num_pos}\n Num negative = {num_neg}, tot={tot}")
print(f"Percent positive input text = {100 * num_pos / tot}\n Percent negative input text = {100 * num_neg / tot}")
#print(f"Percent positive gold-summary= {100 * num_pos / tot}\n Percent negative gold-summary= {100 * num_neg / tot}")



0      POSITIVE
1      POSITIVE
2      POSITIVE
3      POSITIVE
4      POSITIVE
         ...   
995    POSITIVE
996    POSITIVE
997    POSITIVE
998    NEGATIVE
999    POSITIVE
Name: sentiment, Length: 1000, dtype: object
31 rows threw an exception.
num positive = 753
 Num negative = 216, tot=969
Percent positive input text = 77.70897832817337
 Percent negative input text = 22.291021671826627


In [14]:
df_positive = df[df['sentiment'] == 'POSITIVE'].drop(columns=['sentiment'])
df_negative = df[df['sentiment'] == 'NEGATIVE'].drop(columns=['sentiment'])
df_neutral = df[df['sentiment'] == 'NEUTRAL'].drop(columns=['sentiment'])

# df_summary_positive = df[df['summary_sentiment'] == 'POSITIVE'].drop(columns=['summary_sentiment'])
# df_summary_negative = df[df['summary_sentiment'] == 'NEGATIVE'].drop(columns=['summary_sentiment'])
# df_summary_neutral = df[df['summary_sentiment'] == 'NEUTRAL'].drop(columns=['summary_sentiment'])

print(f'There are {len(df_positive)} positive rows, {len(df_negative)} negative rows, and {len(df_neutral)} neutral rows when measuring sentiment of the input review text.')
#print(f'There are {len(df_summary_positive)} positive rows, {len(df_summary_negative)} negative rows, and {len(df_summary_neutral)} neutral rows when measuring sentiment of the gold standard summaries.')

There are 753 positive rows, 216 negative rows, and 0 neutral rows when measuring sentiment of the input review text.


In [None]:
class KindleReviewDataset(Dataset):
    def __init__(self, tokenizer, data, max_length=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        text = item['reviewText']
        summary = item['summary']
        inputs = self.tokenizer.encode_plus(
            text, max_length=self.max_length, truncation=True, padding='max_length', return_tensors='pt')
        targets = self.tokenizer.encode_plus(
            summary, max_length=self.max_length, truncation=True, padding='max_length', return_tensors='pt')
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': targets['input_ids'].flatten()
        }

In [None]:
r = 0.95  # Fraction of data for training

# Function to split data
def split_data(dataframe, train_frac):
    train_size = int(len(dataframe) * train_frac)
    train_df = dataframe[:train_size]
    test_df = dataframe[train_size:]
    return train_df, test_df

# Splitting the datasets
df_positive = df[df['sentiment'] == 'POSITIVE'].drop(columns=['sentiment'])
df_negative = df[df['sentiment'] == 'NEGATIVE'].drop(columns=['sentiment'])
df_full = df.drop(columns=['sentiment'])  # Full dataset

print(len(df_positive))
print(len(df_negative))
print(len(df_full))

train_df_full, test_df_full = split_data(df_full, r)
train_df_positive, test_df_positive = split_data(df_positive, r)
train_df_negative, test_df_negative = split_data(df_negative, r)


In [None]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Prepare training datasets
train_dataset_full = KindleReviewDataset(tokenizer, train_df_full)
train_dataset_positive = KindleReviewDataset(tokenizer, train_df_positive)
train_dataset_negative = KindleReviewDataset(tokenizer, train_df_negative)

# Prepare testing datasets
# These will be used later for evaluation
test_dataset_full = KindleReviewDataset(tokenizer, test_df_full)
test_dataset_positive = KindleReviewDataset(tokenizer, test_df_positive)
test_dataset_negative = KindleReviewDataset(tokenizer, test_df_negative)

In [None]:
from transformers import Trainer, TrainingArguments
import torch

print(df.columns)
print(len(df))

# important to use google collab torch instead (if we want to pay, >>)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)

# fine tuning a bit more aggressively
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=5e-5,  # Adjusted learning rate
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="no",  # No evaluation dataset
    gradient_accumulation_steps=2,  # Adjust if needed
    seed=42  # Set a random seed
)


In [None]:
def train_and_save_model(train_dataset, model_name):
    model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )
    trainer.train()
    model.save_pretrained(f"./{model_name}")

# Train and save each model
train_and_save_model(train_dataset_full, "model_full")
train_and_save_model(train_dataset_positive, "model_positive")
train_and_save_model(train_dataset_negative, "model_negative")

In [None]:
import json
from transformers import pipeline
from datasets import load_metric

# Function to summarize text
def summarize_text(text, tokenizer, model):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    inputs = inputs.to(model.device)
    outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Function to calculate ROUGE scores
def calculate_rouge(predictions, references):
    rouge = load_metric("rouge")
    return rouge.compute(predictions=predictions, references=references)


In [None]:
# Load trained models
model_full = T5ForConditionalGeneration.from_pretrained('./model_full').to(device)
model_positive = T5ForConditionalGeneration.from_pretrained('./model_positive').to(device)
model_negative = T5ForConditionalGeneration.from_pretrained('./model_negative').to(device)

In [None]:
# # WARNING -> overwriting kindle test data with news test data

# test_data = []
# with open('test.jsonl', 'r') as file:
#     for i, line in enumerate(file):
#         if i < 300:
#             test_data.append(json.loads(line))
#         else:
#             break

# test_df_full = pd.DataFrame(test_data)
# test_df_full['sentiment'] = test_df_full['text'].apply(get_sentiment)
# test_df_full=test_df_full.rename(columns = {'text':'reviewText'})

# # Prepare testing datasets
# test_df_positive = test_df_full[test_df_full['sentiment'] == 'POSITIVE'].drop(columns=['sentiment'])
# test_df_negative = test_df_full[test_df_full['sentiment'] == 'NEGATIVE'].drop(columns=['sentiment'])

# # These will be used later for evaluation
# test_dataset_full = KindleReviewDataset(tokenizer, test_df_full)
# test_dataset_positive = KindleReviewDataset(tokenizer, test_df_positive)
# test_dataset_negative = KindleReviewDataset(tokenizer, test_df_negative)


In [None]:
test_df_full.head()

In [None]:
print(f'There are {len(test_df_positive)} positive rows, {len(test_df_negative)} negative rows, and {len(test_df_full)} total rows')

In [None]:
# Function to summarize a dataset using a model
def summarize_dataset(dataset, tokenizer, model): # CHANGES DEPENDING ON WHICH TEST SET
    return [summarize_text(text, tokenizer, model) for text in dataset.data['reviewText']]


# Summarize test data
summaries_full = {
    'full': summarize_dataset(test_dataset_full, tokenizer, model_full),
    'positive': summarize_dataset(test_dataset_full, tokenizer, model_positive),
    'negative': summarize_dataset(test_dataset_full, tokenizer, model_negative)
}

print('done with full summaries')

summaries_positive = {
    'full': summarize_dataset(test_dataset_positive, tokenizer, model_full),
    'positive': summarize_dataset(test_dataset_positive, tokenizer, model_positive),
    'negative': summarize_dataset(test_dataset_positive, tokenizer, model_negative)
}

print('done with positive summaries')

summaries_negative = {
    'full': summarize_dataset(test_dataset_negative, tokenizer, model_full),
    'positive': summarize_dataset(test_dataset_negative, tokenizer, model_positive),
    'negative': summarize_dataset(test_dataset_negative, tokenizer, model_negative)
}

print('done with negative summaries')


In [None]:
# Function to analyze sentiment distribution
def analyze_sentiment_distribution(summaries):
    distribution = {}
    for model_type in summaries:
        positive_count = sum(1 for summary in summaries[model_type] if sentiment(summary)[0]['label'] == 'POSITIVE')
        total_count = len(summaries[model_type])
        distribution[model_type] = {
            'positive_percentage': (positive_count / total_count) * 100,
            'negative_percentage': 100 - (positive_count / total_count) * 100
        }
    return distribution

# Analyze sentiment distribution
distribution_full = analyze_sentiment_distribution(summaries_full)
distribution_positive = analyze_sentiment_distribution(summaries_positive)
distribution_negative = analyze_sentiment_distribution(summaries_negative)


In [None]:
# Mapping number of words -> change in sentiment

# adjusts so that 0 - 1.0 is negative; 1.0 - 2.0 is positive
def adjust_score(label, score):
  if label == 'POSITIVE':
    return score + 1.0
  else:
    return score

# coordinates for overall plot (adding 3 diff datapoints for positive/negative/full)
x = []
y = []

# coordinates for average plot (averaging the 3 datapoints)
avg_x = []
avg_y = []

def word_to_score(data, summaries):
  global x, y, avg_x, avg_y
  for i, row in enumerate(data.iterrows()):
        original_review = row[1]['reviewText']
        num_words = len(original_review.split(' '))

        original_sentiment = sentiment(row[1]['summary'])[0]
        original_label, original_score = original_sentiment['label'], original_sentiment['score']

        count = 0
        for model_type in summaries: # positive, negative, and full
          generated_summary = summaries[model_type][i]
          generated_sentiment = sentiment(generated_summary)[0]
          generated_label, generated_score = generated_sentiment['label'], generated_sentiment['score']

          x.append(num_words)
          y.append(adjust_score(original_label, original_score)/adjust_score(generated_label, generated_score))

          count += adjust_score(generated_label, generated_score)

        avg_x.append(num_words)
        avg_y.append(count / 3)

word_to_score(test_df_positive, summaries_positive)
word_to_score(test_df_negative, summaries_negative)
word_to_score(test_df_full, summaries_full)


In [None]:
import matplotlib.pyplot as plt

# plotting average

plt.scatter(avg_x, avg_y)

plt.xlabel('Number of Words in Text')
plt.ylabel('Average Change in Sentiment')
plt.title('Averaged Plot')

plt.xlim((0,1500)) # did this to cut out outliers on x axis


plt.show()

In [None]:
# plotting normal

plt.scatter(x, y)

plt.xlabel('Number of Words in Text')
plt.ylabel('Change in Sentiment')
plt.title('Plot')

plt.xlim((0,1500)) # did this to cut out outliers on x axis

plt.show()

In [None]:
print(summaries_full)

In [None]:
test_df_positive.head()

In [None]:
output_file = "positive_data_output.txt"
# train_df_full, test_df_full = split_data(df_full, r)
# train_df_positive, test_df_positive = split_data(df_positive, r)
# train_df_negative, test_df_negative = split_data(df_negative, r)
# Open the file in write mode
with open(output_file, "w") as file:
    file.write(f"POSITIVE DATA:\n")
    file.write("\n------------------------------------------------------\n\n")

    for i, row in enumerate(test_df_positive.iterrows()):
        original_review = row[1]['reviewText']
        original_summary = row[1]['summary']

        positive_summary = summaries_positive['positive'][i]
        negative_summary = summaries_positive['negative'][i]
        full_summary = summaries_positive['full'][i]

        # Write all information to file
        file.write(f"Review: \n{original_review}\n\n")
        file.write(f"Original Summary: \n{original_summary}\n\n")
        file.write(f"Positive Model Summary: \n{positive_summary}\n\n")
        file.write(f"Negative Model Summary: \n{negative_summary}\n\n")
        file.write(f"Full Model Summary: \n{full_summary}\n\n")
        file.write("\n------------------------------------------------------\n\n")

output_file = "negative_data_output.txt"
with open(output_file, "w") as file:
    file.write(f"NEGATIVE DATA:\n")
    file.write("\n------------------------------------------------------\n\n")

    for i, row in enumerate(test_df_negative.iterrows()):
        original_review = row[1]['reviewText']
        original_summary = row[1]['summary']

        positive_summary = summaries_negative['positive'][i]
        negative_summary = summaries_negative['negative'][i]
        full_summary = summaries_negative['full'][i]

        # Write all information to file
        file.write(f"Review: \n{original_review}\n\n")
        file.write(f"Original Summary: \n{original_summary}\n\n")
        file.write(f"Positive Model Summary: \n{positive_summary}\n\n")
        file.write(f"Negative Model Summary: \n{negative_summary}\n\n")
        file.write(f"Full Model Summary: \n{full_summary}\n\n")
        file.write("\n------------------------------------------------------\n\n")

output_file = "full_data_output.txt"
with open(output_file, "w") as file:
    file.write(f"FULL DATA:\n")
    file.write("\n------------------------------------------------------\n\n")

    for i, row in enumerate(test_df_full.iterrows()):
        original_review = row[1]['reviewText']
        original_summary = row[1]['summary']

        positive_summary = summaries_full['positive'][i]
        negative_summary = summaries_full['negative'][i]
        full_summary = summaries_full['full'][i]

        # Write all information to file
        file.write(f"Review: \n{original_review}\n\n")
        file.write(f"Original Summary: \n{original_summary}\n\n")
        file.write(f"Positive Model Summary: \n{positive_summary}\n\n")
        file.write(f"Negative Model Summary: \n{negative_summary}\n\n")
        file.write(f"Full Model Summary: \n{full_summary}\n\n")
        file.write("\n------------------------------------------------------\n\n")



In [None]:
# Display results
def display_results(distribution, test_type):
    print(f"Results for {test_type} Test Data:")
    for model_type, dist in distribution.items():
        print(f"  Summaries by {model_type} Model - Positive: {dist['positive_percentage']:.2f}%, Negative: {dist['negative_percentage']:.2f}%")
    print()

display_results(distribution_full, "Full")
display_results(distribution_positive, "Positive")
display_results(distribution_negative, "Negative")
