In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# 4.1 implement few shot prompting

In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

# TODO: Prepare the few-shot prompt with examples
few_shot_prompt = """
Here is a classification of the review of some movies and their sentiment which is either positive or negative:

Review: The movie was fantastic, with stunning visuals and a great story.
Sentiment: Positive
Review: The acting was top-notch.
Sentiment: Positive
Review: I found the film boring and overly long. Not worth watching.
Sentiment: Negative
Review: The characters were shallow, and the story was predictable.
Sentiment: Negative
"""

# TODO: Define a function to get GPT-2 prediction for a new sentence
def get_gpt2_prediction(sentence):
    prompt = few_shot_prompt + f"Review: {sentence}\nSentiment:"
    
    # Encode with padding
    inputs = tokenizer(prompt, return_tensors='pt', padding=True)
    
    # Pass attention_mask and pad_token_id to generate()
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=inputs['input_ids'].shape[1] + 10,
        do_sample=False,
        num_beams=5,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id
    )
    
    # Convert generated text to lowercase for more robust matching
    generated_text = tokenizer.decode(outputs[0]).lower()
    #print("\n\n", generated_text, "\n\n")
    generated_text = "review:" + generated_text.split("review:")[-1]
    #print(generated_text)

    
    # Post-process to extract the sentiment prediction
    if "positive" in generated_text:
        print('positive')
        return "Positive"
    elif "negative" in generated_text:
        print('negative')
        return "Negative"
    else:
        return "Unknown"



In [3]:
test_reviews = [
    "I loved the cinematography and the acting was brilliant.",
    "The story was boring and the pacing was way too slow.",
    "A complete masterpiece! I would watch it again.",
    "Terrible movie, very shallow and i cringed all the time. I regret wasting my time on it.",
    'The movie was fantastic, i liked a lot of stuff from it.'
]

# Test each review
for review in test_reviews:
    sentiment = get_gpt2_prediction(review)
    print(f"Review: {review}")
    print(f"Predicted Sentiment: {sentiment}\n")

negative
Review: I loved the cinematography and the acting was brilliant.
Predicted Sentiment: Negative

negative
Review: The story was boring and the pacing was way too slow.
Predicted Sentiment: Negative

positive
Review: A complete masterpiece! I would watch it again.
Predicted Sentiment: Positive

negative
Review: Terrible movie, very shallow and i cringed all the time. I regret wasting my time on it.
Predicted Sentiment: Negative

negative
Review: The movie was fantastic, i liked a lot of stuff from it.
Predicted Sentiment: Negative



# 4.2 generate predictions

In [4]:
# Load IMDb dataset
from datasets import load_dataset
import pandas as pd
dataset = load_dataset("imdb")
test_df = pd.DataFrame(dataset['test'])

In [5]:
len(test_df[test_df['text'].str.len() < 400])

1732

In [6]:
test_df = test_df[test_df['text'].str.len() < 400]

In [7]:
# Keep only 50 of each
sample_label_0 = test_df[test_df['label'] == 0].sample(n=573, random_state=42)
sample_label_1 = test_df[test_df['label'] == 1].sample(n=483, random_state=42)
test_df = pd.concat([sample_label_0, sample_label_1])
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [8]:
test_df['GPT2_predictions'] = test_df['text'].apply(lambda x: get_gpt2_prediction(x))

negative
negative
positive
negative
negative
negative
negative
positive
negative
negative
positive
negative
negative
positive
positive
negative
negative
negative
positive
negative
negative
negative
negative
negative
negative
negative
negative
negative
positive
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
positive
negative
negative
positive
positive
negative
negative
negative
negative
negative
negative
negative
negative
negative
positive
negative
negative
negative
negative
positive
negative
negative
negative
negative
negative
negative
positive
positive
negative
negative
negative
negative
positive
negative
negative
negative
negative
negative
positive
negative
positive
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
positive
positive
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
positive
negative
positive
positive
negative
negative
n

In [9]:
test_df.to_parquet('test_df.parquet')

# 4.3 compare results

In [10]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
fine_tuned_model_name = 'C:\\Users\\wheus\\Downloads\\fine_tuned_bert_imdb\\fine_tuned_bert_imdb'  # Replace with your fine-tuned model path
fine_tuned_model = BertForSequenceClassification.from_pretrained(fine_tuned_model_name, output_attentions=True)

def get_finetuned_predictions(sentence):
    # Example input sentence
    inputs = tokenizer(sentence, return_tensors="pt")

    # Forward pass through the model to get outputs
    outputs = fine_tuned_model(**inputs)
    # Get the logits
    logits = outputs.logits

    # Apply softmax to get probabilities
    probs = torch.nn.functional.softmax(logits, dim=-1)

    # Get the predicted class (0 or 1)
    pred_label = torch.argmax(probs, dim=1).item()
    return pred_label



In [11]:
test_df['finetuned_predictions'] = test_df['text'].apply(lambda x: get_finetuned_predictions(x))
test_df.to_parquet('test_df.parquet')

In [12]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertTokenizer, BertForSequenceClassification
import torch

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
pretrained_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
def get_pretrained_predictions(sentence):
    # Example input sentence
    inputs = tokenizer(sentence, return_tensors="pt")

    # Forward pass through the model to get outputs
    outputs = pretrained_model(**inputs)
    # Get the logits
    logits = outputs.logits

    # Apply softmax to get probabilities
    probs = torch.nn.functional.softmax(logits, dim=-1)

    # Get the predicted class (0 or 1)
    pred_label = torch.argmax(probs, dim=1).item()
    return pred_label

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [13]:
test_df['pretrained_predictions'] = test_df['text'].apply(lambda x: get_pretrained_predictions(x))
test_df.to_parquet('test_df.parquet')

In [16]:
test_df['GPT2_predictions'] = test_df['GPT2_predictions'].replace({'Positive': 1, 'Negative': 0, 'Unknown': 0})

In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Assuming df is your DataFrame
# Ground truth labels
true_labels = test_df['label']

# Predictions from each model
gpt2_preds = test_df['GPT2_predictions']
pretrained_preds = test_df['pretrained_predictions']
finetuned_preds = test_df['finetuned_predictions']

# Function to calculate and print metrics
def calculate_metrics(true_labels, predictions, model_name):
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, average='binary')
    recall = recall_score(true_labels, predictions, average='binary')
    f1 = f1_score(true_labels, predictions, average='binary')
    
    print(f"\nMetrics for {model_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    
    return {
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1
    }

# Calculate metrics for each model
gpt2_metrics = calculate_metrics(true_labels, gpt2_preds, "GPT-2 Few-Shot")
pretrained_metrics = calculate_metrics(true_labels, pretrained_preds, "Pre-trained BERT")
finetuned_metrics = calculate_metrics(true_labels, finetuned_preds, "Fine-tuned BERT")

# Combine metrics into a DataFrame for comparison
import pandas as pd

results_df = pd.DataFrame([gpt2_metrics, pretrained_metrics, finetuned_metrics])
print("\nComparative Results:")
print(results_df)
print(results_df.to_latex(index=False))



Metrics for GPT-2 Few-Shot:
Accuracy: 0.5786
Precision: 0.6105
Recall: 0.2174
F1-Score: 0.3206

Metrics for Pre-trained BERT:
Accuracy: 0.4593
Precision: 0.4581
Recall: 0.9959
F1-Score: 0.6275

Metrics for Fine-tuned BERT:
Accuracy: 0.9366
Precision: 0.9143
Recall: 0.9503
F1-Score: 0.9320

Comparative Results:
              Model  Accuracy  Precision    Recall  F1-Score
0    GPT-2 Few-Shot  0.578598   0.610465  0.217391  0.320611
1  Pre-trained BERT  0.459280   0.458095  0.995859  0.627528
2   Fine-tuned BERT  0.936553   0.914343  0.950311  0.931980
\begin{tabular}{lrrrr}
\toprule
           Model &  Accuracy &  Precision &   Recall &  F1-Score \\
\midrule
  GPT-2 Few-Shot &  0.578598 &   0.610465 & 0.217391 &  0.320611 \\
Pre-trained BERT &  0.459280 &   0.458095 & 0.995859 &  0.627528 \\
 Fine-tuned BERT &  0.936553 &   0.914343 & 0.950311 &  0.931980 \\
\bottomrule
\end{tabular}



  print(results_df.to_latex(index=False))


In [55]:
results_df.to_parquet('results_df.parquet')

# 4.5 attention map for GPT-2

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2', output_attentions=True)
model.eval()

def plot_attention_map(model, tokenizer, input_sentence, plot_name):
    inputs = tokenizer(input_sentence, return_tensors="pt")
    outputs = model(**inputs)
    attentions = outputs.attentions 
    last_layer_attention = attentions[-1].squeeze(0).detach().numpy()

    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze().tolist())

    def plot_attention_one_head(attention, tokens, title):
        clean_tokens = [token.replace("Ġ", "") for token in tokens]
        fig, ax = plt.subplots(figsize=(10, 10))
        im = ax.imshow(attention, cmap="viridis")
        ax.set_xticks(range(len(clean_tokens)))
        ax.set_yticks(range(len(clean_tokens)))
        ax.set_xticklabels(clean_tokens, rotation=90)
        ax.set_yticklabels(clean_tokens)
        plt.colorbar(im, ax=ax)
        plt.title(title)
        plt.show()

    plot_attention_one_head(last_layer_attention[0], tokens, plot_name)

input_sentence = "Great film, would really recommend!"
plot_name = "Attention map: Pretrained GPT-2 model. \nSentence: Great film, would really recommend!"


plot_attention_map(model, tokenizer, input_sentence, plot_name)

input_sentence = "I hated every minute of it"
plot_name = "Attention map: Pretrained GPT-2 model. \nSentence: I hated every minute of it"

plot_attention_map(model, tokenizer, input_sentence, plot_name)

# Get some information from the train dataset

In [None]:
# Load IMDb dataset
from datasets import load_dataset
import pandas as pd
dataset = load_dataset("imdb")
train_df = pd.DataFrame(dataset['train'])
sample_label_0 = train_df[train_df['label'] == 0].sample(n=2, random_state=42)
sample_label_1 = train_df[train_df['label'] == 1].sample(n=2, random_state=42)
train_df = pd.concat([sample_label_0, sample_label_1])
#train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

for index, row in train_df.iterrows():
    text = row['text']
    print(f'Review: {text}')
    sentiment = 'Positive' if row['label'] == 1 else 'Negative'
    print(f'Predicted Sentiment: {sentiment}')