In [None]:
import torch
import json
import math
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    logging,
)
from tqdm.auto import tqdm


from transformers import GPT2LMHeadModel, GPT2Tokenizer
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from bert_score import score

logging.set_verbosity_error()

In [None]:
# Checkpoints evaluated by the generation loops below: one Hugging Face Hub id
# followed by two local checkpoint directories.
finetuned_models = [
    'mistralai/Mistral-7B-Instruct-v0.2',
    '/workspace/storage/fatemeh/organized_projects/NLP_hw3/models/Mistral-7B-instruct-sentiment-tuned',
    '/workspace/storage/fatemeh/organized_projects/NLP_hw3/models/Mistral-7B-sentiment-tuned',
]

# Evaluation for Sentiment Analysis

In [2]:
from datasets import load_dataset, Dataset, concatenate_datasets
import pandas as pd

# Sample sizes reused by the sentiment and Alpaca sections of this notebook.
train_len, test_len = 1000, 50

# Load the tweet corpus and keep the first `test_len` rows of its test split.
twitter_dataset = "carblacac/twitter-sentiment-analysis"
dataset = load_dataset(twitter_dataset)
test_rows = dataset['test'][:test_len]

# Build the evaluation frame with the `feeling` label column cast to str.
test_twitter = pd.DataFrame(test_rows).astype({'feeling': str})

# Candidate instruction texts for the binary tweet-sentiment prompt. Both ask
# for the label '1' (positive) or '0' (negative); only 'instruction_2' is used
# to build the `input_2` prompt column in this notebook.
instructions = {
    'instruction_1': "Carefully read the following tweet. Assess the overall sentiment expressed by considering the choice of words and the tone. Label the tweet with '1' if the overall sentiment is positive, reflecting happiness or satisfaction, and '0' if it reflects a negative sentiment such as sadness or anger.",
    'instruction_2': "Analyze the mood conveyed in the tweet below, taking into account the language, emojis (if any), and contextual cues. Label the mood as '1' if the tweet communicates a positive, cheerful, or optimistic tone, and '0' if it conveys negativity, pessimism, or discontent."
}

def format_instruction(row, instruction):
    """Build the generation prompt for a single tweet.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing the tweet
            under the ``'text'`` key.
        instruction: Instruction text placed at the top of the prompt.

    Returns:
        The instruction, the single-quoted tweet and an empty
        ``### Sentiment:`` slot, each separated by a blank line.
    """
    tweet = row['text']
    header = f"{instruction}\n\n"
    tweet_section = f"### Tweet: '{tweet}'\n\n"
    return header + tweet_section + "### Sentiment:"

# Render the full prompt for every tweet using the second instruction variant.
mood_instruction = instructions['instruction_2']
test_twitter['input_2'] = [
    format_instruction(tweet_row, mood_instruction)
    for _, tweet_row in test_twitter.iterrows()
]

test_twitter.head()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Unnamed: 0,text,feeling,input_2
0,@justineville ...yeahhh. ) i'm 39 tweets from ...,1,"Analyze the mood conveyed in the tweet below, ..."
1,@ApplesnFeathers aww. Poor baby! On your only ...,0,"Analyze the mood conveyed in the tweet below, ..."
2,@joeymcintyre With my refunded $225 (Australia...,0,"Analyze the mood conveyed in the tweet below, ..."
3,It's fine. Today sucks just because me those t...,0,"Analyze the mood conveyed in the tweet below, ..."
4,"Im just chilling on psp and stuff, but sitting...",0,"Analyze the mood conveyed in the tweet below, ..."


In [10]:
# Show the fully rendered prompt of the first test tweet to check the template.
print(test_twitter.iloc[0]['input_2'])

Analyze the mood conveyed in the tweet below, taking into account the language, emojis (if any), and contextual cues. Label the mood as '1' if the tweet communicates a positive, cheerful, or optimistic tone, and '0' if it conveys negativity, pessimism, or discontent.

### Tweet: '@justineville ...yeahhh. ) i'm 39 tweets from 1,600!'

### Sentiment:


In [11]:
# from datasets import Dataset
# hf_dataset = Dataset.from_pandas(train_df)

In [12]:
def generate_response(model, tokenizer, test_dataset):
    """Generate a short sentiment label for every prompt in ``test_dataset``.

    Note: later cells of this notebook define other functions with the same
    name; this is the variant for the tweet-sentiment prompts.

    Args:
        model: Causal language model handed to the ``text-generation`` pipeline.
        tokenizer: Tokenizer handed to the same pipeline.
        test_dataset: Dataset whose records carry the prompt under
            ``'input_2'``; it must support ``len()``, ``select(indices)`` and
            iteration.

    Returns:
        A list with one string per example: the text produced after the
        prompt (i.e. after ``### Sentiment:``), with surrounding whitespace
        removed.
    """
    logging.set_verbosity_error()

    # The prompt asks for the label '0' or '1', so two new tokens are enough.
    pipe = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        device=0,
        max_new_tokens=2,
    )

    batch_size = 5
    marker = '### Sentiment:'

    num_examples = len(test_dataset)
    total_batches = (num_examples + batch_size - 1) // batch_size
    generated_output = []

    for i in tqdm(range(0, num_examples, batch_size), total=total_batches, desc="Generating text"):
        batch_indices = range(i, min(i + batch_size, num_examples))
        batch = test_dataset.select(batch_indices)
        prompts = [example['input_2'] for example in batch]

        # Generate text for the batch
        results = pipe(prompts, max_new_tokens=2)

        for prompt, result in zip(prompts, results):
            full_text = result[0]['generated_text']
            # Strip the echoed prompt rather than splitting on the marker plus
            # a trailing space: the old split raised IndexError whenever the
            # model put no space after the colon.
            if full_text.startswith(prompt):
                completion = full_text[len(prompt):]
            else:
                # Fallback when the prompt is not echoed verbatim: keep what
                # follows the last marker (or the whole text if it is absent).
                completion = full_text.rpartition(marker)[2]
            generated_output.append(completion.strip())

    return generated_output


In [14]:
import json
import os

# Create the output folder up front so the first dump cannot fail with
# FileNotFoundError when the notebook runs in a fresh working directory.
# NOTE(review): this path is relative to the current working directory, while
# the cell that reads the predictions back uses absolute paths — confirm both
# point at the same folder.
os.makedirs('results', exist_ok=True)

# The prompts are identical for every model, so convert the frame only once.
test_twitter_hf = Dataset.from_pandas(test_twitter)

for each_model in finetuned_models:

    model = AutoModelForCausalLM.from_pretrained(each_model)
    tokenizer = AutoTokenizer.from_pretrained(each_model)

    print(f'MODEL {each_model} START GENERATING.')

    generated_responses = generate_response(model, tokenizer, test_dataset=test_twitter_hf)

    # The output file is named after the last component of the model id / path.
    with open(f'results/{each_model.split("/")[-1]}.json', 'w') as f:
        json.dump(generated_responses, f, indent=4)

Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.08s/it]


MODEL mistralai/Mistral-7B-Instruct-v0.2 START GENERATING.


Generating text: 100%|██████████| 10/10 [00:07<00:00,  1.26it/s]


In [16]:
# JSON prediction files, one per model, that get merged into `test_twitter`.
results = [
    '/workspace/storage/fatemeh/organized_projects/NLP_hw3/results/Mistral-7B-instruct-sentiment-tuned.json',
    '/workspace/storage/fatemeh/organized_projects/NLP_hw3/results/Mistral-7B-sentiment-tuned.json',
    '/workspace/storage/fatemeh/organized_projects/NLP_hw3/results/Mistral-7B-Instruct-v0.2.json',
]

In [15]:
# Preview the evaluation frame (tweet text, gold label and rendered prompt).
test_twitter.head()

Unnamed: 0,text,feeling,input_2
0,@justineville ...yeahhh. ) i'm 39 tweets from ...,1,"Analyze the mood conveyed in the tweet below, ..."
1,@ApplesnFeathers aww. Poor baby! On your only ...,0,"Analyze the mood conveyed in the tweet below, ..."
2,@joeymcintyre With my refunded $225 (Australia...,0,"Analyze the mood conveyed in the tweet below, ..."
3,It's fine. Today sucks just because me those t...,0,"Analyze the mood conveyed in the tweet below, ..."
4,"Im just chilling on psp and stuff, but sitting...",0,"Analyze the mood conveyed in the tweet below, ..."


In [17]:
def add_results_to_dataframe(filepath, dataframe):
    """Attach the values stored in a JSON file to ``dataframe`` as a new column.

    The column is named after the file: the last ``/``-separated component of
    ``filepath`` with every ``.json`` occurrence removed. ``dataframe`` is
    modified in place and nothing is returned.

    Args:
        filepath: Path of the JSON file holding the values for the new column.
        dataframe: Object supporting item assignment by column name
            (a pandas DataFrame in this notebook).
    """
    file_name = filepath.rsplit('/', 1)[-1]
    target_column = file_name.replace('.json', '')

    with open(filepath, 'r') as handle:
        loaded_values = json.load(handle)

    # Store the loaded values under the derived column name.
    dataframe[target_column] = loaded_values

# Process each result file
# (every call adds one column, named after the file, to test_twitter in place)
for result_path in results:
    add_results_to_dataframe(result_path, test_twitter)

# Preview the frame with one prediction column per model.
test_twitter.head()

Unnamed: 0,text,feeling,input_2,Mistral-7B-instruct-sentiment-tuned,Mistral-7B-sentiment-tuned,Mistral-7B-Instruct-v0.2
0,@justineville ...yeahhh. ) i'm 39 tweets from ...,1,"Analyze the mood conveyed in the tweet below, ...",0,0,1
1,@ApplesnFeathers aww. Poor baby! On your only ...,0,"Analyze the mood conveyed in the tweet below, ...",0,0,0
2,@joeymcintyre With my refunded $225 (Australia...,0,"Analyze the mood conveyed in the tweet below, ...",0,0,0
3,It's fine. Today sucks just because me those t...,0,"Analyze the mood conveyed in the tweet below, ...",0,0,0
4,"Im just chilling on psp and stuff, but sitting...",0,"Analyze the mood conveyed in the tweet below, ...",0,0,1


In [21]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def calculate_metrics(true_values, predictions):
    """Score predictions against the gold labels.

    Both inputs are converted element by element with ``int`` before scoring,
    so string labels such as ``'0'`` / ``'1'`` are accepted; a value that
    ``int`` cannot parse raises ``ValueError``.

    Args:
        true_values: Iterable of gold labels.
        predictions: Iterable of predicted labels.

    Returns:
        Tuple ``(precision, recall, f1, accuracy)`` as computed by the
        corresponding scikit-learn scorers.
    """
    gold_labels = list(map(int, true_values))
    predicted_labels = list(map(int, predictions))

    # Order matters: callers unpack the tuple positionally.
    scorers = (precision_score, recall_score, f1_score, accuracy_score)
    return tuple(scorer(gold_labels, predicted_labels) for scorer in scorers)

# Prediction columns of `test_twitter` to score, one per evaluated model.
result_columns = [
    'Mistral-7B-instruct-sentiment-tuned',
    'Mistral-7B-sentiment-tuned',
    'Mistral-7B-Instruct-v0.2'
]

# calculate_metrics returns its values in exactly this order.
metric_names = ('Precision', 'Recall', 'F1 Score', 'Accuracy')

metrics = {
    column: dict(zip(metric_names, calculate_metrics(test_twitter['feeling'], test_twitter[column])))
    for column in result_columns
}

# The loop variables are deliberately NOT called `model` or `score`: at
# notebook level those names would rebind the loaded model and, worse, the
# `score` function imported from bert_score, which a later cell still calls
# (it would then fail with "'float' object is not callable").
for column_name, column_scores in metrics.items():
    print(f"Metrics for {column_name}:")
    for metric_name, metric_value in column_scores.items():
        print(f"{metric_name}: {metric_value:.4f}")
    print()

Metrics for Mistral-7B-instruct-sentiment-tuned:
Precision: 0.9048
Recall: 0.8261
F1 Score: 0.8636
Accuracy: 0.8800

Metrics for Mistral-7B-sentiment-tuned:
Precision: 0.9048
Recall: 0.8261
F1 Score: 0.8636
Accuracy: 0.8800

Metrics for Mistral-7B-Instruct-v0.2:
Precision: 0.8000
Recall: 0.8696
F1 Score: 0.8333
Accuracy: 0.8400



# Evaluation for Alpaca

In [6]:
# Hub id of the Alpaca instruction dataset; only its 'train' split is read here.
alpace_dataset = "tatsu-lab/alpaca"

dataset_original = load_dataset(alpace_dataset)
# First `train_len` rows of the split.
train_alpaca = dataset_original['train'][:train_len]
# 25 rows starting at index `test_len`.
# NOTE(review): with train_len=1000 and test_len=50 these rows (50..74) lie
# inside the `train_alpaca` range, so the fine-tuned models may have seen them
# during training — confirm whether `train_len:train_len+25` was intended.
test_alpaca = dataset_original['train'][test_len:test_len+25]
# Round-trip through pandas to obtain a `Dataset` object for the evaluation helpers.
test_alpaca_hf = Dataset.from_pandas(pd.DataFrame(test_alpaca))

In [8]:
def generate_response(model, tokenizer, test_dataset):
    r"""Generate a response for every Alpaca example in ``test_dataset``.

    Note: this notebook defines several functions called ``generate_response``;
    this variant handles the Alpaca prompts and replaces the earlier one.

    Args:
        model: Causal language model handed to the ``text-generation`` pipeline.
        tokenizer: Tokenizer handed to the same pipeline.
        test_dataset: Dataset whose records carry the full Alpaca text under
            ``'text'``; it must support ``len()``, ``select(indices)`` and
            iteration.

    Returns:
        A list with one string per example: the part of the generated text
        between the first ``### Response:\n`` and the following
        ``\n\n### Instruction:`` (whitespace-stripped), or ``''`` when the
        generated text contains no ``### Response:\n`` marker.
    """
    logging.set_verbosity_error()

    pipe = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        device=0,
    )

    batch_size = 10

    num_examples = len(test_dataset)
    total_batches = (num_examples + batch_size - 1) // batch_size
    generated_output = []

    for i in tqdm(range(0, num_examples, batch_size), total=total_batches, desc="Generating text"):
        batch_indices = range(i, min(i + batch_size, num_examples))
        batch = test_dataset.select(batch_indices)
        # Keep only what precedes the reference answer.
        # NOTE(review): this also drops the "### Response:" header itself, so
        # the model has to emit that header on its own for the parsing below to
        # find a response — confirm this is intended.
        prompts = [example['text'].split('\n\n### Response:\n')[0] for example in batch]

        # Generate text for the batch
        results = pipe(prompts, max_new_tokens=128)

        # The former per-result `prompts[results.index(result)]` lookup was
        # unused and quadratic in the batch size, so it is gone.
        generated_output.extend(result[0]['generated_text'] for result in results)

    return [output.split("### Response:\n")[1].split("\n\n### Instruction:")[0].strip() if "### Response:\n" in output else '' for output in generated_output]

In [13]:
import math

# GPT-2 model and tokenizer used by calculate_perplexity below.
model_name = "gpt2"
# .eval() puts the module in evaluation mode (no dropout) for scoring.
model_gpt2 = GPT2LMHeadModel.from_pretrained(model_name).eval()
tokenizer_gpt2 = GPT2Tokenizer.from_pretrained(model_name)

def calculate_perplexity(text):
    """Return the perplexity of ``text`` under the notebook's GPT-2 model.

    The value is ``exp(loss)``, where ``loss`` is what ``model_gpt2`` returns
    when the tokenised text is used as both input and labels.

    Args:
        text: String to score.

    Returns:
        The perplexity as a float, or ``0`` when ``text`` is empty or the
        computed value is NaN.
        NOTE(review): 0 is lower than any real perplexity, so these fallback
        cases pull an average down — confirm that is acceptable for callers.
    """
    if len(text) == 0:
        print(f'THIS {text} RETURN ZERO')
        return 0

    # Clip the input to the model's maximum number of positions; feeding a
    # longer sequence would fail inside the model instead of yielding a score.
    tokenize_input = tokenizer_gpt2.encode(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=model_gpt2.config.n_positions,
    )
    with torch.no_grad():
        loss = model_gpt2(tokenize_input, labels=tokenize_input)[0]

    # Compute exp(loss) once and reuse it for the NaN check and the result.
    perplexity = torch.exp(loss).item()
    return 0 if math.isnan(perplexity) else perplexity


def calculate_bleu(reference, candidate):
    """Sentence-level BLEU of ``candidate`` against a single ``reference``.

    Both strings are tokenised on whitespace and the score is computed with
    NLTK's ``sentence_bleu`` using ``SmoothingFunction().method1``.

    Args:
        reference: Ground-truth text.
        candidate: Generated text to score.

    Returns:
        Whatever ``sentence_bleu`` returns for the pair.
    """
    # sentence_bleu expects a list of tokenised references, hence the nesting.
    references = [reference.split()]
    hypothesis = candidate.split()
    return sentence_bleu(
        references,
        hypothesis,
        smoothing_function=SmoothingFunction().method1,
    )

def calculate_rouge_l(reference, candidate):
    """ROUGE-L F-measure of ``candidate`` against ``reference``.

    A new ``RougeScorer`` restricted to ``'rougeL'`` with stemming enabled is
    created on every call.

    Args:
        reference: Ground-truth text.
        candidate: Generated text to score.

    Returns:
        The ``fmeasure`` field of the ``'rougeL'`` result.
    """
    metric = 'rougeL'
    rouge = rouge_scorer.RougeScorer([metric], use_stemmer=True)
    result = rouge.score(reference, candidate)
    return result[metric].fmeasure

def calculate_bert_score(reference, candidate):
    """BERTScore of ``candidate`` against ``reference`` (English model).

    Args:
        reference: Ground-truth text.
        candidate: Generated text to score.

    Returns:
        The mean, as a Python float, of the last tensor returned by
        ``bert_score.score`` (the F1 component according to the bert-score
        documentation) for this single pair.
    """
    # Import under a private alias so the metric does not depend on the
    # notebook-level name `score`: it is a common loop-variable name, and once
    # another cell rebinds it to a number the bare call fails with
    # "'float' object is not callable".
    from bert_score import score as compute_bert_score

    *_, bert_scores = compute_bert_score([candidate], [reference], lang='en', return_hash=False)
    return bert_scores.mean().item()

def evaluate_text_quality(reference, candidate):
    """Compute all four text-quality metrics for one reference/candidate pair.

    Args:
        reference: Ground-truth text.
        candidate: Generated text to score.

    Returns:
        Dict with the keys ``'Perplexity'`` (candidate only), ``'BLEU'``,
        ``'ROUGE-L'`` and ``'BERTScore'`` (candidate versus reference).
    """
    report = {}
    report['Perplexity'] = calculate_perplexity(candidate)
    report['BLEU'] = calculate_bleu(reference, candidate)
    report['ROUGE-L'] = calculate_rouge_l(reference, candidate)
    report['BERTScore'] = calculate_bert_score(reference, candidate)
    return report

# Example usage
# reference_text = "This is a sample reference text."
# generated_text = "This is a sample generated text."
# evaluation_results = evaluate_text_quality(reference_text, generated_text)
# print(evaluation_results)


def calculate_scores(test_dataset, generated_responses):
    """Average the text-quality metrics over a dataset.

    Args:
        test_dataset: Sized iterable of records whose ``'output'`` field holds
            the ground-truth text.
        generated_responses: Sequence of generated texts, indexed in the same
            order as ``test_dataset``.

    Returns:
        Dict with the mean ``'Perplexity'``, ``'BLEU'``, ``'ROUGE-L'`` and
        ``'BERTScore'`` over all examples; every value is ``0.0`` when
        ``test_dataset`` is empty.
    """
    scores = {'Perplexity': 0.0, 'BLEU': 0.0, 'ROUGE-L': 0.0, 'BERTScore': 0.0}

    num_samples = len(test_dataset)
    if num_samples == 0:
        # Nothing to average; returning here avoids a ZeroDivisionError below.
        return scores

    # `enumerate` has no length, so pass the total explicitly for a usable bar.
    for i, test_data in tqdm(enumerate(test_dataset), total=num_samples):
        evaluation_results = evaluate_text_quality(test_data['output'], generated_responses[i])
        for key in scores:
            scores[key] += evaluation_results[key]

    # Average the scores
    for key in scores:
        scores[key] /= num_samples

    return scores

In [None]:
import json
import os

# Create the output folder up front so the first dump cannot fail with
# FileNotFoundError when the notebook runs in a fresh working directory.
os.makedirs('results', exist_ok=True)

for each_model in finetuned_models:

    model = AutoModelForCausalLM.from_pretrained(each_model)
    tokenizer = AutoTokenizer.from_pretrained(each_model)

    # Both output files share a stem named after the last component of the
    # model id / path.
    output_stem = f'results/{each_model.split("/")[-1]}_alpaca'

    print(f'MODEL {each_model} START GENERATING.')
    generated_responses = generate_response(model, tokenizer, test_dataset=test_alpaca_hf)

    with open(f'{output_stem}.json', 'w') as f:
        json.dump(generated_responses, f)

    print(f'MODEL {each_model} START CALCULATING SCORES.')
    scores_model = calculate_scores(test_dataset=test_alpaca_hf, generated_responses=generated_responses)

    with open(f'{output_stem}_score.json', 'w') as f:
        json.dump(scores_model, f)
    print(scores_model)

# Evaluation Out-of-Sample

In [2]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset
import torch
from tqdm import tqdm
import json

# Model paths
# (local checkpoint directories of the two fine-tuned models compared in this section)
model_sentiment_tuned_path = '/workspace/storage/fatemeh/organized_projects/NLP_hw3/models/Mistral-7B-sentiment-tuned'
model_instruct_sentiment_path = '/workspace/storage/fatemeh/organized_projects/NLP_hw3/models/Mistral-7B-instruct-sentiment-tuned'

# Create out-of-sample instructions with input examples
# Each prompt has an "### Instruction:" part, an "### Input:" part and an
# empty "### Response:" slot for the model to complete.
# NOTE: this rebinds `instructions`, which the sentiment section defined as a dict.
instructions = [
    "### Instruction:\nSummarize the following text:\n\n### Input:\nThe history of AI dates back to ancient myths and legends. Modern AI started with the advent of computers.\n\n### Response:",
    "### Instruction:\nTranslate the following sentence into French:\n\n### Input:\nHow are you today?\n\n### Response:",
    "### Instruction:\nExplain the main idea of this article:\n\n### Input:\nA team of researchers discovered a new way to synthesize clean energy.\n\n### Response:",
    "### Instruction:\nWrite a short poem about the seasons based on the following prompt:\n\n### Input:\nThe changing seasons bring joy and wonder.\n\n### Response:",
    "### Instruction:\nRewrite this paragraph in a more formal tone:\n\n### Input:\nI think the project needs some improvements, but it's a great start.\n\n### Response:",
    "### Instruction:\nExplain this technical concept in simple terms:\n\n### Input:\nQuantum computing harnesses quantum mechanics to perform computations.\n\n### Response:",
    "### Instruction:\nProvide a summary of the key points from this document:\n\n### Input:\nThe company's annual report highlighted increased revenue and market growth.\n\n### Response:",
    "### Instruction:\nPropose a new title for the following article:\n\n### Input:\nThe proposal suggests expanding the product line into new markets.\n\n### Response:",
    "### Instruction:\nSolve this easy math problem:\n\n### Input:\nWhat is 12 plus 8?\n\n### Response:",
    "### Instruction:\nIdentify possible corrections for this grammar issue:\n\n### Input:\nI has a apple.\n\n### Response:"
]


# Single-column dataset ('text') holding the prompts above.
test_dataset = Dataset.from_dict({'text': instructions})

def generate_response(model_path, test_dataset):
    """Load the checkpoint at ``model_path`` and generate text for every prompt.

    Note: this redefines ``generate_response`` with a different signature than
    the earlier cells (it takes a path and loads model and tokenizer itself).

    Args:
        model_path: Identifier or directory passed to ``from_pretrained`` for
            both the model and the tokenizer.
        test_dataset: Dataset whose records carry the prompt under ``'text'``;
            it must support ``len()``, ``select(indices)`` and iteration.

    Returns:
        A list with one entry per example: the ``'generated_text'`` field of
        the first candidate the pipeline returned for that prompt, unmodified.
    """
    generator = pipeline(
        task="text-generation",
        model=AutoModelForCausalLM.from_pretrained(model_path),
        tokenizer=AutoTokenizer.from_pretrained(model_path),
        device=0,
    )

    batch_size = 1
    num_examples = len(test_dataset)
    total_batches = (num_examples + batch_size - 1) // batch_size
    generated_output = []

    for start in tqdm(range(0, num_examples, batch_size), total=total_batches, desc="Generating text"):
        stop = min(start + batch_size, num_examples)
        prompts = [example['text'] for example in test_dataset.select(range(start, stop))]

        # Keep the first (only) candidate returned for each prompt.
        generated_output.extend(
            result[0]['generated_text']
            for result in generator(prompts, max_new_tokens=128)
        )

    return generated_output

# Run both fine-tuned checkpoints over the same out-of-sample prompts.
responses_instruct_sentiment = generate_response(model_instruct_sentiment_path, test_dataset)
responses_sentiment_tuned = generate_response(model_sentiment_tuned_path, test_dataset)

# One record per prompt, pairing it with the output of each model.
results_json = [
    {
        "instruction": instruction,
        "response_instruct_sentiment": responses_instruct_sentiment[i],
        "response_sentiment_tuned": responses_sentiment_tuned[i],
    }
    for i, instruction in enumerate(instructions)
]

with open('/workspace/storage/fatemeh/organized_projects/NLP_hw3/results/responses.json', 'w+') as f:
    json.dump(results_json, f, indent=4)

print("Results saved to responses.json")


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.00it/s]
Generating text: 100%|██████████| 10/10 [00:32<00:00,  3.29s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]
Generating text: 100%|██████████| 10/10 [00:39<00:00,  3.91s/it]

Results saved to responses.json



