In [None]:
# MT_Bench dataset
import numpy as np
import transformers
import accelerate
#import vllm
import bitsandbytes
#from vllm import LLM, SamplingParams
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib
from collections import Counter
import subprocess
import json


from sentence_transformers import SentenceTransformer, util

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import time
import threading
import torch
import pynvml
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.profiler import profile, ProfilerActivity
from datasets import load_dataset
import numpy as np
import subprocess
import gc


# Initialize NVML for power measurement
def initialize_nvml() -> None:
    """Initialise the NVML library by calling ``pynvml.nvmlInit``."""
    pynvml.nvmlInit()
    return None

def shutdown_nvml() -> None:
    """Release the NVML library by calling ``pynvml.nvmlShutdown``."""
    pynvml.nvmlShutdown()
    return None

def get_gpu_handle(gpu_index=0):
    """Return the NVML device handle for the GPU at ``gpu_index``.

    Args:
        gpu_index: Index handed to ``pynvml.nvmlDeviceGetHandleByIndex``
            (defaults to 0).

    Returns:
        Whatever handle object ``pynvml.nvmlDeviceGetHandleByIndex`` returns.
    """
    device_handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)
    return device_handle

def start_power_monitoring(handle, interval_sec=0.1):
    """Sample GPU power draw on a background thread until stopped.

    Args:
        handle: NVML device handle passed to
            ``pynvml.nvmlDeviceGetPowerUsage``.
        interval_sec: Pause between two consecutive samples, in seconds.

    Returns:
        A ``(power_readings, stop)`` tuple. ``power_readings`` is a list the
        sampler thread appends ``(timestamp, watts)`` tuples to, where
        ``timestamp`` comes from ``time.time()``. ``stop`` is a zero-argument
        callable that ends sampling and joins the thread; read
        ``power_readings`` only after calling it.
    """
    power_readings = []
    stop_requested = threading.Event()

    def monitor():
        while not stop_requested.is_set():
            power = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0  # Convert from mW to W
            power_readings.append((time.time(), power))
            # Event.wait doubles as an interruptible sleep, so stop() does
            # not have to sit out the remainder of the interval.
            stop_requested.wait(interval_sec)

    # Daemon thread: a caller that never reaches stop() (e.g. because the
    # measured code raised) must not keep the interpreter from exiting.
    thread = threading.Thread(target=monitor, daemon=True)
    thread.start()

    def stop():
        stop_requested.set()
        thread.join()

    return power_readings, stop



def calculate_perplexity1(model, inputs):
    """Return the perplexity ``exp(loss)`` of ``model`` on ``inputs``.

    Args:
        model: Callable (e.g. a causal language model) that accepts
            ``input_ids``/``labels`` keyword arguments and returns an object
            with a ``loss`` attribute.
        inputs: Either a tensor of token ids, or a tokenizer-style mapping
            containing at least ``"input_ids"`` (any other entries, such as
            ``"attention_mask"``, are forwarded to the model).

    Returns:
        The perplexity as a Python float.
    """
    # The model needs the token ids as *input* as well as labels; passing
    # only ``labels`` leaves it with nothing to run the forward pass on.
    if isinstance(inputs, torch.Tensor):
        model_kwargs = {"input_ids": inputs}
    else:
        model_kwargs = dict(inputs)

    with torch.no_grad():
        outputs = model(**model_kwargs, labels=model_kwargs["input_ids"])
        perplexity = torch.exp(outputs.loss)
    return perplexity.item()


def calculate_perplexity(model, inputs, attention_mask=None):
    """Return the perplexity ``exp(loss)`` of ``model`` on a token-id tensor.

    Args:
        model: Callable (e.g. a causal language model) that accepts
            ``input_ids``, ``labels`` and optionally ``attention_mask`` and
            returns an object with a ``loss`` attribute.
        inputs: Tensor of token ids; it is used both as model input and,
            cloned, as the labels.
        attention_mask: Optional tensor with the same shape as ``inputs``
            marking real tokens (non-zero) versus padding (zero). When
            given it is forwarded to the model and padded positions are
            excluded from the loss.

    Returns:
        The perplexity as a Python float.
    """
    input_ids = inputs
    labels = input_ids.clone()

    model_kwargs = {"input_ids": input_ids}
    if attention_mask is not None:
        model_kwargs["attention_mask"] = attention_mask
        # -100 is the label value Hugging Face causal-LM losses ignore, so
        # padding tokens do not contribute to the perplexity.
        labels = labels.masked_fill(attention_mask == 0, -100)

    with torch.no_grad():
        outputs = model(labels=labels, **model_kwargs)
        perplexity = torch.exp(outputs.loss)

    return perplexity.item()



# Measure energy consumed during inference and FLOPs
def measure_energy_during_inference(handle, inference_function, model, inputs, max_new_tokens=1):
    """Run one inference call while recording power, latency, FLOPs and perplexity.

    Args:
        handle: NVML device handle of the GPU whose power draw is sampled.
        inference_function: Callable invoked as
            ``inference_function(inputs['input_ids'], max_new_tokens=...,
            do_sample=False)`` (e.g. ``model.generate``).
        model: Model used to compute the perplexity of the prompt.
        inputs: Mapping that contains at least ``'input_ids'``.
        max_new_tokens: Forwarded to ``inference_function``.

    Returns:
        Tuple ``(energy_consumed, elapsed_time, flops, result,
        power_during_inference, perplexity)``. ``energy_consumed`` is the
        mean sampled power (W) times ``elapsed_time`` (s); ``flops`` is the
        sum of the FLOP counts reported by the PyTorch profiler.
    """
    # Power is sampled every 50 ms on a background thread.
    power_readings, stop_monitoring = start_power_monitoring(handle, interval_sec=0.05)
    start_time = time.time()

    try:
        # Measure FLOPs using the PyTorch profiler while the inference runs.
        with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], with_flops=True, record_shapes=False) as prof:
            with torch.no_grad():
                result = inference_function(inputs['input_ids'], max_new_tokens=max_new_tokens, do_sample=False)
    finally:
        # Always stop the sampler, even if inference raised, so no monitor
        # thread is left running.
        end_time = time.time()
        stop_monitoring()

    elapsed_time = end_time - start_time

    # Keep only the samples taken while the inference was running.
    power_during_inference = [p for t, p in power_readings if start_time <= t <= end_time]
    if not power_during_inference:
        # The sampler takes its first reading just before start_time, so a
        # run shorter than the sampling interval can leave the window empty.
        # Fall back to every sample collected rather than reporting 0 J.
        power_during_inference = [p for _, p in power_readings]

    if power_during_inference:
        avg_power = sum(power_during_inference) / len(power_during_inference)
        energy_consumed = avg_power * elapsed_time
    else:
        energy_consumed = 0

    # Operators for which the profiler has no FLOP formula report None.
    flops = sum(event.flops for event in prof.key_averages() if event.flops is not None)

    perplexity = calculate_perplexity(model, inputs['input_ids'])

    return energy_consumed, elapsed_time, flops, result, power_during_inference, perplexity



In [None]:
import numpy as np
import torch
from datasets import load_dataset
import pandas as pd

# Map generated text to the corresponding option in ARC
def map_generated_text_to_option(generated_text, choices):
    """Resolve a generated option number to the matching choice.

    Each entry of ``choices`` is keyed by its 0-based position rendered as
    a string (``"0"``, ``"1"``, ...).

    Args:
        generated_text: Text produced by the model; it must equal one of
            the position strings exactly to match.
        choices: Iterable of answer choices.

    Returns:
        The choice whose position string equals ``generated_text``, or
        ``None`` when nothing matches.
    """
    choice_by_position = {
        str(position): option for position, option in enumerate(choices)
    }
    return choice_by_position.get(generated_text)

# Load the ARC dataset
def load_arc_data(subset="ARC-Challenge"):
    """Load the test split of an ARC subset into a pandas DataFrame.

    Args:
        subset: Configuration name passed to ``load_dataset`` for the
            ``"ai2_arc"`` dataset.

    Returns:
        DataFrame with the columns ``question`` (question text),
        ``choices`` (list of answer-choice texts per row) and
        ``answer_key`` (the dataset's ``answerKey`` value per row).
    """
    arc_dataset = load_dataset("ai2_arc", subset, split="test")  # Load test split for evaluation

    # The 'choices' column holds one mapping per row, so the 'text' entry
    # has to be taken row by row; indexing the whole column with 'text'
    # fails when the column is materialised as a plain list.
    choice_texts = [row_choices['text'] for row_choices in arc_dataset['choices']]

    df_arc = pd.DataFrame({
        'question': list(arc_dataset['question']),
        'choices': choice_texts,
        'answer_key': list(arc_dataset['answerKey']),
    })
    return df_arc

# Run experiment for ARC dataset
def _arc_answer_index(answer_key):
    """Translate an answer key into a 0-based choice index, or ``None``.

    NOTE(review): assumes letter keys start at 'A' and numeric keys start
    at '1', with both listed in choice order -- confirm against the dataset.
    """
    key = str(answer_key).strip().upper()
    if len(key) == 1 and "A" <= key <= "Z":
        return ord(key) - ord("A")
    if key.isdigit():
        return int(key) - 1
    return None


def run_experiment_for_arc(data, bootstrapping, handle, model, tokenizer, max_new_tokens):
    """Benchmark ``model`` on every row of ``data`` and collect per-run metrics.

    Each row is turned into a prompt listing the choices as ``0) ...``,
    ``1) ...`` and the generation is measured ``bootstrapping`` times via
    ``measure_energy_during_inference``.

    Args:
        data: DataFrame with ``question``, ``choices`` and ``answer_key``
            columns.
        bootstrapping: Number of repeated measurements per row.
        handle: NVML device handle used for power sampling.
        model: Model providing ``eval()`` and ``generate``.
        tokenizer: Tokenizer used to encode prompts and decode outputs.
        max_new_tokens: Generation length forwarded to ``model.generate``.

    Returns:
        Tuple ``(latencies, energy_per_token, energy_per_flops,
        energy_per_task, throughputs, generated_texts, overall_accuracy,
        flopslisttotal, energy_over_time, power_over_time, perplexities)``.
        Every element except ``overall_accuracy`` is a list with one inner
        list per row holding one value per bootstrap run;
        ``overall_accuracy`` is the mean of the per-row accuracies.
    """
    latencies = []
    energy_per_token = []
    energy_per_flops = []
    energy_per_task = []
    throughputs = []
    generated_texts = []
    accuracies = []
    flopslisttotal = []
    energy_over_time = []
    power_over_time = []
    perplexities = []

    model.eval()
    for _, row in data.iterrows():
        choices = list(row['choices'])

        # Construct the prompt
        prompt = f"Question: {row['question']}\n"
        for i, choice in enumerate(choices):
            prompt += f"{i}) {choice}\n"
        prompt += "Answer:"

        # The answer key names an option, while the model's output is mapped
        # to choice text, so resolve the key to the expected choice text to
        # compare like with like.
        answer_index = _arc_answer_index(row['answer_key'])
        if answer_index is not None and 0 <= answer_index < len(choices):
            expected_choice = choices[answer_index]
        else:
            expected_choice = None

        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")  # Ensure input is on the same device
        prompt_length = inputs['input_ids'].size(-1)

        text_latencies = []
        text_energy_per_token = []
        text_energy_per_flops = []
        text_energy_per_task = []
        text_throughput = []
        text_generated = []
        correct_predictions = 0  # To calculate accuracy
        floplist = []
        energy = []
        power_inf = []
        perplexity_prompt = []

        for _ in range(bootstrapping):
            energy_consumed, latency, flops, output, power_during_inference, perplexity = measure_energy_during_inference(
                handle, model.generate, model, inputs, max_new_tokens=max_new_tokens
            )
            perplexity_prompt.append(perplexity)
            power_inf.append(power_during_inference)
            energy.append(energy_consumed)
            text_latencies.append(latency)

            # Tokens produced beyond the prompt.
            output_tokens = output.size(-1) - prompt_length
            energy_token = energy_consumed / output_tokens if output_tokens > 0 else 0
            text_energy_per_token.append(energy_token)

            energy_flop = energy_consumed / flops if flops > 0 else 0
            text_energy_per_flops.append(energy_flop)
            text_energy_per_task.append(energy_consumed)
            throughput = output_tokens / latency if latency > 0 else 0
            text_throughput.append(throughput)

            # Decode only the newly generated tokens
            generated_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
            generated_text = generated_text.strip()
            text_generated.append(generated_text)

            floplist.append(flops)

            # Map the generated text to an option and check correctness
            mapped_answer = map_generated_text_to_option(generated_text, choices)
            if mapped_answer is not None and mapped_answer == expected_choice:
                correct_predictions += 1

        # Append bootstrapping metrics
        perplexities.append(perplexity_prompt)
        power_over_time.append(power_inf)
        energy_over_time.append(energy)
        flopslisttotal.append(floplist)
        accuracy = correct_predictions / bootstrapping
        accuracies.append(accuracy)
        latencies.append(text_latencies)
        energy_per_token.append(text_energy_per_token)
        energy_per_flops.append(text_energy_per_flops)
        energy_per_task.append(text_energy_per_task)
        throughputs.append(text_throughput)
        generated_texts.append(text_generated)

    overall_accuracy = np.mean(accuracies)
    return (latencies, energy_per_token, energy_per_flops, energy_per_task, throughputs,
            generated_texts, overall_accuracy, flopslisttotal, energy_over_time, power_over_time, perplexities)

# Collect metrics for ARC
def collect_metrics_for_arc(data, bootstrapping, model, tokenizer, max_new_tokens):
    """Run the ARC experiment on GPU 0 and return its metrics as a dict.

    NVML must already be initialised by the caller; it is shut down before
    this function returns, whether the experiment succeeds or raises.

    Args:
        data: DataFrame of ARC rows passed to ``run_experiment_for_arc``.
        bootstrapping: Number of repeated measurements per row.
        model: Model to benchmark.
        tokenizer: Tokenizer matching ``model``.
        max_new_tokens: Generation length per measurement.

    Returns:
        Dict with the keys ``latencies``, ``energy_per_token``,
        ``energy_per_flops``, ``energy_per_task``, ``throughput``,
        ``generated_texts``, ``accuracy``, ``flopstotal``,
        ``energy_over_time``, ``power_over_time`` and ``perplexity``, in
        the order ``run_experiment_for_arc`` returns them.
    """
    metric_names = (
        "latencies",
        "energy_per_token",
        "energy_per_flops",
        "energy_per_task",
        "throughput",
        "generated_texts",
        "accuracy",
        "flopstotal",
        "energy_over_time",
        "power_over_time",
        "perplexity",
    )

    try:
        handle = get_gpu_handle(gpu_index=0)

        # Run the experiment
        metrics = run_experiment_for_arc(
            data, bootstrapping, handle, model, tokenizer, max_new_tokens
        )
    finally:
        # Ensure GPU monitoring is shut down even if the experiment fails.
        shutdown_nvml()

    # Store metrics in a dictionary keyed by name.
    arc_metrics = dict(zip(metric_names, metrics))
    return arc_metrics

In [None]:
import os

# Category sets evaluated in this run. The names below are not defined in
# this cell and are expected to come from an earlier one.
categories = [humanities_reverse, economics1, health_1manupulated]

# Bootstrapping iterations
bootstrapping = 1
# max new output tokens
max_new_tokens = 1

# Hugging Face access token, read from the environment so the secret never
# lives in source control. When HF_TOKEN is unset this is None and the
# library falls back to credentials stored by `huggingface-cli login`.
# SECURITY: a token was previously hard-coded here; it must be treated as
# leaked and revoked.
access_token = os.environ.get("HF_TOKEN")

# Models to benchmark, one after another.
model_name = [
    "meta-llama/Llama-3.1-8B",
    "meta-llama/Llama-3.2-1B",
    "meta-llama/Llama-3.2-3B",
    "tiiuae/falcon-7b",
    "tiiuae/falcon-rw-1b",
    "lmsys/vicuna-7b-v1.5",
]

counter = 0
Model_metrics_for_categories = []
for models in model_name:
    # Load the model once and reuse it for every category.
    # `token=` replaces the deprecated `use_auth_token=` keyword.
    model = AutoModelForCausalLM.from_pretrained(models, device_map="auto", token=access_token)
    tokenizer = AutoTokenizer.from_pretrained(models, token=access_token)

    # Use the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    allmetrics = []
    for category in categories:
        data_dict = load_mmlu_data(category)
        # NVML is initialised before every category.
        # NOTE(review): presumably because the metrics collector shuts NVML
        # down when it finishes -- confirm against collect_metrics_for_categories.
        initialize_nvml()

        flop_mmlu_metrics = collect_metrics_for_categories(data_dict, category, bootstrapping, model, tokenizer, max_new_tokens)
        allmetrics.append(flop_mmlu_metrics)

    # NOTE(review): the file name ends in "_ARC" although the loop above
    # calls the MMLU helpers -- confirm the intended suffix.
    with open(f"{models.replace('/','-').replace('.', '_')}_bootstrapping={bootstrapping}_ARC.json", "w") as json_file:
        json.dump(allmetrics, json_file)

    # Free GPU memory before the next model is loaded.
    del model
    del tokenizer
    gc.collect()
    torch.cuda.empty_cache()

    Model_metrics_for_categories.append(allmetrics)