# In [None]:
import time
import torch
import pandas as pd
import psutil
import GPUtil
import pynvml
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

# -------------------------
# Load model + tokenizer
# -------------------------
# Checkpoint identifier shared by the tokenizer, the model and the generation config.
model_name = "deepseek-ai/deepseek-llm-7b-chat"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Weights are loaded in bfloat16 and placed on the CUDA device.
load_kwargs = {"torch_dtype": torch.bfloat16, "device_map": "cuda"}
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

# Build the generation config first, reuse the EOS id as the padding id,
# then attach the finished config to the model.
generation_config = GenerationConfig.from_pretrained(model_name)
generation_config.pad_token_id = generation_config.eos_token_id
model.generation_config = generation_config

# -------------------------
# Init NVML for GPU metrics
# -------------------------
# Index of the GPU whose metrics are sampled during profiling.
gpu_index = 0

# NVML must be initialised before a device handle can be requested.
pynvml.nvmlInit()
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index)

def get_gpu_utilization():
    """Sample the current NVML readings for the module-level ``gpu_handle``.

    Returns:
        A 3-tuple ``(gpu_util, mem_util, mem_used_mb)``: the ``gpu`` and
        ``memory`` fields of the NVML utilization rates (percentages), and
        the ``used`` field of the NVML memory info divided by ``1024**2``
        (reported as MB).
    """
    rates = pynvml.nvmlDeviceGetUtilizationRates(gpu_handle)
    memory_info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
    used_mb = memory_info.used / 1024**2
    return rates.gpu, rates.memory, used_mb

# -------------------------
# Load dataset
# -------------------------
# Read the noisy/denoised question pairs and discard the existing
# 'denoised_question' column; a fresh one is produced by the model later on.
file_path = 'Noisy-Denoised_QuestionPairs[new].csv'
df = pd.read_csv(file_path).drop(columns=['denoised_question'])

# Persist the stripped dataset under the model-specific file name.
output_path = 'Noisy-Denoised_QuestionPairs[deepseek].csv'
df.to_csv(output_path, index=False)

print("Column 'denoised_question' has been dropped and the updated file is saved as 'Noisy-Denoised_QuestionPairs[deepseek].csv'.")


# -------------------------
# Denoising + profiling
# -------------------------
def denoise_question(question):
    """Denoise one question with the chat model and profile the generation call.

    The question is embedded in a fixed instruction prompt, rendered through
    the tokenizer's chat template, and passed to ``model.generate`` with at
    most 100 new tokens. CPU and GPU readings are sampled immediately before
    and after generation.

    Args:
        question: The noisy text to denoise; it is interpolated into the
            prompt via an f-string.

    Returns:
        A dict with the keys:
            ``denoised``        decoded completion (prompt tokens removed), stripped
            ``time_sec``        elapsed seconds for ``model.generate`` (GPU-synchronised)
            ``input_tokens``    number of prompt tokens
            ``output_tokens``   number of generated tokens
            ``total_tokens``    ``input_tokens + output_tokens``
            ``cpu_usage_%``     mean of the before/after ``psutil.cpu_percent`` readings
            ``gpu_util_%``      mean of the before/after GPU utilization readings
            ``gpu_mem_util_%``  mean of the before/after GPU memory utilization readings
            ``gpu_mem_MB``      GPU memory in use after generation
    """
    prompt = f"""You are an expert at denoising text. Your task is to provide the denoised version of a given noisy text or question. Follow these instructions:

1. Return only the denoised version of the text or question.
2. Do not provide explanations or additional words.
3. Do not answer the question or alter its intent.
4. Maintain the question format if the input is a question.
5. Avoid presenting the answer in assertive form.
\n '{question}' """
    messages = [{"role": "user", "content": prompt}]
    input_tensor = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    )

    # Move the prompt to the model's device up front so the timed region
    # covers generation only, not the host-to-device copy.
    input_ids = input_tensor.to(model.device)
    # A single unpadded sequence attends to every position. Passing the mask
    # explicitly is needed because pad_token_id equals eos_token_id, so
    # generate() cannot infer it from the ids.
    attention_mask = torch.ones_like(input_ids)

    # GPU/CPU usage before
    cpu_before = psutil.cpu_percent(interval=None)
    gpu_util_before, gpu_mem_util_before, gpu_mem_used_before = get_gpu_utilization()

    # Drain previously queued GPU work so it is not charged to this call, and
    # use the monotonic high-resolution clock for the interval measurement.
    torch.cuda.synchronize()
    start = time.perf_counter()
    outputs = model.generate(
        input_ids, attention_mask=attention_mask, max_new_tokens=100
    )
    torch.cuda.synchronize()  # wait for all kernels before stopping the clock
    end = time.perf_counter()

    # GPU/CPU usage after
    cpu_after = psutil.cpu_percent(interval=None)
    gpu_util_after, gpu_mem_util_after, gpu_mem_used_after = get_gpu_utilization()

    # Token accounting: generate() returns prompt + completion, so the
    # completion length is the difference in sequence length.
    input_tokens = input_tensor.shape[1]
    output_tokens = outputs.shape[1] - input_tokens
    total_tokens = input_tokens + output_tokens

    # Decode only the newly generated tokens.
    result = tokenizer.decode(outputs[0][input_tokens:], skip_special_tokens=True)

    return {
        "denoised": result.strip(),
        "time_sec": end - start,
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "total_tokens": total_tokens,
        "cpu_usage_%": (cpu_before + cpu_after) / 2,
        "gpu_util_%": (gpu_util_before + gpu_util_after) / 2,
        "gpu_mem_util_%": (gpu_mem_util_before + gpu_mem_util_after) / 2,
        "gpu_mem_MB": gpu_mem_used_after
    }

# -------------------------
# Run profiling
# -------------------------
# Collect one profiling record per noisy question; each record is the
# metrics dict returned by denoise_question plus the original question text.
timings = []
for modified in tqdm(df['modified_question'], desc="Profiling Denoising"):
    metrics = denoise_question(modified)
    timings.append({"question": modified, **metrics})

# Derive the per-run aggregates from the collected records.
denoised_questions = [record["denoised"] for record in timings]
total_time = sum(record["time_sec"] for record in timings)
total_tokens = sum(record["total_tokens"] for record in timings)

# -------------------------
# Summary stats
# -------------------------
n_samples = len(df)

# Every ratio is guarded so that an empty dataset (or a zero elapsed time)
# yields a report full of zeros instead of ZeroDivisionError / IndexError.
avg_time = total_time / n_samples if n_samples else 0.0
avg_tokens = total_tokens / n_samples if n_samples else 0.0
tokens_per_sec = total_tokens / total_time if total_time > 0 else 0.0

# FLOPs estimate: ~2 × (#params) × tokens
n_params = sum(p.numel() for p in model.parameters())
avg_flops = 2 * n_params * avg_tokens

# Cost estimate: assume $1.50/hr for an A100 (adjust if needed)
gpu_hourly_price = 1.50
gpu_hours = total_time / 3600
gpu_cost = gpu_hourly_price * gpu_hours

# Memory figure is the reading taken after the first sample; utilization
# figures are means over all per-sample records.
gpu_mem_mb = timings[0]['gpu_mem_MB'] if timings else 0.0
avg_gpu_util = sum(t['gpu_util_%'] for t in timings) / n_samples if n_samples else 0.0
avg_cpu_util = sum(t['cpu_usage_%'] for t in timings) / n_samples if n_samples else 0.0

print("\n===== Inference Cost Report =====")
print(f"Samples processed      : {n_samples}")
print(f"Total time (s)         : {total_time:.2f}")
print(f"Avg time per sample    : {avg_time:.3f} s")
print(f"Avg tokens per sample  : {avg_tokens:.1f}")
print(f"Tokens/sec             : {tokens_per_sec:.1f}")
print(f"Model parameters       : {n_params/1e9:.2f} B")
print(f"Est. FLOPs/sample      : {avg_flops/1e12:.2f} TFLOPs")
print(f"GPU memory usage (MB)  : {gpu_mem_mb:.1f}")
print(f"GPU utilization (avg%) : {avg_gpu_util:.1f}")
print(f"CPU utilization (avg%) : {avg_cpu_util:.1f}")
print(f"Est. GPU runtime (h)   : {gpu_hours:.3f}")
print(f"Est. GPU cost ($)      : {gpu_cost:.2f}")

# -------------------------
# Save results
# -------------------------
# Per-sample profiling records: one row per question, one column per metric.
detailed_results_path = "profiling_results_detailed.csv"
profiled_df = pd.DataFrame(timings)
profiled_df.to_csv(detailed_results_path, index=False)

# Attach the model's output to the dataset and write the combined file.
profiled_dataset_path = "Noisy-Denoised_QuestionPairs[deepseek]_profiled.csv"
df["denoised_question"] = denoised_questions
df.to_csv(profiled_dataset_path, index=False)
