In [None]:
import os
import json
import numpy as np
import pandas as pd
import re
import string
from collections import Counter
from tqdm import tqdm
from peft import LoraConfig


import torch
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
Gemma2ForCausalLM
)
from trl import SFTTrainer
from datasets import load_dataset, Dataset, DatasetDict
from accelerate import Accelerator
import peft
from langchain.llms import HuggingFacePipeline

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA

In [None]:
# #QLoRA
# lora_config = LoraConfig(
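#     r=6,  # LoRA rank: dimension of the low-rank update matrices (not the number of attention heads)
#     lora_alpha = 8,  # LoRA scaling factor (the update is scaled by lora_alpha / r)
#     lora_dropout = 0.05,  # dropout probability applied to the LoRA layers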
#     target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
#     task_type="CAUSAL_LM",
# )

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16
# )

In [None]:
model_id = "google/gemma-2-2b-it"

# Read the Hugging Face access token from the environment rather than
# hardcoding it (or overwriting an existing value with an empty string).
# An unset/empty variable becomes None, which lets the library fall back to
# credentials stored by `huggingface-cli login`.
hf_token = os.environ.get("HF_TOKEN") or None

# Load the causal LM; dtype and device placement are chosen automatically.
# NOTE(review): trust_remote_code=True permits executing code shipped in the
# model repository — confirm it is actually required for this model.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True,
    token=hf_token,
)

# Load the matching tokenizer with the same credentials as the model.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
    token=hf_token,
)
tokenizer.use_default_system_prompt = False

In [None]:
# Decoding settings for the text-generation pipeline: sampled decoding with
# nucleus filtering and a mild repetition penalty, capped at 512 new tokens.
generation_settings = dict(
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.1,
)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)

# Wrap the Hugging Face pipeline so it can be used as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
# Components for vector embedding and retrieval.
# The splitter cuts each article into overlapping chunks; the embedding model
# turns those chunks into vectors for similarity search.
# NOTE(review): the prompts used later in this notebook are Korean — confirm
# that this embedding model handles Korean text adequately.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

### 데이터 불러오기

In [None]:
import zipfile

# Unpack the downloaded archive into the local `data` directory.
with zipfile.ZipFile('open (1).zip', mode='r') as archive:
    archive.extractall(path='data')


In [None]:
# Load the test set.
file_path = 'data/test.csv'
test_data = pd.read_csv(file_path)

# Shuffle the rows. A fixed seed keeps the row order (and therefore the order
# of generated answers) identical across kernel restarts.
df = test_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Preview the first article; .loc avoids chained indexing.
df.loc[0, 'context']

In [None]:
# Show only the first rows instead of dumping the whole frame into the output.
df.head()

In [None]:
# Answer every question in `df` with retrieval-augmented generation.
# Results are collected as (id, answer) tuples in `generated_answers`.
generated_answers = []
true_answers = []  # kept for the (currently disabled) exact-match evaluation

# Gemma chat-format prompt: one user turn holding the article text and the
# question, followed by the opening of the model turn.
prompt_template = """<start_of_turn>user
다음은 뉴스 기사입니다. 기사를 잘 읽고, 질문에 단답형으로 정확하게 답해주세요.

기사:
{context}

질문:
{question}

답변은 한두 단어로 간결하게 해주세요.
<end_of_turn>
<start_of_turn>model
"""

# Number of chunks retrieved per question.
top_k = 4

for idx, row in tqdm(df.iterrows(), total=len(df), desc="answering"):
    article = row["context"]
    question = row["question"]
    qid = row["id"]

    try:
        # Split the article and index its chunks. This is inside the try so a
        # malformed row is reported and skipped instead of aborting the run.
        docs = text_splitter.create_documents([article])
        vectordb = FAISS.from_documents(docs, embedding_model)

        # Retrieve with the question only. Previously the whole formatted
        # prompt (article included) was used as the search query, so the
        # retriever matched the article against itself rather than the question.
        retrieved = vectordb.similarity_search(question, k=top_k)
        context = "\n\n".join(doc.page_content for doc in retrieved)

        # Send the chat-formatted prompt straight to the LLM so the turn
        # markers are not nested inside another prompt template.
        prompt = prompt_template.format(context=context, question=question)
        generated = llm.invoke(prompt)

        # Some LangChain versions return the prompt followed by the
        # completion; keep only the completion in that case.
        if generated.startswith(prompt):
            generated = generated[len(prompt):]
        generated = generated.strip()
    except Exception as e:
        # Best effort: log the failure and submit an empty answer for this row.
        print(f"Error at index {idx} (ID: {qid}): {e}")
        generated = ""

    generated_answers.append((qid, generated))


# for pred, true in zip(generated_answers, true_answers):
#     if pred.strip() == true.strip():
#         correct += 1

# accuracy = correct / total
# print(f"\n전체 문항 수: {total}")
# print(f"맞춘갯수: {correct}")
# print(f" Exact Match Accuracy: {round(accuracy * 100, 2)}%")

In [None]:
# Assemble the collected (id, answer) pairs into the submission table and
# write it out; "utf-8-sig" prepends a BOM to the CSV.
submission_path = "data/submission_pure_rag.csv"
submission_df = pd.DataFrame(
    generated_answers,
    columns=["id", "answer"],
)
submission_df.to_csv(
    submission_path,
    index=False,
    encoding="utf-8-sig",
)