In [50]:
# –£—Å—Ç–∞–Ω–æ–≤–∫–∞ –Ω–µ–æ–±—Ö–æ–¥–∏–º—ã—Ö –±–∏–±–ª–∏–æ—Ç–µ–∫
!pip install  faiss-cpu

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sklearn.feature_extraction.text import TfidfVectorizer
import faiss
import numpy as np

# 1. –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞—Ç–∞—Å–µ—Ç–∞ –∏–∑ —Ñ–∞–π–ª–∞ cat-facts.txt
def load_dataset(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        facts = [line.strip() for line in f if line.strip()]
    return facts

dataset = load_dataset("/content/–∫–æ—à–∫–∏.txt")
print(f"–ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(dataset)} —Ñ–∞–∫—Ç–æ–≤ –æ –∫–æ—à–∫–∞—Ö.")

# 2. –°–æ–∑–¥–∞–Ω–∏–µ –ø–æ–∏—Å–∫–æ–≤–æ–≥–æ –º–µ—Ö–∞–Ω–∏–∑–º–∞ (TF-IDF + FAISS)
class Retriever:
    def __init__(self, documents):
        self.documents = documents
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(documents)

        # –ü—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏–µ TF-IDF –º–∞—Ç—Ä–∏—Ü—ã –≤ —Ñ–æ—Ä–º–∞—Ç, –ø–æ–¥—Ö–æ–¥—è—â–∏–π –¥–ª—è FAISS
        tfidf_array = self.tfidf_matrix.toarray().astype('float32')
        self.index = faiss.IndexFlatL2(tfidf_array.shape[1])  # L2 —Ä–∞—Å—Å—Ç–æ—è–Ω–∏–µ
        self.index.add(tfidf_array)

    def retrieve(self, query, top_k=5):
        query_vector = self.vectorizer.transform([query]).toarray().astype('float32')
        distances, indices = self.index.search(query_vector, top_k)
        return [self.documents[i] for i in indices[0]]

# –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è –ø–æ–∏—Å–∫–æ–≤–æ–≥–æ –º–µ—Ö–∞–Ω–∏–∑–º–∞
retriever = Retriever(dataset)

# 3. –ù–∞—Å—Ç—Ä–æ–π–∫–∞ –≥–µ–Ω–µ—Ä–∞—Ç–∏–≤–Ω–æ–π –º–æ–¥–µ–ª–∏
class Generator:
    def __init__(self, model_name="t5-small"):
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)

    def generate(self, context, question):
        input_text = f"question: {question} context: {context}"
        inputs = self.tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
        outputs = self.model.generate(inputs["input_ids"], max_length=150, num_beams=4, early_stopping=True)
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response

# –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è –≥–µ–Ω–µ—Ä–∞—Ç–∏–≤–Ω–æ–π –º–æ–¥–µ–ª–∏
generator =  T5ForConditionalGeneration.from_pretrained("t5-small")

# 4. –û–±—ä–µ–¥–∏–Ω–µ–Ω–∏–µ –ø–æ–∏—Å–∫–∞ –∏ –≥–µ–Ω–µ—Ä–∞—Ü–∏–∏
def rag_pipeline(question, retriever, generator, top_k=5):
    # –ü–æ–∏—Å–∫ —Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω—ã—Ö –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤
    retrieved_docs = retriever.retrieve(question, top_k=top_k)
    context = " ".join(retrieved_docs)

    # –ì–µ–Ω–µ—Ä–∞—Ü–∏—è –æ—Ç–≤–µ—Ç–∞
    response = generator.generate(context, question)
    return response, retrieved_docs

# 5. –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ —Å–∏—Å—Ç–µ–º—ã
question = "—Å–∫–æ–ª—å–∫–æ –∫–æ—à–∫–∏ –º—É—Ä–ª—ã–∫–∞—é—Ç?"
response, retrieved_docs = rag_pipeline(question, retriever, generator)

print(f"–í–æ–ø—Ä–æ—Å: {question}")
print(f"–†–µ–ª–µ–≤–∞–Ω—Ç–Ω—ã–µ –¥–æ–∫—É–º–µ–Ω—Ç—ã: {retrieved_docs}")
print(f"–û—Ç–≤–µ—Ç: {response}")

–ó–∞–≥—Ä—É–∂–µ–Ω–æ 8 —Ñ–∞–∫—Ç–æ–≤ –æ –∫–æ—à–∫–∞—Ö.


AttributeError: 'str' object has no attribute 'update'

In [29]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m487.4/487.4 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚î

In [34]:
# –£—Å—Ç–∞–Ω–æ–≤–∫–∞ –Ω–µ–æ–±—Ö–æ–¥–∏–º—ã—Ö –±–∏–±–ª–∏–æ—Ç–µ–∫
# pip install transformers torch datasets scikit-learn

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split

# 1. –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞—Ç–∞—Å–µ—Ç–∞ –∏–∑ —Ñ–∞–π–ª–∞ cat-facts.txt
def load_dataset(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        facts = [line.strip() for line in f if line.strip()]
    return facts

dataset = load_dataset("–∫–æ—à–∫–∏.txt")
print(f"–ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(dataset)} —Ñ–∞–∫—Ç–æ–≤ –æ –∫–æ—à–∫–∞—Ö.")

# 2. –°–æ–∑–¥–∞–Ω–∏–µ –ø–∞—Ä –≤–æ–ø—Ä–æ—Å-–æ—Ç–≤–µ—Ç
def create_qa_pairs(facts):
    qa_pairs = []
    for fact in facts:
        # –ü—Ä–æ—Å—Ç–æ–π —Å–ø–æ—Å–æ–± —Å–æ–∑–¥–∞–Ω–∏—è –≤–æ–ø—Ä–æ—Å–æ–≤: –¥–æ–±–∞–≤–ª—è–µ–º –ø—Ä–µ—Ñ–∏–∫—Å –∫ —Ñ–∞–∫—Ç–∞–º
        question = f"–ö–∞–∫–∏–µ –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–µ —Ñ–∞–∫—Ç—ã –æ –∫–æ—à–∫–∞—Ö –≤—ã –∑–Ω–∞–µ—Ç–µ?"
        answer = fact
        qa_pairs.append({"question": question, "answer": answer})
    return qa_pairs

qa_pairs = create_qa_pairs(dataset)

# –†–∞–∑–¥–µ–ª–µ–Ω–∏–µ –¥–∞–Ω–Ω—ã—Ö –Ω–∞ –æ–±—É—á–∞—é—â—É—é –∏ —Ç–µ—Å—Ç–æ–≤—É—é –≤—ã–±–æ—Ä–∫–∏
train_data, test_data = train_test_split(qa_pairs, test_size=0.1, random_state=42)

# 3. –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö –¥–ª—è –º–æ–¥–µ–ª–∏ T5
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def preprocess_data(data):
    inputs = [f"question: {item['question']} context: {item['answer']}" for item in data]
    targets = [item["answer"] for item in data]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
    labels = tokenizer(targets, max_length=150, truncation=True, padding="max_length", return_tensors="pt")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = Dataset.from_dict(preprocess_data(train_data))
test_dataset = Dataset.from_dict(preprocess_data(test_data))

# 4. –ù–∞—Å—Ç—Ä–æ–π–∫–∞ –º–æ–¥–µ–ª–∏ T5
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# 5. –ù–∞—Å—Ç—Ä–æ–π–∫–∞ –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤ –æ–±—É—á–µ–Ω–∏—è
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=40,
    weight_decay=0.01,
    save_steps=10_000,
    save_total_limit=2,
)

# 6. –û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

# 7. –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏
model.save_pretrained("./finetuned_t5")
tokenizer.save_pretrained("./finetuned_t5")

# 8. –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ –æ–±—É—á–µ–Ω–Ω–æ–π –º–æ–¥–µ–ª–∏
def generate_response(question, context, model, tokenizer):
    input_text = f"question: {question} context: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(inputs["input_ids"], max_length=150, num_beams=4, early_stopping=True)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# –ü—Ä–∏–º–µ—Ä –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è –æ–±—É—á–µ–Ω–Ω–æ–π –º–æ–¥–µ–ª–∏
question = "—Å–∫–æ–ª—å–∫–æ –º–æ–∂–µ—Ç —Å–ø–∞—Ç—å –∫–æ—à–∫–∞?"
context = "–ö–æ—à–∫–∏ –º–æ–≥—É—Ç —Å–ø–∞—Ç—å –¥–æ 16 —á–∞—Å–æ–≤ –≤ –¥–µ–Ω—å."
response = generate_response(question, context, model, tokenizer)
print(f"–í–æ–ø—Ä–æ—Å: {question}")
print(f"–ö–æ–Ω—Ç–µ–∫—Å—Ç: {context}")
print(f"–û—Ç–≤–µ—Ç: {response}")

–ó–∞–≥—Ä—É–∂–µ–Ω–æ 8 —Ñ–∞–∫—Ç–æ–≤ –æ –∫–æ—à–∫–∞—Ö.




Epoch,Training Loss,Validation Loss
1,No log,11.233547
2,No log,9.658791
3,No log,8.258592
4,No log,6.959317
5,No log,5.720098
6,No log,4.591546
7,No log,3.564067
8,No log,2.626657
9,No log,1.823582
10,No log,1.184372


KeyboardInterrupt: 

In [37]:
print(qa_pairs)

[{'question': '–ö–∞–∫–∏–µ –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–µ —Ñ–∞–∫—Ç—ã –æ –∫–æ—à–∫–∞—Ö –≤—ã –∑–Ω–∞–µ—Ç–µ?', 'answer': '–°–∫–æ–ª—å–∫–æ –∫–æ—à–∫–∏ –º—É—Ä–ª—ã–∫–∞—é—Ç?'}, {'question': '–ö–∞–∫–∏–µ –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–µ —Ñ–∞–∫—Ç—ã –æ –∫–æ—à–∫–∞—Ö –≤—ã –∑–Ω–∞–µ—Ç–µ?', 'answer': '–∫–æ—à–∫–∏ –º—É—Ä–ª—ã–∫–∞—é—Ç 8 —á–∞—Å–æ–≤'}, {'question': '–ö–∞–∫–∏–µ –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–µ —Ñ–∞–∫—Ç—ã –æ –∫–æ—à–∫–∞—Ö –≤—ã –∑–Ω–∞–µ—Ç–µ?', 'answer': '—Å–∫–æ–ª—å–∫–æ –≥–ª–∞–∑ —É –∫–æ—à–∫–∏?'}, {'question': '–ö–∞–∫–∏–µ –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–µ —Ñ–∞–∫—Ç—ã –æ –∫–æ—à–∫–∞—Ö –≤—ã –∑–Ω–∞–µ—Ç–µ?', 'answer': '—É –∫–æ—à–µ–∫ 2 –≥–ª–∞–∑–∞'}, {'question': '–ö–∞–∫–∏–µ –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–µ —Ñ–∞–∫—Ç—ã –æ –∫–æ—à–∫–∞—Ö –≤—ã –∑–Ω–∞–µ—Ç–µ?', 'answer': '—Å–∫–æ–ª—å–∫–æ —Ö–≤–æ—Å—Ç–æ–≤ —É –∫–æ—à–∫–∏?'}, {'question': '–ö–∞–∫–∏–µ –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–µ —Ñ–∞–∫—Ç—ã –æ –∫–æ—à–∫–∞—Ö –≤—ã –∑–Ω–∞–µ—Ç–µ?', 'answer': '—É –∫–æ—à–∫–∏ 1 —Ö–≤–æ—Å—Ç'}, {'question': '–ö–∞–∫–∏–µ –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–µ —Ñ–∞–∫—Ç—ã –æ –∫–æ—à–∫–∞—Ö –≤—ã –∑–Ω–∞–µ—Ç–µ?', 'answer': '—á–µ–≥–æ –º–Ω–æ–≥–æ

In [35]:
model.save_pretrained("./finetuned_t5")
tokenizer.save_pretrained("./finetuned_t5")

('./finetuned_t5/tokenizer_config.json',
 './finetuned_t5/special_tokens_map.json',
 './finetuned_t5/spiece.model',
 './finetuned_t5/added_tokens.json')

In [46]:
question = "–ö–∞–∫–∏–µ –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–µ —Ñ–∞–∫—Ç—ã –æ –∫–æ—à–∫–∞—Ö –≤—ã –∑–Ω–∞–µ—Ç–µ?"
context = "—É –∫–æ—à–µ–∫ –º–Ω–æ–≥–æ –¥–æ–±—Ä–∞"
response = generate_response(question, context, model, tokenizer)
print(f"–í–æ–ø—Ä–æ—Å: {question}")
print(f"–ö–æ–Ω—Ç–µ–∫—Å—Ç: {context}")
print(f"–û—Ç–≤–µ—Ç: {response}")

–í–æ–ø—Ä–æ—Å: –ö–∞–∫–∏–µ –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–µ —Ñ–∞–∫—Ç—ã –æ –∫–æ—à–∫–∞—Ö –≤—ã –∑–Ω–∞–µ—Ç–µ?
–ö–æ–Ω—Ç–µ–∫—Å—Ç: —É –∫–æ—à–µ–∫ –º–Ω–æ–≥–æ –¥–æ–±—Ä–∞
–û—Ç–≤–µ—Ç: 
