## Импорт библиотек

In [93]:
# TRUST ME
import warnings
warnings.filterwarnings('ignore')

# core
import torch
import numpy as np

# general utils
import re # regex
import json
from langdetect import detect # for search query recognition
from datasets import Dataset
from concurrent.futures import ThreadPoolExecutor, as_completed # parallel

# Pdf scraping / Rag
from langchain.document_loaders import PDFPlumberLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import BaseLLM
from langchain.schema import LLMResult, Generation
from langchain.chains import RetrievalQA

# URL utils
import requests # api requests
from readability import Document # Url data extraction
from bs4 import BeautifulSoup # URL parsing

# models
from sentence_transformers import SentenceTransformer
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from transformers import (AlbertTokenizer, 
                          AlbertForSequenceClassification, 
                          Trainer, # Only for ALBERT training
                          TrainingArguments, # Only for ALBERT training
                          AutoModelForCausalLM, 
                          AutoTokenizer, 
                          pipeline
                          )


# URL handling

In [2]:
def replace_url(text):
    url_pattern = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)?'
    return re.sub(url_pattern, '[URL]', text)

In [3]:
url_prompt = "Check http://example.com and http://example.com for ideas."
replace_url(url_prompt)

'Check [URL] and [URL] for ideas.'

# Zero-shot with albert

## Data prep

In [4]:
# Define short class names
# Search Query, URL query, User complaints, Human tech help, irrelevant topic
class_names = ["SQ", "UQ", "CP", "SR", "IR"]

In [5]:
train_data = {
    "text": [
        "Как найти хорошую книгу?", "Где купить дешевые билеты?", "Что посмотреть в выходные?",  # SQ
        "Смотри [URL], там новости.", "На [URL] скидки, проверь.", "Это с [URL], что скажешь?",  # UQ
        "Ваша доставка ужасна!", "Почему все так медленно?", "Товар сломан, это возмутительно!",  # CP
        "Помогите настроить роутер.", "Свяжите меня с поддержкой.", "Как позвонить в техподдержку?",  # SR
        "Какая погода завтра?", "Сколько лет Земле?", "Ты любишь кофе?"  # IR
    ],
    "label": [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4]  # Class indices
}

In [6]:
# Create dataset
dataset = Dataset.from_dict(train_data)

## Load

In [7]:
albert_model = AlbertForSequenceClassification.from_pretrained("./fine_tuned_albert")
albert_tokenizer = AlbertTokenizer.from_pretrained("./fine_tuned_albert")

## Model init (in case of train)

In [8]:
# albert_tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
# albert_model = AlbertForSequenceClassification.from_pretrained("albert-base-v2", num_labels=len(class_names))

## Train

In [9]:
# Tokenize dataset
def tokenize_function(examples):
    return albert_tokenizer(examples["text"], padding="max_length", truncation=True)

In [10]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map: 100%|██████████| 15/15 [00:00<00:00, 243.18 examples/s]


Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 15
})

In [11]:
# training_args = TrainingArguments(
#     output_dir="./results",
#     num_train_epochs=,
#     per_device_train_batch_size=4,
#     logging_dir="./logs",
# )

In [12]:
# trainer = Trainer(
#     model=albert_model,
#     args=training_args,
#     train_dataset=tokenized_dataset,
# )

In [13]:
# # Train the model
# trainer.train()

## Save

In [14]:
# albert_model.save_pretrained("./fine_tuned_albert")
# albert_tokenizer.save_pretrained("./fine_tuned_albert")

In [15]:
def classify_query(query):
    inputs = albert_tokenizer(query, return_tensors="pt")
    outputs = albert_model(**inputs)
    predicted_class = torch.argmax(outputs.logits).item()
    return class_names[predicted_class]

In [16]:
classify_query("На [URL] скидки, проверь."), classify_query("Где купить дешевые билеты?")

('UQ', 'SQ')

# YandexGPT prompt -> search query

In [13]:
def generate_search_query(user_request):
    # Create the prompt with the user's request
    prompt = f"Создай поисковой запрос для запроса пользователя: '{user_request}'"
    
    # Define the Ollama API endpoint
    url = "http://localhost:11434/api/generate"
    
    # Prepare the request payload
    data = {
        "model": "yandex/YandexGPT-5-Lite-8B-instruct-GGUF:latest",  # Specify YandexGPT as the model
        "prompt": prompt       # The instruction for YandexGPT
    }
    
    # Send the request to Ollama API with streaming enabled
    response = requests.post(url, json=data, stream=True)
    
    # Collect the generated search query from the streamed response
    search_query = ""
    for line in response.iter_lines():
        if line:
            json_response = json.loads(line)
            if "response" in json_response:
                search_query += json_response["response"]
            if json_response.get("done", False):
                break
    
    # Return the cleaned-up search query
    return search_query.strip()

In [16]:
user_request = "Хочу телефон с крутой камерой"
try:
    search_query = generate_search_query(user_request)
except:
    search_query = '«смартфоны с хорошей камерой»'
search_query

'«смартфоны с хорошей камерой»'

# Search engine

In [None]:
def search_with_google(query, num_results=5):
    """
    Retrieve search results from Google Custom Search JSON API based on a query.

    Args:
        query (str): The search query generated by YandexGPT (e.g., "best camera phones 2023").
        api_key (str): Your Google API key.
        cse_id (str): Your Custom Search Engine ID.
        num_results (int): Number of results to return (default: 5, max: 5 per API call).

    Returns:
        list: A list of URLs from the search results, or an empty list if the request fails.
    """
    # Detect the language of the query for more relevant results
    try:
        lang = detect(query)
    except Exception:
        lang = "en"  # Default to English if detection fails

    # Map detected language to Google's language restriction codes
    lang_map = {"en": "lang_en", "ru": "lang_ru"}  # Add more languages as needed
    lang_code = lang_map.get(lang, "lang_en")  # Default to English

    # API endpoint
    
    url = "https://www.searchapi.io/api/v1/search"
    api_key = "vRZHJro7avmqfxKi4hB9bhry" # searchAPI
    
    # Request parameters
    params = {
        "engine": "google",
        "q": query,          # Search query
        "api_key": api_key,      # API key
        "num": num_results+1,  # Number of results
        "lr": lang_code      # Language restriction
    }

    # Send the GET request
    try:
        response = requests.get(url, params=params)
        search_json = response.json()
        urls = [position['link'] for position in search_json['organic_results']]
        return urls

    except requests.RequestException as e:
        print(f"Search API error: {e}")
        return []  # Return empty list on failure

In [6]:
# urls = search_with_google(search_query, 5)
urls = ['https://skillbox.ru/media/design/smartphone-with-a-good-camera/',
 'https://hi-tech.mail.ru/review/106257-smartfony-s-khoroshej-kameroj/',
 'https://quke.ru/blog/article/top-20-kamerofonov-v-2025-godu?srsltid=AfmBOoqfjCQT2kJ-di5Wa98g_bEjoKmRYLJipWsKdhMp6YgdEnAIH3Yw',
 'https://www.eldorado.ru/c/smartfony/tag/horoshaya-kamera/']
urls

['https://skillbox.ru/media/design/smartphone-with-a-good-camera/',
 'https://hi-tech.mail.ru/review/106257-smartfony-s-khoroshej-kameroj/',
 'https://quke.ru/blog/article/top-20-kamerofonov-v-2025-godu?srsltid=AfmBOoqfjCQT2kJ-di5Wa98g_bEjoKmRYLJipWsKdhMp6YgdEnAIH3Yw',
 'https://www.eldorado.ru/c/smartfony/tag/horoshaya-kamera/']

# URL information extracting

In [3]:
def fetch_and_parse(url, timeout=10, max_chars=10000):
    """
    Fetch a web page, extract its main content, and clean the text.

    Args:
        url (str): The URL of the web page to parse.
        timeout (int): Timeout for the request in seconds (default: 10).
        max_chars (int): Maximum number of characters to keep (default: 10,000).

    Returns:
        dict: A dictionary with the URL and cleaned text, or None if an error occurs.
    """
    try:
        # Set a user agent to avoid being blocked
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Extract main content with readability
        doc = Document(response.text)
        summary = doc.summary()

        # Parse and clean the text
        soup = BeautifulSoup(summary, 'html.parser')
        text = soup.get_text()
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
        text = text[:max_chars]  # Limit text length

        return {"url": url, "text": text}

    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
    except Exception as e:
        print(f"Error parsing {url}: {e}")
        return None

In [4]:
def fetch_and_parse_parallel(urls, max_workers=5, timeout=10, max_chars=10000):
    """
    Parse multiple URLs concurrently.

    Args:
        urls (list): List of URLs to parse.
        max_workers (int): Maximum number of concurrent threads (default: 5).
        timeout (int): Timeout for each request (default: 10).
        max_chars (int): Maximum characters per page (default: 10,000).

    Returns:
        list: List of dictionaries with URL and cleaned text.
    """
    parsed_contents = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(fetch_and_parse, url, timeout, max_chars): url for url in urls}
        for future in as_completed(future_to_url):
            content = future.result()
            if content:
                parsed_contents.append(content)
    return parsed_contents

In [9]:
parsed_contents = fetch_and_parse_parallel(urls, max_workers=5)

Error fetching https://www.eldorado.ru/c/smartfony/tag/horoshaya-kamera/: 503 Server Error: Service Temporarily Unavailable for url: https://www.eldorado.ru/c/smartfony/tag/horoshaya-kamera/


In [10]:
for content in parsed_contents:
        print(f"URL: {content['url']}")
        print(f"Text: {content['text'][:100]}...")  # Show first 100 characters
        print("-" * 50)

URL: https://skillbox.ru/media/design/smartphone-with-a-good-camera/
Text: Изображение: RealmeДисплей: IPS LCD, 6,72".Основная камера: двойная камера, 108 Мп + 2 Мп.Фронтальна...
--------------------------------------------------
URL: https://hi-tech.mail.ru/review/106257-smartfony-s-khoroshej-kameroj/
Text: Дисплей6,8-дюймовый Dynamic LTPO AMOLED 2X (120 Гц, HDR10+, AOD, до 2600 нит) с разрешением 1440x308...
--------------------------------------------------
URL: https://quke.ru/blog/article/top-20-kamerofonov-v-2025-godu?srsltid=AfmBOoqfjCQT2kJ-di5Wa98g_bEjoKmRYLJipWsKdhMp6YgdEnAIH3Yw
Text: При таком большом количестве вариантов смартфонов на рынке достаточно сложно решить, какой из них лу...
--------------------------------------------------


# Rag from PDF

In [94]:
class YandexGPTLLM(BaseLLM):
    def _generate(self, prompts, stop=None):
        # Define the Ollama API endpoint
        url = "http://localhost:11434/api/generate"
        
        # Prepare the request payload
        data = {
            "model": "yandex/YandexGPT-5-Lite-8B-instruct-GGUF:latest",
            "prompt": prompts[0]  # Take the first prompt (adjust if multiple prompts are needed)
        }
        
        # Send the request with streaming enabled
        response = requests.post(url, json=data, stream=True)
        
        # Collect the generated text from the streamed response
        generated_text = ""
        for line in response.iter_lines():
            if line:
                try:
                    json_response = json.loads(line)
                    if "response" in json_response:
                        generated_text += json_response["response"]
                        print(json_response["response"], end='')
                    if json_response.get("done", False):
                        break
                except json.JSONDecodeError as e:
                    print(f"Failed to decode JSON: {e}")
        
        generation = Generation(text=generated_text)
        return LLMResult(generations=[[generation]])

    def _llm_type(self):
        return "yandex_gpt"

In [66]:
loader = PDFPlumberLoader("knowledge_base.pdf")
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

CropBox missing from /Page, defaulting to MediaBox


In [67]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

In [68]:
vector_store = FAISS.from_documents(chunks, embeddings)

In [95]:
llm = YandexGPTLLM()

In [96]:
retriever = vector_store.as_retriever(search_kwargs={"k": 3})
rag_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

In [None]:
query = "Какой телефон имеет лучшую камеру?"  # "Which phone has the best camera?"
result = rag_chain({"query": query})
print("\nОтвет:", result["result"])

 Xiaomi 14 Ultra и Samsung Galaxy Z Flip 7 лидируют по качеству камеры.Ответ:  Xiaomi 14 Ultra и Samsung Galaxy Z Flip 7 лидируют по качеству камеры.
