In [2]:
import json
import re
import time
import requests
from typing import Dict, List, Optional, Any
from datetime import datetime
from urllib.parse import quote_plus
import pandas as pd
from dataclasses import dataclass

print("Установка необходимых библиотек...")
import subprocess
import sys

# pip distribution names that must be installed before the scraper runs.
required_packages = [
    'requests',
    'pandas',
    'lxml',
    'beautifulsoup4'
]

# Distributions whose importable module name differs from the pip name.
# Without this mapping the probe for 'beautifulsoup4' always raises
# ImportError (the module is 'bs4'), so pip would be invoked on every run.
_import_names = {
    'beautifulsoup4': 'bs4',
}

# NOTE(review): 'requests' and 'pandas' are already imported at the top of
# the file, so a missing install of those fails before this loop is reached.
for package in required_packages:
    module_name = _import_names.get(package, package.replace('-', '_'))
    try:
        __import__(module_name)
    except ImportError:
        # Install into the interpreter that is running this code.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package, "-q"])

from bs4 import BeautifulSoup

@dataclass
class ProductItem:
    """One product record as extracted from a search result.

    Instances are built by the scraper in this file; every field holds
    whatever the parser managed to extract, so values are best-effort.
    """

    # Product title text; the HTML parser substitutes a placeholder when none is found.
    title: str
    # Parsed price; callers in this file treat values <= 0 as "no valid price".
    price: float
    # Link to the product page.
    url: str
    # Seller / shop name.
    seller: str
    # Rating value, or None when it could not be extracted.
    rating: Optional[float]
    # Number of reviews, or None when it could not be extracted.
    reviews_count: Optional[int]
    # Set to datetime.now() by the code in this file when the record is built.
    timestamp: datetime

class OZONSearchAgent:
    """Best-effort HTML scraper for OZON search results and product pages.

    All requests go through a single ``requests.Session`` that carries
    browser-like headers. Page elements are located with loose class-name
    patterns, so any extracted field may be missing.
    """

    def __init__(self):
        """Create the HTTP session and install the default request headers."""
        self.base_url = "https://www.ozon.ru"
        # 'Accept-Encoding' is deliberately not set here: the session default
        # only advertises encodings the installed HTTP stack can decode.
        # Hard-coding 'br' can yield an undecodable body when no brotli
        # decoder is installed.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0'
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    def normalize_price(self, price_text: str) -> float:
        """Extract the first price from display text such as ``"1 299,50 ₽"``.

        Args:
            price_text: Raw text taken from the page or from embedded JSON.

        Returns:
            The first number found, or ``0.0`` when the text is empty or
            contains no digits.
        """
        if not price_text:
            return 0.0

        # Drop every kind of whitespace, not only the ASCII space: thousands
        # groups are commonly separated by a non-breaking or thin space, and
        # leaving those in would parse "1 299" as 1.
        compact = re.sub(r'\s+', '', price_text)
        compact = compact.replace(',', '.')

        # The currency marker is intentionally kept in place: it separates
        # adjacent prices ("1299₽1599₽"), which would otherwise fuse into a
        # single huge number.
        numbers = re.findall(r'\d+\.?\d*', compact)
        if numbers:
            return float(numbers[0])
        return 0.0

    def search_product(self, product_name: str, max_results: int = 20) -> List[ProductItem]:
        """Search OZON for ``product_name`` and parse the result cards.

        Progress and per-product summaries are printed to stdout.

        Args:
            product_name: Free-text search query.
            max_results: Maximum number of cards to parse.

        Returns:
            Parsed products. When no card yields a product, the result of
            ``alternative_parse`` on the same response is returned instead.
            An empty list is returned on any network or parsing failure.
        """
        print(f"Поиск товара: {product_name}")
        print("Платформа: OZON")

        search_url = f"{self.base_url}/search/?text={quote_plus(product_name)}&from_global=true"

        try:
            response = self.session.get(search_url, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            product_cards = self._find_product_cards(soup)

            products = []
            for i, card in enumerate(product_cards[:max_results]):
                try:
                    product = self._parse_card(card, search_url)
                except Exception as card_error:
                    # Best effort: one malformed card must not abort the
                    # search, but it is reported instead of vanishing.
                    print(f"  [{i+1}] Карточка пропущена: {card_error}")
                    continue

                products.append(product)

                print(f"  [{i+1}] {product.title[:60]}...")
                print(f"      Цена: {product.price} ₽ | Продавец: {product.seller}")
                if product.rating:
                    print(f"      Рейтинг: {product.rating:.1f}")

            if not products:
                print("Товары не найдены в HTML, использую альтернативный парсинг...")
                products = self.alternative_parse(response.text, max_results)

            return products

        except requests.RequestException as e:
            print(f"Ошибка сети: {e}")
            return []
        except Exception as e:
            print(f"Ошибка поиска: {e}")
            return []

    def _find_product_cards(self, soup):
        """Return candidate product-card elements, trying three layouts in order."""
        cards = soup.find_all('div', {'class': re.compile(r'tile-root')})

        if not cards:
            cards = soup.find_all('div', {'class': re.compile(r'product-card')})

        if not cards:
            cards = soup.find_all('div', {'data-widget': 'searchResultsV2'})
            if cards:
                # Re-parsing the widget (rather than searching below it) keeps
                # the widget element itself eligible to match the pattern.
                inner_soup = BeautifulSoup(str(cards[0]), 'html.parser')
                cards = inner_soup.find_all('div', {'class': re.compile(r'widget-search-result-container')})

        return cards

    def _parse_card(self, card, fallback_url: str) -> ProductItem:
        """Build a ``ProductItem`` from one card; ``fallback_url`` is used when the card has no link."""
        title_elem = card.find(['a', 'span', 'div'], {'class': re.compile(r'title|name|heading')})
        if title_elem is None:
            # Fall back to the first link/span holding at least 10 characters.
            title_elem = card.find(['a', 'span'], string=re.compile(r'.{10,}'))
        title = title_elem.get_text(strip=True) if title_elem is not None else "Без названия"

        price_elem = card.find(['span', 'div'], {'class': re.compile(r'price|currency|cost')})
        if price_elem is None:
            price_elem = card.find(string=re.compile(r'\d+\s*₽|\d+\s*руб'))

        if price_elem is None:
            price_text = "0"
        elif isinstance(price_elem, str):
            # A string-only search returns a bare text node, not a tag; use it
            # directly rather than relying on text nodes offering get_text().
            price_text = price_elem.strip()
        else:
            price_text = price_elem.get_text(strip=True)
        price = self.normalize_price(price_text)

        link_elem = card.find('a', href=True)
        if link_elem is not None:
            product_url = link_elem['href']
            if not product_url.startswith('http'):
                product_url = self.base_url + product_url
        else:
            product_url = fallback_url

        seller_elem = card.find(['span', 'div'], {'class': re.compile(r'seller|shop|store')})
        seller = seller_elem.get_text(strip=True) if seller_elem is not None else "OZON"

        rating = None
        rating_elem = card.find(['span', 'div'], {'class': re.compile(r'rating|stars')})
        if rating_elem is not None:
            # Accept both "4.8" and the decimal-comma form "4,8".
            rating_match = re.search(r'\d+(?:[.,]\d+)?', rating_elem.get_text(strip=True))
            if rating_match:
                rating = float(rating_match.group().replace(',', '.'))

        reviews_count = None
        reviews_elem = card.find(['span', 'div'], {'class': re.compile(r'reviews|feedbacks')})
        if reviews_elem is not None:
            # Allow whitespace-separated thousands groups ("1 234").
            reviews_match = re.search(r'\d+(?:\s\d{3}(?!\d))*', reviews_elem.get_text(strip=True))
            if reviews_match:
                reviews_count = int(re.sub(r'\s', '', reviews_match.group()))

        return ProductItem(
            title=title,
            price=price,
            url=product_url,
            seller=seller,
            rating=rating,
            reviews_count=reviews_count,
            timestamp=datetime.now()
        )

    def alternative_parse(self, html_content: str, max_results: int) -> List[ProductItem]:
        """Extract products from a ``window.__APP__ = {...}`` script payload.

        Args:
            html_content: Full HTML text of the search page.
            max_results: Maximum number of products to return.

        Returns:
            Up to ``max_results`` products; empty when no usable payload
            is found.
        """
        products = []
        soup = BeautifulSoup(html_content, 'html.parser')
        decoder = json.JSONDecoder()

        for script in soup.find_all('script', string=re.compile(r'window\.__APP__')):
            script_text = script.string
            if not script_text:
                continue

            assignment = re.search(r'window\.__APP__\s*=\s*(?=\{)', script_text)
            if not assignment:
                continue

            try:
                # Decode exactly one JSON value starting at the opening brace.
                # A non-greedy "{.*?};" regex would cut the object short at
                # the first "};" inside it.
                data, _ = decoder.raw_decode(script_text, assignment.end())
            except ValueError:
                continue

            for item in self._iter_tracking_items(data):
                if len(products) >= max_results:
                    return products
                product = self._item_to_product(item)
                if product is not None:
                    products.append(product)

        return products[:max_results]

    @staticmethod
    def _iter_tracking_items(data):
        """Yield dict entries found under ``data['state']['trackingPayloads'][*]['items']``."""
        if not isinstance(data, dict):
            return
        state = data.get('state')
        if not isinstance(state, dict):
            return
        payloads = state.get('trackingPayloads')
        if not isinstance(payloads, dict):
            return

        for payload in payloads.values():
            if not isinstance(payload, dict):
                continue
            items = payload.get('items')
            if not isinstance(items, list):
                continue
            for item in items:
                if isinstance(item, dict):
                    yield item

    def _item_to_product(self, item: dict):
        """Convert one payload entry to a ``ProductItem``; return None when it has no title."""
        # NOTE(review): the key names below are the ones the original code
        # probed; the real payload schema is not visible here — confirm.
        title = item.get('name') or item.get('title')
        if not title:
            return None

        price_info = item.get('price') or item.get('finalPrice') or ''
        if isinstance(price_info, dict):
            price_info = price_info.get('price') or price_info.get('finalPrice') or ''
        price = self.normalize_price(str(price_info))

        product_url = str(item.get('link') or item.get('url') or '')
        if product_url and not product_url.startswith('http'):
            product_url = self.base_url + product_url

        # Fall through to the default when either key is missing *or* empty.
        seller = item.get('sellerName') or item.get('shopName') or 'OZON'

        rating = item.get('rating')
        if rating:
            try:
                rating = float(rating)
            except (TypeError, ValueError):
                # Keep the product even when its rating is unparseable.
                rating = None

        return ProductItem(
            title=str(title),
            price=price,
            url=product_url,
            seller=seller,
            rating=rating,
            reviews_count=item.get('reviewsCount'),
            timestamp=datetime.now()
        )

    def get_cheapest_product(self, products: List[ProductItem]) -> Optional[ProductItem]:
        """Return the product with the lowest positive price.

        Products whose price is ``0`` or negative are ignored. Returns
        ``None`` when ``products`` is empty or nothing has a positive price.
        """
        priced = [p for p in (products or ()) if p.price > 0]
        return min(priced, key=lambda p: p.price, default=None)

    def get_detailed_info(self, product_url: str) -> Dict[str, Any]:
        """Fetch a product page and extract description, specs and review count.

        Args:
            product_url: Absolute URL of the product page.

        Returns:
            A dict with keys ``url``, ``scraped_at``, ``description``
            (``None`` when absent), ``specifications`` (dict) and
            ``reviews_count`` (text, ``"0"`` when absent). On any failure the
            dict is ``{'url': ..., 'error': ...}`` instead.
        """
        try:
            response = self.session.get(product_url, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            details = {
                'url': product_url,
                'scraped_at': datetime.now().isoformat()
            }

            description_elem = soup.find('div', {'class': re.compile(r'description|specification')})
            if description_elem:
                description = description_elem.get_text(strip=True)
                # Add the ellipsis only when text was actually cut off.
                if len(description) > 500:
                    description = description[:500] + "..."
                details['description'] = description
            else:
                details['description'] = None

            specifications = {}
            spec_sections = soup.find_all('div', {'class': re.compile(r'specification|characteristics')})

            # Only the first five matching sections are scanned.
            for section in spec_sections[:5]:
                for row in section.find_all(['dl', 'div', 'tr']):
                    try:
                        key_elem = row.find(['dt', 'span', 'div'], {'class': re.compile(r'name|key|title')})
                        value_elem = row.find(['dd', 'span', 'div'], {'class': re.compile(r'value|content')})

                        if key_elem and value_elem:
                            key = key_elem.get_text(strip=True).rstrip(':')
                            value = value_elem.get_text(strip=True)
                            if key and value:
                                specifications[key] = value
                    except Exception:
                        # Best effort per row; unlike a bare ``except`` this
                        # does not swallow KeyboardInterrupt/SystemExit.
                        continue

            details['specifications'] = specifications

            reviews_elem = soup.find('span', {'class': re.compile(r'reviews-count|feedbacks-count')})
            if reviews_elem:
                details['reviews_count'] = reviews_elem.get_text(strip=True)
            else:
                details['reviews_count'] = "0"

            return details

        except Exception as e:
            print(f"Ошибка получения деталей: {e}")
            return {'url': product_url, 'error': str(e)}

class ProductSearchAnalyzer:
    """Compute summary statistics over a list of scraped products."""

    def __init__(self):
        """Initialise an empty search history."""
        # No method of this class writes to the list; it is kept as a public
        # attribute for callers.
        self.search_history = []

    def analyze_results(self, products: List[ProductItem]) -> Dict[str, Any]:
        """Summarise prices and sellers of ``products``.

        Only products with a positive price take part in the statistics.

        Args:
            products: Products to analyse.

        Returns:
            ``{}`` when ``products`` is empty or no product has a positive
            price. Otherwise a dict with ``total_found``, ``valid_products``,
            ``price_range`` (``min``/``max``/``average``/``median``),
            ``cheapest_product``, ``top_sellers`` and an ISO ``timestamp``.
        """
        import statistics  # local import: the file's import header is left untouched

        if not products:
            return {}

        valid_products = [p for p in products if p.price > 0]
        if not valid_products:
            return {}

        prices = [p.price for p in valid_products]
        cheapest = min(valid_products, key=lambda p: p.price)

        return {
            'total_found': len(products),
            'valid_products': len(valid_products),
            'price_range': {
                'min': min(prices),
                'max': max(prices),
                'average': sum(prices) / len(prices),
                # True median: mean of the two middle values for even-sized
                # input, rather than just the upper-middle element.
                'median': statistics.median(prices)
            },
            'cheapest_product': {
                'title': cheapest.title,
                'price': cheapest.price,
                'seller': cheapest.seller,
                'url': cheapest.url
            },
            'top_sellers': self._extract_top_sellers(valid_products),
            'timestamp': datetime.now().isoformat()
        }

    def _extract_top_sellers(self, products: List[ProductItem], top_n: int = 3) -> List[Dict]:
        """Return the ``top_n`` sellers by product count with their price stats.

        Sellers with equal counts keep the order in which they first appear
        in ``products``.
        """
        # Group prices per seller in one pass; dict insertion order provides
        # the tie-breaking order for the stable sort below.
        prices_by_seller = {}
        for product in products:
            prices_by_seller.setdefault(product.seller, []).append(product.price)

        ranked = sorted(
            prices_by_seller.items(),
            key=lambda entry: len(entry[1]),
            reverse=True
        )[:top_n]

        return [
            {
                'seller': seller,
                'product_count': len(seller_prices),
                'min_price': min(seller_prices),
                'avg_price': sum(seller_prices) / len(seller_prices)
            }
            for seller, seller_prices in ranked
        ]

def main_search_task(search_query: str = "кастет"):
    """Run one OZON search, print a report and save the results to disk.

    The function searches through ``OZONSearchAgent``, prints the cheapest
    product (with details fetched from its page) and the statistics from
    ``ProductSearchAnalyzer``, then writes a JSON file and a CSV file into
    the current working directory.

    Args:
        search_query: Free-text query to search for.

    Returns:
        None. Any exception is caught, reported to stdout together with its
        traceback, and not re-raised.
    """
    print("="*70)
    print("АГЕНТ ПОИСКА ТОВАРОВ НА OZON")
    print("="*70)

    print(f"\nЗапуск поиска по запросу: '{search_query}'")

    agent = OZONSearchAgent()
    analyzer = ProductSearchAnalyzer()

    try:
        print("\nВыполнение поиска на OZON...")
        products = agent.search_product(search_query, max_results=15)

        if not products:
            print("Товары не найдены")
            return

        print(f"\nНайдено товаров: {len(products)}")

        cheapest = agent.get_cheapest_product(products)

        if cheapest:
            print("\n" + "="*70)
            print("САМЫЙ ДЕШЕВЫЙ ТОВАР:")
            print("="*70)
            print(f"Название: {cheapest.title}")
            print(f"Цена: {cheapest.price} ₽")
            print(f"Продавец: {cheapest.seller}")
            print(f"Рейтинг: {cheapest.rating or 'Не указан'}")
            print(f"Отзывы: {cheapest.reviews_count or '0'}")
            print(f"Ссылка: {cheapest.url}")

            print("\nПолучение детальной информации о товаре...")
            details = agent.get_detailed_info(cheapest.url)

            if details.get('description'):
                print(f"\nОписание: {details['description'][:200]}...")

            if details.get('specifications'):
                print("\nХарактеристики:")
                for key, value in list(details['specifications'].items())[:5]:
                    print(f"  {key}: {value}")

        else:
            print("Не удалось определить самый дешевый товар")

        print("\n" + "="*70)
        print("АНАЛИЗ РЕЗУЛЬТАТОВ ПОИСКА")
        print("="*70)

        analysis = analyzer.analyze_results(products)

        if analysis:
            print(f"Всего найдено товаров: {analysis['total_found']}")
            print(f"Товаров с валидными ценами: {analysis['valid_products']}")

            price_range = analysis['price_range']
            print("\nДиапазон цен:")
            print(f"  Минимальная: {price_range['min']:.2f} ₽")
            print(f"  Максимальная: {price_range['max']:.2f} ₽")
            print(f"  Средняя: {price_range['average']:.2f} ₽")
            print(f"  Медианная: {price_range['median']:.2f} ₽")

            if 'cheapest_product' in analysis:
                cheapest_info = analysis['cheapest_product']
                print("\nСамый дешевый:")
                print(f"  {cheapest_info.get('title', '')[:60]}...")
                print(f"  Цена: {cheapest_info.get('price', 0):.2f} ₽")
                print(f"  Продавец: {cheapest_info.get('seller', '')}")

            top_sellers = analysis.get('top_sellers', [])
            if top_sellers:
                print("\nТоп продавцов:")
                for i, seller in enumerate(top_sellers, 1):
                    print(f"  {i}. {seller['seller']} - {seller['product_count']} товаров")
                    print(f"     Минимальная цена: {seller['min_price']:.2f} ₽")

        print("\n" + "="*70)
        print("СОХРАНЕНИЕ РЕЗУЛЬТАТОВ")
        print("="*70)

        cheapest_summary = None
        if cheapest:
            cheapest_summary = {
                'title': cheapest.title,
                'price': cheapest.price,
                'url': cheapest.url,
                'seller': cheapest.seller
            }

        results_data = {
            'search_query': search_query,
            'timestamp': datetime.now().isoformat(),
            'platform': 'OZON',
            'products_found': len(products),
            'cheapest_product': cheapest_summary,
            'analysis': analysis,
            # NOTE(review): only the first 10 products are stored although
            # 'products_found' counts all of them — confirm this is intended.
            'all_products': [
                {
                    'title': p.title,
                    'price': p.price,
                    'seller': p.seller,
                    'url': p.url,
                    'rating': p.rating,
                    'reviews_count': p.reviews_count
                }
                for p in products[:10]
            ]
        }

        # The query is free text: replace characters that are path separators
        # or reserved in file names so that open() cannot fail or escape the
        # working directory.
        safe_query = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', '_', search_query)
        # One timestamp for both files, so the JSON and the CSV of a single
        # run always share the same name stem.
        run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        filename = f"ozon_search_{safe_query}_{run_stamp}.json"
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(results_data, f, ensure_ascii=False, indent=2, default=str)

        print(f"Результаты сохранены в файл: {filename}")

        print("\nОтображение первых 5 найденных товаров:")
        for i, product in enumerate(products[:5], 1):
            print(f"\n{i}. {product.title[:70]}...")
            print(f"   Цена: {product.price} ₽ | Продавец: {product.seller}")
            if product.rating:
                print(f"   Рейтинг: {product.rating:.1f}")
            if product.reviews_count:
                print(f"   Отзывы: {product.reviews_count}")

        print("\nСоздание таблицы с результатами...")
        df_data = [
            {
                'Название': p.title[:80],
                'Цена (₽)': p.price,
                'Продавец': p.seller,
                'Рейтинг': p.rating,
                'Отзывы': p.reviews_count,
                'Ссылка': p.url[:100] + '...' if len(p.url) > 100 else p.url
            }
            for p in products
        ]

        if df_data:
            df = pd.DataFrame(df_data)
            csv_filename = f"ozon_search_{safe_query}_{run_stamp}.csv"
            df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
            print(f"Таблица сохранена в CSV файл: {csv_filename}")

            # A price of 0 means "price not parsed"; such rows must not be
            # ranked as the cheapest (same rule as get_cheapest_product).
            priced_df = df[df['Цена (₽)'] > 0]
            if not priced_df.empty:
                print("\nТоп-5 самых дешевых товаров:")
                df_sorted = priced_df.sort_values('Цена (₽)').head(5)
                print(df_sorted[['Название', 'Цена (₽)', 'Продавец']].to_string(index=False))

    except Exception as e:
        print(f"Ошибка в процессе поиска: {e}")
        import traceback
        traceback.print_exc()

def run_search():
    """Main entry point for running in Google Colab.

    Runs ``main_search_task`` once per built-in query, sleeping three
    seconds between consecutive searches, then prints links for searching
    manually on other marketplaces.
    """
    print("Запуск поискового агента OZON...")

    search_queries = ["кастет", "кастет металлический", "средство самозащиты"]
    final_query = search_queries[-1]
    hash_rule = '#'*70
    equals_rule = "="*70

    for query in search_queries:
        print(f"\n{hash_rule}")
        print(f"ПОИСК: {query.upper()}")
        print(hash_rule)

        main_search_task(query)

        # No pause is needed once the last query has been processed.
        if query == final_query:
            continue

        print("\n" + equals_rule)
        print("Пауза перед следующим поиском...")
        time.sleep(3)

    print("\n" + equals_rule)
    print("АЛЬТЕРНАТИВНЫЙ ПОИСК НА ДРУГИХ ПЛАТФОРМАХ")
    print(equals_rule)

    print("\nПоиск на Яндекс.Маркет (информационный)...")
    for manual_link in (
        "https://market.yandex.ru/search?text=кастет",
        "https://www.avito.ru/all?q=кастет",
    ):
        print("Ссылка для ручного поиска: " + manual_link)

if __name__ == "__main__":
    # Run every search, then print the closing banner.
    run_search()
    print("\n" + "="*70, "ПОИСК ЗАВЕРШЕН", "="*70, sep="\n")

Установка необходимых библиотек...
Запуск поискового агента OZON...

######################################################################
ПОИСК: КАСТЕТ
######################################################################
АГЕНТ ПОИСКА ТОВАРОВ НА OZON

Запуск поиска по запросу: 'кастет'

Выполнение поиска на OZON...
Поиск товара: кастет
Платформа: OZON
Ошибка сети: 403 Client Error: Forbidden for url: https://www.ozon.ru/search/?text=%D0%BA%D0%B0%D1%81%D1%82%D0%B5%D1%82&from_global=true&__rr=1
Товары не найдены

Пауза перед следующим поиском...

######################################################################
ПОИСК: КАСТЕТ МЕТАЛЛИЧЕСКИЙ
######################################################################
АГЕНТ ПОИСКА ТОВАРОВ НА OZON

Запуск поиска по запросу: 'кастет металлический'

Выполнение поиска на OZON...
Поиск товара: кастет металлический
Платформа: OZON
Ошибка сети: 403 Client Error: Forbidden for url: https://www.ozon.ru/search/?text=%D0%BA%D0%B0%D1%81%D1%82%D0%B5%D1