# Data collection
Сбор данных о ёлках из интернет-магазина kaspi.kz для построения модели машинного обучения, предсказывающей цену товара.

# 1. Инициализация

### Импорт необходимых библиотек

In [3]:
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd

### Получение страницы

In [5]:
# Search-results URL: the percent-encoded query text decodes to "ёлка" and the
# `q` filter decodes to ":availableInZones:Magnum_ZONE1".
url = 'https://kaspi.kz/shop/search/?text=%D1%91%D0%BB%D0%BA%D0%B0&q=%3AavailableInZones%3AMagnum_ZONE1&sort=relevance&filteredByCategory=false&sc='
# Browser-like User-Agent sent with the request.
# NOTE(review): presumably needed because the site treats the default
# python-requests agent differently — confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# Without a timeout requests waits indefinitely for an unresponsive server,
# which would hang the notebook cell.
page = requests.get(url, headers=headers, timeout=30)

# Quick reachability check: show the HTTP status of the response.
print(page.status_code)

200


# 2. Веб-скрейпинг

### Конвертация значения цены товара на месте

In [41]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time

# Price preview: open every product card on the first results page, read the
# price from the product page, print it as an integer, and return to the list.
options = Options()
options.add_argument("--start-maximized")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# try/finally guarantees the browser is closed even if a wait below times out.
try:
    driver.get("https://kaspi.kz/shop/search/?text=%D1%91%D0%BB%D0%BA%D0%B0&q=%3AavailableInZones%3AMagnum_ZONE1&sort=relevance&filteredByCategory=false&sc=")

    # Pick the city by clicking the link whose text is "Астана".
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.LINK_TEXT, "Астана"))
    ).click()

    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".item-card"))
    )

    time.sleep(1)

    # Only the count is kept: element handles obtained before a navigation may
    # become stale, so cards are re-located by index on every iteration.
    product_count = len(driver.find_elements(By.CSS_SELECTOR, ".item-card"))

    for index in range(product_count):
        listing_url = driver.current_url
        try:
            try:
                cards = driver.find_elements(By.CSS_SELECTOR, ".item-card")
                cards[index].find_element(By.CSS_SELECTOR, "a.item-card__image-wrapper").click()

                price_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".item__price-once"))
                )
                price = price_element.text

                # Drop non-breaking spaces, regular spaces and the currency
                # sign so that only the digits remain.
                price_cleaned = price.replace('\xa0', '').replace(' ', '').replace('₸', '').strip()

                cleaned_price_int = int(price_cleaned)
                print(cleaned_price_int)
            finally:
                # Return to the listing whether or not the price was read, but
                # only if the click actually navigated away from it.
                if driver.current_url != listing_url:
                    driver.back()
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".item-card"))
                    )
                    time.sleep(1)
        except Exception as e:
            # Best effort: report the failing product and move on to the next.
            print(f"Error with product: {e}")
finally:
    driver.quit()


64999
24421
1999
35790
41888
32990
30899
8299
39899
44800
26619
11000


### Определение функции для перехода на следующую страницу

In [74]:
def go_to_next_page(driver):
    """Click the pagination item that follows the currently active one.

    Waits up to 10 seconds for the active pagination element
    (``li.pagination__el._active``), asks the browser for its next sibling
    and clicks that sibling through JavaScript, then pauses for 2 seconds.

    Args:
        driver: Selenium WebDriver positioned on a results page.

    Returns:
        True if a next sibling existed and a click was issued; False if the
        active element has no next sibling or if any exception occurred (the
        exception is printed with the ``Pagination stop:`` prefix).
    """
    try:
        locator = (By.CSS_SELECTOR, "li.pagination__el._active")
        current_item = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(locator)
        )

        following_item = driver.execute_script("""
            return arguments[0].nextElementSibling;
        """, current_item)

        if following_item is not None:
            driver.execute_script("arguments[0].click();", following_item)
            # Fixed pause after the click before the caller continues.
            time.sleep(2)
            return True

        return False

    except Exception as error:
        print("Pagination stop:", error)
        return False

### Сбор данных с учетом пагинации

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

# Full crawl: walk the paginated search results, open every product card,
# collect its price and specification table, and export everything to Excel.

# Hard-coded upper bound on the number of listing pages to visit; the loop
# stops earlier when go_to_next_page() reports that there is no further page.
MAX_PAGES = 84

options = Options()
options.add_argument("--start-maximized")

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

# Columns of the output table, in output order. Specification names missing
# from a product page are stored as None.
possible_terms = [
    "Код товара", "Тип", "Высота", "Материал веток", "Вес", "Цвет",
    "Материал", "Оптоволокно", "Тип конструкции", "Ветви",
    "Особенности", "Количество ярусов", "Диаметр нижних веток",
    "Крепление веток", "Сборка", "Подставка", "Дополнительная информация", "Цена"
]

all_products = []

try:
    driver.get(
        "https://kaspi.kz/shop/search/?text=%D1%91%D0%BB%D0%BA%D0%B0&q=%3AavailableInZones%3AMagnum_ZONE1"
    )

    # Pick the city by clicking the link whose text is "Алматы".
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.LINK_TEXT, "Алматы"))
    ).click()

    for page in range(MAX_PAGES):

        print(f"Scraping page {page}")

        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".item-card"))
        )

        products_count = len(driver.find_elements(By.CSS_SELECTOR, ".item-card"))

        for i in range(products_count):
            # Remembered so we can tell afterwards whether the click really
            # left the listing page.
            listing_url = driver.current_url
            try:
                try:
                    # Re-locate the cards each time: handles obtained before a
                    # navigation may be stale.
                    products = driver.find_elements(By.CSS_SELECTOR, ".item-card")
                    driver.execute_script(
                        "arguments[0].click();",
                        products[i].find_element(By.CSS_SELECTOR, "a.item-card__image-wrapper")
                    )

                    price_element = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, ".item__price-once"))
                    )
                    price = price_element.text

                    # Drop non-breaking spaces, regular spaces and the
                    # currency sign so that only the digits remain.
                    price_cleaned = price.replace('\xa0', '').replace(' ', '').replace('₸', '').strip()
                    cleaned_price_int = int(price_cleaned)

                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located(
                            (By.CSS_SELECTOR, "dl.specifications-list__spec")
                        )
                    )

                    product_data = {}

                    specs = driver.find_elements(
                        By.CSS_SELECTOR, "dl.specifications-list__spec"
                    )

                    for spec in specs:
                        # textContent is read instead of .text so the value is
                        # returned even if the node is not currently rendered.
                        key = spec.find_element(
                            By.CSS_SELECTOR,
                            ".specifications-list__spec-term-text"
                        ).get_attribute("textContent").strip().rstrip(":")

                        value = spec.find_element(
                            By.CSS_SELECTOR,
                            ".specifications-list__spec-definition"
                        ).get_attribute("textContent").strip()

                        product_data[key] = value

                    for term in possible_terms:
                        product_data.setdefault(term, None)
                    product_data['Цена'] = cleaned_price_int
                    all_products.append(product_data)
                finally:
                    # Go back only if the browser actually left the listing.
                    # An unconditional back() after a failure that happened
                    # before navigation would walk off the results page.
                    if driver.current_url != listing_url:
                        driver.back()
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".item-card"))
                        )
            except Exception as e:
                # Best effort: report the failing product and move on.
                print(f"Skipped product {i}: {e}")

        if not go_to_next_page(driver):
            print("Reached last page")
            break
finally:
    # Runs on success, on errors and on manual interruption, so the browser is
    # always closed and whatever was collected so far is written to disk.
    driver.quit()

    # `columns=` selects and orders the columns and also works when no product
    # was collected (indexing an empty frame by label would raise KeyError).
    df = pd.DataFrame(all_products, columns=possible_terms)
    df.to_excel("kaspi_products_85_done.xlsx", index=False)

print("Done ✔")

Scraping page 0
Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Scraping page 11
Scraping page 12
Scraping page 13
Scraping page 14
Scraping page 15
Scraping page 16
Scraping page 17
Scraping page 18
Scraping page 19
Scraping page 20
Scraping page 21
Scraping page 22
Scraping page 23
Scraping page 24
Scraping page 25
Scraping page 26
Scraping page 27
Scraping page 28
Scraping page 29
Scraping page 30
Scraping page 31
Scraping page 32
Scraping page 33
Scraping page 34
Scraping page 35
Scraping page 36
Scraping page 37
Scraping page 38
Scraping page 39
Scraping page 40
Scraping page 41
Scraping page 42
Scraping page 43
Scraping page 44
Scraping page 45
Scraping page 46
Scraping page 47
Scraping page 48
Scraping page 49
Scraping page 50
Scraping page 51
Scraping page 52
Scraping page 53
Scraping page 54
Scraping page 55
Scraping page 56
Scraping page 57
Scraping page 58
Scrapin

# 3. Проверка

In [86]:
# Preview the first five rows of the collected table as a sanity check.
df.head(n=5)

Unnamed: 0,Код товара,Тип,Высота,Материал веток,Вес,Цвет,Материал,Оптоволокно,Тип конструкции,Ветви,Особенности,Количество ярусов,Диаметр нижних веток,Крепление веток,Сборка,Подставка,Дополнительная информация,Цена
0,151572935,комнатная напольная,210.0 см,литая,9.0 кг,зеленый,Хвоя литая,Нет,ствольная,,,3,170.0 см,вставные,разборная,Металл,,64999
1,100769320,комнатная напольная,210.0 см,ПВХ,3.0 кг,зеленый,ПВХ,Нет,ствольная,с белыми кончиками,,3,120.0 см,отгибающиеся,разборная,есть,,19000
2,100769294,комнатная напольная,180.0 см,ПВХ,2.0 кг,зеленый,ПВХ,Нет,ствольная,с белыми кончиками,,3,100.0 см,отгибающиеся,разборная,есть,фирменная коробка Fnix,16700
3,114570768,комнатная напольная,180.0 см,литая,9.0 кг,зеленый,Литая,Нет,ствольная,ПВХ,противопожарная,3,124.0 см,на шарнирах,складная,Металл,Это роскошная ель с пышной кроной. Хвоя темно ...,24421
4,114434859,комнатная напольная,210.0 см,комбинированная,8.0 кг,зеленый,Литая пвх,Нет,ствольная,124.0,противопожарная,3,124.0 см,отгибающиеся,разборная,металлическая,,44990
