In [17]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
base_url = "https://krisha.kz"
search_url = "https://krisha.kz/prodazha/kvartiry/almaty/"

# Number of search-result pages to crawl. Kept at 1 for a quick run; raise it
# to collect more listings (the loop still stops at the first empty page).
MAX_PAGES = 1

# Seconds to wait for the server. Without a timeout a stalled connection
# would block the cell forever.
REQUEST_TIMEOUT = 10

# Initialize a list to store apartment links
apartment_links = []

for page_num in range(1, MAX_PAGES + 1):
    # Append the current page number to the search URL
    current_page_url = f"{search_url}?page={page_num}"

    # Send request and parse the page; fail loudly on HTTP errors instead of
    # parsing an error page as if it were a results page.
    response = requests.get(current_page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Check if the page has any listings
    listings = soup.find_all("a", class_="a-card__title")
    if not listings:
        break  # No more listings, exit the loop

    # Extract and store the links, skipping anchors that carry no href
    # (concatenating base_url with None would raise TypeError).
    for link in listings:
        href = link.get('href')
        if href:
            apartment_links.append(base_url + href)

# Print the total number of links found
print(f"Found {len(apartment_links)} apartment links.")

Found 23 apartment links.


In [23]:
def _require(node, what, url):
    """Return ``node``, or raise ValueError naming the element that is missing."""
    if node is None:
        raise ValueError(f"{what} not found on {url}")
    return node


def _short_info_text(soup, data_name, url):
    """Return the text of the short-info value inside the given ``data-name`` row."""
    row = _require(soup.find("div", {"data-name": data_name}),
                   f"'{data_name}' block", url)
    value = _require(row.find("div", class_="offer__advert-short-info"),
                     f"'{data_name}' value", url)
    return value.text


def _parse_area(title_text):
    """Extract the area (the number in front of 'м²') from the listing title."""
    # Accept both '63.4 м²' and '63,4 м²'; the purely positional parse below
    # would silently turn the latter into 63.0.
    match = re.search(r'(\d+(?:[.,]\d+)?)\s*м[²2]', title_text)
    if match:
        return float(match.group(1).replace(',', '.'))
    # Fallback: second comma-separated field, first token.
    try:
        return float(title_text.split(',')[1].split()[0])
    except (IndexError, ValueError) as exc:
        raise ValueError(f"cannot parse area from title {title_text!r}") from exc


def _parse_floor_ratio(floor_text):
    """Convert '<floor> из <total>' into floor / total, rounded to 2 decimals."""
    parts = floor_text.split('из')
    if len(parts) < 2:
        raise ValueError(f"cannot parse floor info {floor_text!r}")
    current = float(parts[0].strip())
    total = float(parts[1].strip())
    if total == 0:
        raise ValueError(f"total floor count is zero in {floor_text!r}")
    return round(current / total, 2)


def get_apartment_data(url, timeout=10):
    """Download one listing page and extract its features.

    Parameters
    ----------
    url : str
        Address of the listing page.
    timeout : float, optional
        Seconds to wait for the server before the request is aborted.

    Returns
    -------
    dict
        ``room_count`` (int, or None when the title has no 'N-комнатная'),
        ``quadrature`` (float, area taken from the title),
        ``floor`` (float, flat's floor divided by the total number of floors,
        rounded to 2 decimals), ``region`` (str), ``year`` (int) and
        ``price`` (int, all digits of the price block).

    Raises
    ------
    ValueError
        If an expected element is missing from the page or its text cannot
        be parsed.
    requests.HTTPError
        If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Title drives both the room count and the area.
    title_block = _require(soup.find("div", class_="offer__advert-title"),
                           "title block", url)
    title_text = _require(title_block.find("h1"), "title heading", url).text

    room_count_match = re.search(r'(\d+)-комнатная', title_text)
    room_count = int(room_count_match.group(1)) if room_count_match else None

    quadrature = _parse_area(title_text)

    # Stored as a relative position in the building, not the raw floor number.
    floor = _parse_floor_ratio(_short_info_text(soup, "flat.floor", url))

    location = _require(
        soup.find("div", class_="offer__location offer__advert-short-info"),
        "location block", url)
    region = _require(location.find("span"), "location text", url).text

    # Take the first 4-digit run rather than gluing every digit together,
    # so stray digits in the cell cannot corrupt the year.
    year_text = _short_info_text(soup, "house.year", url)
    year_match = re.search(r'\d{4}', year_text)
    if year_match is None:
        raise ValueError(f"cannot parse construction year from {year_text!r}")
    year = int(year_match.group())

    # \D keeps only real decimal digits; str.isdigit would also keep
    # characters such as '²' that int() rejects.
    price_text = _require(soup.find("div", class_="offer__price"),
                          "price block", url).text
    price_digits = re.sub(r'\D', '', price_text)
    if not price_digits:
        raise ValueError(f"cannot parse price from {price_text!r}")
    price = int(price_digits)

    return {"room_count": room_count,
            "quadrature": quadrature,
            "floor": floor,
            "region": region,
            "year": year,
            "price": price}


In [24]:
# Scrape every collected listing. A page that cannot be parsed is reported
# and skipped so that one bad listing does not abort the whole run.
data = []
for link in apartment_links:
    try:
        apartment_data = get_apartment_data(link)
    except Exception as e:
        print(f"Failed to extract data from {link}. Error: {e}")
        continue
    data.append(apartment_data)


In [25]:
# Show the scraped records: one dict per listing that was parsed successfully.
data

[{'room_count': 4,
  'quadrature': 190.0,
  'floor': 0.67,
  'region': 'Алматы, Медеуский р-н',
  'year': 1997,
  'price': 175000000},
 {'room_count': 3,
  'quadrature': 70.0,
  'floor': 0.5,
  'region': 'Алматы, Бостандыкский р-н',
  'year': 1957,
  'price': 45000000},
 {'room_count': 3,
  'quadrature': 66.0,
  'floor': 0.2,
  'region': 'Алматы, Алмалинский р-н',
  'year': 1976,
  'price': 51500000},
 {'room_count': 3,
  'quadrature': 78.0,
  'floor': 1.0,
  'region': 'Алматы, Наурызбайский р-н',
  'year': 2019,
  'price': 40500000},
 {'room_count': 2,
  'quadrature': 63.4,
  'floor': 0.89,
  'region': 'Алматы, Турксибский р-н',
  'year': 2017,
  'price': 37900000},
 {'room_count': 3,
  'quadrature': 80.0,
  'floor': 1.0,
  'region': 'Алматы, Алмалинский р-н',
  'year': 1978,
  'price': 56000000},
 {'room_count': 2,
  'quadrature': 67.0,
  'floor': 0.22,
  'region': 'Алматы, Медеуский р-н',
  'year': 2021,
  'price': 54000000},
 {'room_count': 1,
  'quadrature': 33.0,
  'floor': 0.8,
