# Parsing of krisha.com

This project involves web scraping apartment rental data from Krisha.kz, focusing on listings in five districts of, including 
- __Bostandyk__
- __Nauryzbai__
- __Auezov__ 
- __Almaly__
- __Qaskelen__

   The goal is to extract and analyze detailed information about rental apartments to identify trends, compare prices, and understand market dynamics. 


In [3]:
import pandas as pd 
from bs4 import BeautifulSoup
import requests
import lxml
import time
import numpy as np
from random import choice
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# BOSTANDYQ

In [75]:
bost_url = "https://krisha.kz/arenda/kvartiry/almaty-bostandykskij/?das[live.rooms][0]=1&das[live.rooms][1]=2&rent-period-switch=%2Farenda%2Fkvartiry"

headers = {
    "accept":  "*/*",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
}


rooms = []
areas = []
floors = []
prices = []
securities = []
locations = []
states = []
furnitures = []
prev_dorms = []
zhks = []
studio_kitchens = []
deep_info_furnitures = []
flat_facilitiess = []
separated_toilets = []
toilet_counts = []
bathrooms = []
window_sides = []
balcony_counts = []
loggia_counts = []
who_matches = []


# Set up session with retries
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retries))




def web_scraping(url):
    try:
        req = session.get(url, headers=headers, timeout=20)  # Set a timeout of 10 seconds
        req.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return

    src = req.text
    soup = BeautifulSoup(src, "lxml")
    listings = soup.find(class_="a-list a-search-list a-list-with-favs").find_all(class_="a-card__inc")

    for jobs in listings:
        try:
            href = jobs.find(class_="a-card__main-info").find(class_="a-card__header-left").find(class_="a-card__title").get("href")
            house_url = f'https://krisha.kz{href}'
            house_response = session.get(house_url, headers=headers, timeout=10)  # Add timeout here too
            house_soup = BeautifulSoup(house_response.text, 'lxml')
            houses = house_soup.find(class_="offer__container").find(class_="offer__advert-info")
            deep_houses = house_soup.find(class_ = "offer__description")
        except (AttributeError, requests.exceptions.RequestException):
            continue






        
              
        title = jobs.find(class_ = "a-card__header-left").find(class_ = "a-card__title").text.strip()
        


        
        parts = [part.strip() for part in title.split(",")]
        
         


        
        room = parts[0] if len(parts) > 0 else ""
        

        
        try:
            area = houses.find(attrs={"data-name": "live.square"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            area = np.nan    


        try:
            floor = houses.find(attrs={"data-name": "flat.floor"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            floor = np.nan


        try:
            state = houses.find(attrs={"data-name": "flat.rent_renovation"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            state = np.nan



        try:
            furniture = houses.find(attrs={"data-name": "live.furniture"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            furniture = np.nan


        try:
            security = houses.find(attrs={"data-name": "flat.security"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            security = np.nan


        try:
            prev_dorm = houses.find(attrs={"data-name": "flat.priv_dorm"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            prev_dorm = np.nan

        
        try:
            zhk = houses.find(attrs={"data-name": "map.complex"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            zhk = np.nan

        
        try:
            studio_kitchen = houses.find(attrs={"data-name": "kitchen_studio"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            studio_kitchen = np.nan
        

        try:
            price = houses.find(class_ = "offer__price").text.strip()
        except AttributeError:
            price = np.nan


        try:
            location = houses.find(class_ = "offer__info-item").find(class_ = "offer__location offer__advert-short-info").text.strip()
        except AttributeError:
            location = np.nan



        deep_info_furniture = np.nan
        flat_facilities = np.nan
        separated_toilet = np.nan
        toilet_count = np.nan
        bathroom = np.nan
        window_side = np.nan
        balcony_count = np.nan
        loggia_count = np.nan
        who_match = np.nan

        
        for dt_tag in deep_houses.find_all("dt"):



            
            

            
            if dt_tag.get("data-name") == "flat.furniture":
                try:
                    deep_info_furniture = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    deep_info_furniture = np.nan    

              

            

            if dt_tag.get("data-name") == "flat.facilities":

                try:
                    flat_facilities = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    flat_facilities = np.nan

            
                

            
            
            
            if dt_tag.get("data-name") == "separated_toilet":

                try:
                    separated_toilet =  dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    separated_toilet = np.nan
                

            

        
            
            
            
            if dt_tag.get("data-name") == "toilet_count":

                try:
                    toilet_count = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    toilet_count = np.nan


        

            
            
            
            
            if dt_tag.get("data-name") == "bathroom":

                try:
                    bathroom = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    bathroom = np.nan

            

            
           
            
            if dt_tag.get("data-name") == "window_side":

                try:
                    window_side = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    window_side = np.nan


            




            
            if dt_tag.get("data-name") == "balcony_count":

                try:
                    balcony_count = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    balcony_count = np.nan


           



            
            if dt_tag.get("data-name") == "loggia_count":

                try:
                    loggia_count = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    loggia_count = np.nan


           



            
            if dt_tag.get("data-name") == "who_match":

                try:
                    who_match = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    who_match = np.nan
           
        


        


        


        

        rooms.append(room)
        areas.append(area)
        floors.append(floor)
        prices.append(price)
        securities.append(security)
        locations.append(location)
        studio_kitchens.append(studio_kitchen)
        zhks.append(zhk)
        prev_dorms.append(prev_dorm)
        furnitures.append(furniture)
        states.append(state)
        deep_info_furnitures.append(deep_info_furniture)
        flat_facilitiess.append(flat_facilities)
        separated_toilets.append(separated_toilet)
        toilet_counts.append(toilet_count)
        bathrooms.append(bathroom)
        window_sides.append(window_side)
        balcony_counts.append(balcony_count)
        loggia_counts.append(loggia_count)
        who_matches.append(who_match)


        time.sleep(3)

for page_num in range(1, 35):
    #We can just add page number at the end
    page_url = f"{bost_url}&page={page_num}"
    web_scraping(page_url)
    # It is essential to wait some time before actions, because a website may restrict its access
    time.sleep(10)



print(rooms)

['1-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '1-комнатная кв

In [77]:
data = pd.DataFrame({
    'Room': rooms,
    'Area': areas,
    'Floor': floors,
    'Price': prices,
    'Security': securities,
    'Location': locations,
    'Zhk': zhks,
    'Studio_kitchen': studio_kitchens,
    'Previous_dorm': prev_dorms,
    'Furniture': furnitures,
    'State': states,
    'Deep_info_furniture': deep_info_furnitures,
    'Flat_facilities': flat_facilitiess,
    'Separated_toilet': separated_toilets,
    'Toilet_count': toilet_counts,
    'Bathroom': bathrooms,
    'Window_side': window_sides,
    'Balcony_count': balcony_counts,
    'Loggia_count': loggia_counts,
    'Who_can_stay': who_matches
})

data.to_csv('podrob_bost_krisha.csv', index=False)




# NAURYZBAI

In [78]:
naur_url = "https://krisha.kz/arenda/kvartiry/almaty-nauryzbajskiy/?rent-period-switch=%2Farenda%2Fkvartiry"

headers = {
    "accept":  "*/*",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
}


rooms = []
areas = []
floors = []
prices = []
securities = []
locations = []
states = []
furnitures = []
prev_dorms = []
zhks = []
studio_kitchens = []
deep_info_furnitures = []
flat_facilitiess = []
separated_toilets = []
toilet_counts = []
bathrooms = []
window_sides = []
balcony_counts = []
loggia_counts = []
who_matches = []


# Set up session with retries
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retries))




def web_scraping(url):
    try:
        req = session.get(url, headers=headers, timeout=20)  # Set a timeout of 10 seconds
        req.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return

    src = req.text
    soup = BeautifulSoup(src, "lxml")
    listings = soup.find(class_="a-list a-search-list a-list-with-favs").find_all(class_="a-card__inc")

    for jobs in listings:
        try:
            href = jobs.find(class_="a-card__main-info").find(class_="a-card__header-left").find(class_="a-card__title").get("href")
            house_url = f'https://krisha.kz{href}'
            house_response = session.get(house_url, headers=headers, timeout=10)  # Add timeout here too
            house_soup = BeautifulSoup(house_response.text, 'lxml')
            houses = house_soup.find(class_="offer__container").find(class_="offer__advert-info")
            deep_houses = house_soup.find(class_ = "offer__description")
        except (AttributeError, requests.exceptions.RequestException):
            continue






        
              
        title = jobs.find(class_ = "a-card__header-left").find(class_ = "a-card__title").text.strip()
        


        
        parts = [part.strip() for part in title.split(",")]
        
         


        
        room = parts[0] if len(parts) > 0 else ""
        

        
        try:
            area = houses.find(attrs={"data-name": "live.square"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            area = np.nan    


        try:
            floor = houses.find(attrs={"data-name": "flat.floor"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            floor = np.nan


        try:
            state = houses.find(attrs={"data-name": "flat.rent_renovation"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            state = np.nan



        try:
            furniture = houses.find(attrs={"data-name": "live.furniture"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            furniture = np.nan


        try:
            security = houses.find(attrs={"data-name": "flat.security"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            security = np.nan


        try:
            prev_dorm = houses.find(attrs={"data-name": "flat.priv_dorm"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            prev_dorm = np.nan

        
        try:
            zhk = houses.find(attrs={"data-name": "map.complex"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            zhk = np.nan

        
        try:
            studio_kitchen = houses.find(attrs={"data-name": "kitchen_studio"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            studio_kitchen = np.nan
        

        try:
            price = houses.find(class_ = "offer__price").text.strip()
        except AttributeError:
            price = np.nan


        try:
            location = houses.find(class_ = "offer__info-item").find(class_ = "offer__location offer__advert-short-info").text.strip()
        except AttributeError:
            location = np.nan



        deep_info_furniture = np.nan
        flat_facilities = np.nan
        separated_toilet = np.nan
        toilet_count = np.nan
        bathroom = np.nan
        window_side = np.nan
        balcony_count = np.nan
        loggia_count = np.nan
        who_match = np.nan

        
        for dt_tag in deep_houses.find_all("dt"):



            
            

            
            if dt_tag.get("data-name") == "flat.furniture":
                try:
                    deep_info_furniture = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    deep_info_furniture = np.nan    

              

            

            if dt_tag.get("data-name") == "flat.facilities":

                try:
                    flat_facilities = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    flat_facilities = np.nan

            
                

            
            
            
            if dt_tag.get("data-name") == "separated_toilet":

                try:
                    separated_toilet =  dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    separated_toilet = np.nan
                

            

        
            
            
            
            if dt_tag.get("data-name") == "toilet_count":

                try:
                    toilet_count = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    toilet_count = np.nan


        

            
            
            
            
            if dt_tag.get("data-name") == "bathroom":

                try:
                    bathroom = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    bathroom = np.nan

            

            
           
            
            if dt_tag.get("data-name") == "window_side":

                try:
                    window_side = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    window_side = np.nan


            




            
            if dt_tag.get("data-name") == "balcony_count":

                try:
                    balcony_count = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    balcony_count = np.nan


           



            
            if dt_tag.get("data-name") == "loggia_count":

                try:
                    loggia_count = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    loggia_count = np.nan


           



            
            if dt_tag.get("data-name") == "who_match":

                try:
                    who_match = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    who_match = np.nan
           
        


        


        


        

        rooms.append(room)
        areas.append(area)
        floors.append(floor)
        prices.append(price)
        securities.append(security)
        locations.append(location)
        studio_kitchens.append(studio_kitchen)
        zhks.append(zhk)
        prev_dorms.append(prev_dorm)
        furnitures.append(furniture)
        states.append(state)
        deep_info_furnitures.append(deep_info_furniture)
        flat_facilitiess.append(flat_facilities)
        separated_toilets.append(separated_toilet)
        toilet_counts.append(toilet_count)
        bathrooms.append(bathroom)
        window_sides.append(window_side)
        balcony_counts.append(balcony_count)
        loggia_counts.append(loggia_count)
        who_matches.append(who_match)


        time.sleep(3)

for page_num in range(1, 30):
    #We can just add page number at the end
    page_url = f"{naur_url}&page={page_num}"
    web_scraping(page_url)
    # It is essential to wait some time before actions, because a website may restrict its access
    time.sleep(10)



print(rooms)

['2-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '1-комнатная квартира', '1-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '3-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '3-комнатная квартира', '2-комнатная квартира', '3-комнатная квартира', '1-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '1-комнатная квартира', '3-комнатная квартира', '2-комнатная квартира', '3-комнатная кв

In [79]:
data = pd.DataFrame({
    'Room': rooms,
    'Area': areas,
    'Floor': floors,
    'Price': prices,
    'Security': securities,
    'Location': locations,
    'Zhk': zhks,
    'Studio_kitchen': studio_kitchens,
    'Previous_dorm': prev_dorms,
    'Furniture': furnitures,
    'State': states,
    'Deep_info_furniture': deep_info_furnitures,
    'Flat_facilities': flat_facilitiess,
    'Separated_toilet': separated_toilets,
    'Toilet_count': toilet_counts,
    'Bathroom': bathrooms,
    'Window_side': window_sides,
    'Balcony_count': balcony_counts,
    'Loggia_count': loggia_counts,
    'Who_can_stay': who_matches
})

data.to_csv('podrob_nauryzb_krisha.csv', index=False)




# AUEZOV

In [80]:
auez_url = "https://krisha.kz/arenda/kvartiry/almaty-aujezovskij/?das[live.rooms][0]=1&das[live.rooms][1]=2&rent-period-switch=%2Farenda%2Fkvartiry"

headers = {
    "accept":  "*/*",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
}


rooms = []
areas = []
floors = []
prices = []
securities = []
locations = []
states = []
furnitures = []
prev_dorms = []
zhks = []
studio_kitchens = []
deep_info_furnitures = []
flat_facilitiess = []
separated_toilets = []
toilet_counts = []
bathrooms = []
window_sides = []
balcony_counts = []
loggia_counts = []
who_matches = []


# Set up session with retries
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retries))




def web_scraping(url):
    try:
        req = session.get(url, headers=headers, timeout=20)  # Set a timeout of 10 seconds
        req.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return

    src = req.text
    soup = BeautifulSoup(src, "lxml")
    listings = soup.find(class_="a-list a-search-list a-list-with-favs").find_all(class_="a-card__inc")

    for jobs in listings:
        try:
            href = jobs.find(class_="a-card__main-info").find(class_="a-card__header-left").find(class_="a-card__title").get("href")
            house_url = f'https://krisha.kz{href}'
            house_response = session.get(house_url, headers=headers, timeout=10)  # Add timeout here too
            house_soup = BeautifulSoup(house_response.text, 'lxml')
            houses = house_soup.find(class_="offer__container").find(class_="offer__advert-info")
            deep_houses = house_soup.find(class_ = "offer__description")
        except (AttributeError, requests.exceptions.RequestException):
            continue






        
              
        title = jobs.find(class_ = "a-card__header-left").find(class_ = "a-card__title").text.strip()
        


        
        parts = [part.strip() for part in title.split(",")]
        
         


        
        room = parts[0] if len(parts) > 0 else ""
        

        
        try:
            area = houses.find(attrs={"data-name": "live.square"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            area = np.nan    


        try:
            floor = houses.find(attrs={"data-name": "flat.floor"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            floor = np.nan


        try:
            state = houses.find(attrs={"data-name": "flat.rent_renovation"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            state = np.nan



        try:
            furniture = houses.find(attrs={"data-name": "live.furniture"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            furniture = np.nan


        try:
            security = houses.find(attrs={"data-name": "flat.security"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            security = np.nan


        try:
            prev_dorm = houses.find(attrs={"data-name": "flat.priv_dorm"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            prev_dorm = np.nan

        
        try:
            zhk = houses.find(attrs={"data-name": "map.complex"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            zhk = np.nan

        
        try:
            studio_kitchen = houses.find(attrs={"data-name": "kitchen_studio"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            studio_kitchen = np.nan
        

        try:
            price = houses.find(class_ = "offer__price").text.strip()
        except AttributeError:
            price = np.nan


        try:
            location = houses.find(class_ = "offer__info-item").find(class_ = "offer__location offer__advert-short-info").text.strip()
        except AttributeError:
            location = np.nan



        deep_info_furniture = np.nan
        flat_facilities = np.nan
        separated_toilet = np.nan
        toilet_count = np.nan
        bathroom = np.nan
        window_side = np.nan
        balcony_count = np.nan
        loggia_count = np.nan
        who_match = np.nan

        
        for dt_tag in deep_houses.find_all("dt"):



            
            

            
            if dt_tag.get("data-name") == "flat.furniture":
                try:
                    deep_info_furniture = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    deep_info_furniture = np.nan    

              

            

            if dt_tag.get("data-name") == "flat.facilities":

                try:
                    flat_facilities = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    flat_facilities = np.nan

            
                

            
            
            
            if dt_tag.get("data-name") == "separated_toilet":

                try:
                    separated_toilet =  dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    separated_toilet = np.nan
                

            

        
            
            
            
            if dt_tag.get("data-name") == "toilet_count":

                try:
                    toilet_count = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    toilet_count = np.nan


        

            
            
            
            
            if dt_tag.get("data-name") == "bathroom":

                try:
                    bathroom = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    bathroom = np.nan

            

            
           
            
            if dt_tag.get("data-name") == "window_side":

                try:
                    window_side = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    window_side = np.nan


            




            
            if dt_tag.get("data-name") == "balcony_count":

                try:
                    balcony_count = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    balcony_count = np.nan


           



            
            if dt_tag.get("data-name") == "loggia_count":

                try:
                    loggia_count = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    loggia_count = np.nan


           



            
            if dt_tag.get("data-name") == "who_match":

                try:
                    who_match = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    who_match = np.nan
           
        


        


        


        

        rooms.append(room)
        areas.append(area)
        floors.append(floor)
        prices.append(price)
        securities.append(security)
        locations.append(location)
        studio_kitchens.append(studio_kitchen)
        zhks.append(zhk)
        prev_dorms.append(prev_dorm)
        furnitures.append(furniture)
        states.append(state)
        deep_info_furnitures.append(deep_info_furniture)
        flat_facilitiess.append(flat_facilities)
        separated_toilets.append(separated_toilet)
        toilet_counts.append(toilet_count)
        bathrooms.append(bathroom)
        window_sides.append(window_side)
        balcony_counts.append(balcony_count)
        loggia_counts.append(loggia_count)
        who_matches.append(who_match)


        time.sleep(3)

for page_num in range(1, 35):
    #We can just add page number at the end
    page_url = f"{auez_url}&page={page_num}"
    web_scraping(page_url)
    # It is essential to wait some time before actions, because a website may restrict its access
    time.sleep(10)



print(rooms)

['2-комнатная квартира', '1-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '1-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '1-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная кв

In [82]:
data = pd.DataFrame({
    'Room': rooms,
    'Area': areas,
    'Floor': floors,
    'Price': prices,
    'Security': securities,
    'Location': locations,
    'Zhk': zhks,
    'Studio_kitchen': studio_kitchens,
    'Previous_dorm': prev_dorms,
    'Furniture': furnitures,
    'State': states,
    'Deep_info_furniture': deep_info_furnitures,
    'Flat_facilities': flat_facilitiess,
    'Separated_toilet': separated_toilets,
    'Toilet_count': toilet_counts,
    'Bathroom': bathrooms,
    'Window_side': window_sides,
    'Balcony_count': balcony_counts,
    'Loggia_count': loggia_counts,
    'Who_can_stay': who_matches
})

data.to_csv('podrob_auezov_krisha.csv', index=False)




In [1]:
# ALMALY

In [83]:
almal_url = "https://krisha.kz/arenda/kvartiry/almaty-bostandykskij/?das[live.rooms][0]=1&das[live.rooms][1]=2&rent-period-switch=%2Farenda%2Fkvartiry"

headers = {
    "accept":  "*/*",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
}


rooms = []
areas = []
floors = []
prices = []
securities = []
locations = []
states = []
furnitures = []
prev_dorms = []
zhks = []
studio_kitchens = []
deep_info_furnitures = []
flat_facilitiess = []
separated_toilets = []
toilet_counts = []
bathrooms = []
window_sides = []
balcony_counts = []
loggia_counts = []
who_matches = []


# Set up session with retries
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retries))




def web_scraping(url):
    try:
        req = session.get(url, headers=headers, timeout=20)  # Set a timeout of 10 seconds
        req.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return

    src = req.text
    soup = BeautifulSoup(src, "lxml")
    listings = soup.find(class_="a-list a-search-list a-list-with-favs").find_all(class_="a-card__inc")

    for jobs in listings:
        try:
            href = jobs.find(class_="a-card__main-info").find(class_="a-card__header-left").find(class_="a-card__title").get("href")
            house_url = f'https://krisha.kz{href}'
            house_response = session.get(house_url, headers=headers, timeout=10)  # Add timeout here too
            house_soup = BeautifulSoup(house_response.text, 'lxml')
            houses = house_soup.find(class_="offer__container").find(class_="offer__advert-info")
            deep_houses = house_soup.find(class_ = "offer__description")
        except (AttributeError, requests.exceptions.RequestException):
            continue






        
              
        title = jobs.find(class_ = "a-card__header-left").find(class_ = "a-card__title").text.strip()
        


        
        parts = [part.strip() for part in title.split(",")]
        
         


        
        room = parts[0] if len(parts) > 0 else ""
        

        
        try:
            area = houses.find(attrs={"data-name": "live.square"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            area = np.nan    


        try:
            floor = houses.find(attrs={"data-name": "flat.floor"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            floor = np.nan


        try:
            state = houses.find(attrs={"data-name": "flat.rent_renovation"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            state = np.nan



        try:
            furniture = houses.find(attrs={"data-name": "live.furniture"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            furniture = np.nan


        try:
            security = houses.find(attrs={"data-name": "flat.security"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            security = np.nan


        try:
            prev_dorm = houses.find(attrs={"data-name": "flat.priv_dorm"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            prev_dorm = np.nan

        
        try:
            zhk = houses.find(attrs={"data-name": "map.complex"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            zhk = np.nan

        
        try:
            studio_kitchen = houses.find(attrs={"data-name": "kitchen_studio"}).find(class_ = "offer__advert-short-info").text.strip()
        except AttributeError:
            studio_kitchen = np.nan
        

        try:
            price = houses.find(class_ = "offer__price").text.strip()
        except AttributeError:
            price = np.nan


        try:
            location = houses.find(class_ = "offer__info-item").find(class_ = "offer__location offer__advert-short-info").text.strip()
        except AttributeError:
            location = np.nan



        deep_info_furniture = np.nan
        flat_facilities = np.nan
        separated_toilet = np.nan
        toilet_count = np.nan
        bathroom = np.nan
        window_side = np.nan
        balcony_count = np.nan
        loggia_count = np.nan
        who_match = np.nan

        
        for dt_tag in deep_houses.find_all("dt"):



            
            

            
            if dt_tag.get("data-name") == "flat.furniture":
                try:
                    deep_info_furniture = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    deep_info_furniture = np.nan    

              

            

            if dt_tag.get("data-name") == "flat.facilities":

                try:
                    flat_facilities = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    flat_facilities = np.nan

            
                

            
            
            
            if dt_tag.get("data-name") == "separated_toilet":

                try:
                    separated_toilet =  dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    separated_toilet = np.nan
                

            

        
            
            
            
            if dt_tag.get("data-name") == "toilet_count":

                try:
                    toilet_count = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    toilet_count = np.nan


        

            
            
            
            
            if dt_tag.get("data-name") == "bathroom":

                try:
                    bathroom = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    bathroom = np.nan

            

            
           
            
            if dt_tag.get("data-name") == "window_side":

                try:
                    window_side = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    window_side = np.nan


            




            
            if dt_tag.get("data-name") == "balcony_count":

                try:
                    balcony_count = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    balcony_count = np.nan


           



            
            if dt_tag.get("data-name") == "loggia_count":

                try:
                    loggia_count = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    loggia_count = np.nan


           



            
            if dt_tag.get("data-name") == "who_match":

                try:
                    who_match = dt_tag.find_next_sibling("dd").text.strip()
                except AttributeError:
                    who_match = np.nan
           
        


        


        


        

        rooms.append(room)
        areas.append(area)
        floors.append(floor)
        prices.append(price)
        securities.append(security)
        locations.append(location)
        studio_kitchens.append(studio_kitchen)
        zhks.append(zhk)
        prev_dorms.append(prev_dorm)
        furnitures.append(furniture)
        states.append(state)
        deep_info_furnitures.append(deep_info_furniture)
        flat_facilitiess.append(flat_facilities)
        separated_toilets.append(separated_toilet)
        toilet_counts.append(toilet_count)
        bathrooms.append(bathroom)
        window_sides.append(window_side)
        balcony_counts.append(balcony_count)
        loggia_counts.append(loggia_count)
        who_matches.append(who_match)


        time.sleep(3)

for page_num in range(1, 35):
    #We can just add page number at the end
    page_url = f"{almal_url}&page={page_num}"
    web_scraping(page_url)
    # It is essential to wait some time before actions, because a website may restrict its access
    time.sleep(10)



print(rooms)

['1-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '1-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная квартира', '2-комнатная кв

In [84]:
data = pd.DataFrame({
    'Room': rooms,
    'Area': areas,
    'Floor': floors,
    'Price': prices,
    'Security': securities,
    'Location': locations,
    'Zhk': zhks,
    'Studio_kitchen': studio_kitchens,
    'Previous_dorm': prev_dorms,
    'Furniture': furnitures,
    'State': states,
    'Deep_info_furniture': deep_info_furnitures,
    'Flat_facilities': flat_facilitiess,
    'Separated_toilet': separated_toilets,
    'Toilet_count': toilet_counts,
    'Bathroom': bathrooms,
    'Window_side': window_sides,
    'Balcony_count': balcony_counts,
    'Loggia_count': loggia_counts,
    'Who_can_stay': who_matches
})

data.to_csv('podrob_almal_krisha.csv', index=False)


