In [3]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import time

In [54]:
base_url = "https://krisha.kz"
search_url = "https://krisha.kz/prodazha/kvartiry/almaty/"

# Initialize a list to store apartment links
apartment_links = []

# Assuming a maximum of 100 pages, adjust as needed
for page_num in range(1, 100):
    # Append the current page number to the search URL
    current_page_url = f"{search_url}?page={page_num}"

    # Send request and parse the page
    response = requests.get(current_page_url)
    time.sleep(1)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Check if the page has any listings
    listings = soup.find_all("a", class_="a-card__title")
    if not listings:
        break  # No more listings, exit the loop

    # Extract and store the links
    for link in listings:
        apartment_links.append(base_url + link.get('href'))

# Print the total number of links found
print(f"Found {len(apartment_links)} apartment links.")

with open('links.csv', 'a', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    for apartment_link in apartment_links:
        csv_writer.writerow(apartment_link)

Found 22 apartment links.


In [81]:
def get_apartment_data(url):
    response = requests.get(url)
    time.sleep(1)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extracting room count
    title_text = soup.find("div", class_="offer__advert-title").find("h1").text
    room_count_match = re.search(r'(\d+)-комнатная', title_text)
    room_count = int(room_count_match.group(1)) if room_count_match else None

    # Extracting quadrature
    try:
        quadrature = float(title_text.split(',')[1].split()[0])
    except IndexError:
        print("Index out of bound for quadrature")

    # Extracting floor
    floor_div = soup.find("div", {"data-name": "flat.floor"})
    if floor_div is not None:
        floor_info_text = floor_div.find("div", class_="offer__advert-short-info").text
        floor_info = floor_info_text.split('из')
        try:
            floor_text = floor_info[0].strip()+"/"+floor_info[-1].strip()
        except IndexError:
            print("Index out of bound for floor_text")
        floor = round(float(floor_info[0].strip()) / float(floor_info[-1].strip()), 2)
    else:
        floor_text = '1/1'
        floor = 1
        
    # Extracting region
    try:
        region_text = soup.find("div", class_="offer__location offer__advert-short-info").find("span").text.split(',')[-1].strip()
        # print(region_text)
    except IndexError:
        print("Index out of bound for region_text")
        
    if region_text == 'Алатауский р-н':
        region = 1e-10
    elif region_text == 'Алмалинский р-н':
        region = 0.075
    elif region_text == 'Ауэзовский р-н':
        region = 0.51
    elif region_text == 'Бостандыкский р-н':
        region = 1
    elif region_text == 'Жетысуский р-н':
        region = 0.076
    elif region_text == 'Медеуский р-н':
        region = 0.69
    elif region_text == 'Наурызбайский р-н':
        region = 0.11
    elif region_text == 'Турксибский р-н':
        region = 0.083
    elif region_text == 'Алматы':
        region = None

    # Extracting year
    year_div = soup.find("div", {"data-name": "house.year"})
    year_text = year_div.find("div", class_="offer__advert-short-info").text
    year = int(''.join(filter(str.isdigit, year_text)))

    # Extracting price
    
    price_text = soup.find(class_='offer__price').text
    price = int("".join(filter(str.isdigit, price_text)))

    # Extracting about
    about_dict = {}

    apartmentAbout = soup.find("div", class_="offer__parameters")

    if len(apartmentAbout) != 1:
        desired_data_names = ['flat.toilet', 'flat.balcony', 'flat.balcony_g', 'flat.door', 'inet.type', 'flat.parking',
                              'live.furniture', 'flat.flooring']

        for desired_data_name in desired_data_names:
            # Find the dt element with the specified data-name
            dt_element = apartmentAbout.find('dt', {"data-name": desired_data_name})

            about_name = desired_data_name.replace('.', '_')
            # print(about_name)
            # If the dt element is found, extract the dd value
            if dt_element:
                dd_element = dt_element.find_next_sibling("dd")
                if dd_element:
                    about_dict[about_name] = dd_element.get_text().strip()
                    # print(dd_element.get_text().strip())
            else:
                about_dict[about_name] = 'no_info'

    # Extracting ceiling
    dt_element = apartmentAbout.find('dt', {"data-name": 'ceiling'})

    if dt_element:
        dd_element = dt_element.find_next_sibling("dd")
        if dd_element:
            try:
                about_dict['ceiling'] = dd_element.get_text().strip().split()[0]
            except IndexError:
                print("Index out of bound for dd_element")
    else:
        about_dict['ceiling'] = None
        
    # Extracting flat.priv_dorm
    dt_element = apartmentAbout.find('dt', {"data-name": 'flat.priv_dorm'})

    if dt_element:
        dd_element = dt_element.find_next_sibling("dd")
        if dd_element:
            if dd_element.get_text().strip() == 'да':
                about_dict['flat_priv_dorm'] = 1
            else:
                about_dict['flat_priv_dorm'] = 0
    else:
        about_dict['flat_priv_dorm'] = 0

    total_dict = {"price": price,
                  "room_count": room_count,
                  "quadrature": quadrature,
                  "floor_text": floor_text,
                  "floor": floor,
                  "region_text": region_text,
                  "region": region,
                  "year": year}
    total_dict.update(about_dict)
    # print(total_dict)
    return total_dict

In [82]:
data = []

df = pd.read_csv('last_links_of_ap.csv')
links = df.iloc[:, 0].values
i=0
for link in links[10000:11000]:
    try:
        print(i, link)
        apartment_data = get_apartment_data(link)
        data.append(apartment_data)
        i+=1
    except Exception as e:
        print(f"Failed to extract data from {link}. Error: {e}")

0 https://krisha.kz/a/show/687747727
1 https://krisha.kz/a/show/688970880
2 https://krisha.kz/a/show/689000294
3 https://krisha.kz/a/show/688999649
4 https://krisha.kz/a/show/688171507
5 https://krisha.kz/a/show/689001087
6 https://krisha.kz/a/show/686661136
7 https://krisha.kz/a/show/689001765
8 https://krisha.kz/a/show/688768024
9 https://krisha.kz/a/show/685944682
10 https://krisha.kz/a/show/688842658
11 https://krisha.kz/a/show/685254807
12 https://krisha.kz/a/show/687344773
13 https://krisha.kz/a/show/688102368
14 https://krisha.kz/a/show/687879883
15 https://krisha.kz/a/show/685316463
16 https://krisha.kz/a/show/688690309
17 https://krisha.kz/a/show/683986051
18 https://krisha.kz/a/show/688227691
19 https://krisha.kz/a/show/688845238
20 https://krisha.kz/a/show/688568852
21 https://krisha.kz/a/show/686593369
22 https://krisha.kz/a/show/688758859
23 https://krisha.kz/a/show/688933749
24 https://krisha.kz/a/show/684760495
25 https://krisha.kz/a/show/686972239
26 https://krisha.kz/a

In [84]:
df = pd.DataFrame(data, columns=["price", "room_count", "quadrature", "floor_text", "floor", "region_text", "region", "year",  
                                 "flat_toilet", "flat_balcony", "flat_balcony_g", 
                                 "flat_door", "inet_type", "flat_parking", "live_furniture", 
                                 "flat_flooring", "ceiling"])
df.to_csv("apartments_data_10000-11000.csv", index=False)
df

Unnamed: 0,price,room_count,quadrature,floor_text,floor,region_text,region,year,flat_toilet,flat_balcony,flat_balcony_g,flat_door,inet_type,flat_parking,live_furniture,flat_flooring,ceiling
0,54000000,2,89.0,8/12,0.67,Алмалинский р-н,0.075,2015,no_info,no_info,no_info,no_info,no_info,no_info,no_info,no_info,
1,54000000,2,62.0,12/12,1.00,Бостандыкский р-н,1.000,2019,совмещенный,лоджия,да,металлическая,оптика,паркинг,полностью,ламинат,3
2,54000000,2,60.0,10/12,0.83,Бостандыкский р-н,1.000,2019,no_info,no_info,no_info,no_info,no_info,паркинг,полностью,no_info,3
3,54000000,2,63.0,5/12,0.42,Бостандыкский р-н,1.000,2019,раздельный,no_info,no_info,no_info,no_info,паркинг,no_info,no_info,3
4,54000000,3,95.0,5/5,1.00,Медеуский р-н,0.690,2021,no_info,балкон,да,металлическая,оптика,no_info,частично,ламинат,2.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,50500000,4,80.0,5/5,1.00,Ауэзовский р-н,0.510,1982,no_info,балкон и лоджия,да,металлическая,no_info,рядом охраняемая стоянка,частично,ламинат,2.8
995,50500000,3,71.0,7/9,0.78,Алмалинский р-н,0.075,1996,no_info,лоджия,no_info,металлическая,no_info,рядом охраняемая стоянка,частично,ламинат,2.7
996,50500000,4,92.0,7/9,0.78,Ауэзовский р-н,0.510,1990,no_info,балкон и лоджия,no_info,металлическая,no_info,no_info,частично,линолеум,2.85
997,50500000,4,95.0,7/9,0.78,Ауэзовский р-н,0.510,1990,no_info,несколько балконов или лоджий,да,металлическая,no_info,no_info,no_info,no_info,2.85


In [79]:
# data = []
# try:
#     print('https://krisha.kz/a/show/671314140')
#     apartment_data = get_apartment_data('https://krisha.kz/a/show/671314140')
#     data.append(apartment_data)
#     print(data)
# except Exception as e:
#     print(f"Failed to extract data from {link}. Error: {e}")

https://krisha.kz/a/show/671314140
Медеуский р-н
[{'price': 198092000, 'room_count': 5, 'quadrature': 175.31, 'floor_text': '1/1', 'floor': 1, 'region_text': 'Медеуский р-н', 'region': 0.69, 'year': 2023, 'ceiling': None, 'flat_priv_dorm': 0}]


In [22]:
df = pd.read_csv('links.csv')
df.shape

(59947, 1)

In [23]:
df = df.drop_duplicates()
df.shape

(36320, 1)

In [24]:
df.to_csv('last_links_of_ap.csv', index=False)

In [166]:
df = pd.read_csv('apartments_data.csv')

In [167]:
m = round(df['ceiling'].mean(),2)
df['ceiling'].fillna(m, inplace=True)
df

Unnamed: 0,price,room_count,quadrature,floor,region,year,flat_toilet,flat_balcony,flat_balcony_g,flat_door,inet_type,flat_parking,live_furniture,flat_flooring,ceiling
0,53000000,2,71.0,0.71,"Алматы, Ауэзовский р-н",2023,no_info,no_info,no_info,no_info,no_info,паркинг,no_info,no_info,2.7
1,27800000,2,43.0,0.25,"Алматы, Ауэзовский р-н",1962,no_info,no_info,no_info,металлическая,no_info,рядом охраняемая стоянка,частично,ламинат,2.77
2,46000000,3,80.7,0.46,"Алматы, Жетысуский р-н",2022,совмещенный,no_info,no_info,металлическая,no_info,паркинг,no_info,no_info,2.75
3,47500000,2,49.0,0.89,"Алматы, Медеуский р-н",1989,no_info,лоджия,да,металлическая,через TV кабель,no_info,полностью,ламинат,2.8
4,21500000,1,33.0,0.3,"Алматы, Ауэзовский р-н",2023,совмещенный,no_info,да,no_info,no_info,no_info,no_info,no_info,2.8
5,26000000,2,64.0,1.0,"Алматы, Жетысуский р-н",2008,no_info,балкон,no_info,деревянная,ADSL,no_info,полностью,ламинат,2.77
6,38500000,1,40.3,1.0,"Алматы, Бостандыкский р-н",2019,совмещенный,no_info,no_info,no_info,оптика,паркинг,полностью,ламинат,2.85
7,23500000,1,50.5,1.0,"Алматы, Наурызбайский р-н",2022,no_info,балкон,да,металлическая,no_info,рядом охраняемая стоянка,частично,ламинат,2.7
8,26000000,2,49.0,0.5,"Алматы, Алатауский р-н",2023,no_info,no_info,no_info,no_info,no_info,no_info,no_info,no_info,2.77
9,63000000,4,100.0,0.9,"Алматы, Бостандыкский р-н",1988,no_info,лоджия,да,металлическая,оптика,no_info,no_info,линолеум,2.77


In [175]:
one_hot_encoded = pd.get_dummies(df, columns=["flat_toilet", "flat_balcony", "flat_balcony_g", 
                                 "flat_door", "inet_type", "flat_parking", "live_furniture", 
                                 "flat_flooring"], 
                                 prefix=["flat_toilet", "flat_balcony", "flat_balcony_g", 
                                 "flat_door", "inet_type", "flat_parking", "live_furniture", 
                                 "flat_flooring"])
one_hot_encoded

Unnamed: 0,price,room_count,quadrature,floor,region,year,ceiling,flat_toilet_no_info,flat_toilet_совмещенный,flat_balcony_no_info,...,inet_type_через TV кабель,flat_parking_no_info,flat_parking_паркинг,flat_parking_рядом охраняемая стоянка,live_furniture_no_info,live_furniture_полностью,live_furniture_частично,flat_flooring_no_info,flat_flooring_ламинат,flat_flooring_линолеум
0,53000000,2,71.0,0.71,"Алматы, Ауэзовский р-н",2023,2.7,True,False,True,...,False,False,True,False,True,False,False,True,False,False
1,27800000,2,43.0,0.25,"Алматы, Ауэзовский р-н",1962,2.77,True,False,True,...,False,False,False,True,False,False,True,False,True,False
2,46000000,3,80.7,0.46,"Алматы, Жетысуский р-н",2022,2.75,False,True,True,...,False,False,True,False,True,False,False,True,False,False
3,47500000,2,49.0,0.89,"Алматы, Медеуский р-н",1989,2.8,True,False,False,...,True,True,False,False,False,True,False,False,True,False
4,21500000,1,33.0,0.3,"Алматы, Ауэзовский р-н",2023,2.8,False,True,True,...,False,True,False,False,True,False,False,True,False,False
5,26000000,2,64.0,1.0,"Алматы, Жетысуский р-н",2008,2.77,True,False,False,...,False,True,False,False,False,True,False,False,True,False
6,38500000,1,40.3,1.0,"Алматы, Бостандыкский р-н",2019,2.85,False,True,True,...,False,False,True,False,False,True,False,False,True,False
7,23500000,1,50.5,1.0,"Алматы, Наурызбайский р-н",2022,2.7,True,False,False,...,False,False,False,True,False,False,True,False,True,False
8,26000000,2,49.0,0.5,"Алматы, Алатауский р-н",2023,2.77,True,False,True,...,False,True,False,False,True,False,False,True,False,False
9,63000000,4,100.0,0.9,"Алматы, Бостандыкский р-н",1988,2.77,True,False,False,...,False,True,False,False,True,False,False,False,False,True


In [136]:
# data = []
# for link in apartment_links[0:1]:
#     try:
#         apartment_data = get_apartment_data(link)
#         # data.append(apartment_data)
#     except Exception as e:
#         print(f"Failed to extract data from {link}. Error: {e}")

In [44]:
# for link in apartment_links[0:2]:
#     try: 
#         url = link
#         response = requests.get(url)
#         soup = BeautifulSoup(response.content, 'html.parser')
        
#         # Locate the section containing "О квартире"
#         # This step might vary based on the actual webpage structure; this is just a hypothetical approach
#         # For this example, let's assume "О квартире" is an h2 heading. Adjust as necessary.
#         section = soup.find('div', class_='offer__parameters')
#         # print(section.contents)
#         if len(section.contents)==1:
#             parent_div = None  # Get the parent div containing this heading and the parameters
#         else:
#             parent_div = section
        
#         if parent_div:
#             # Extract all the dt elements within this section to get all the data-name attributes
#             dt_elements = parent_div.find_all('dt')
            
#             # List to store the data-name attributes
#             data_names = []
            
#             for dt in dt_elements:
#                 data_name = dt.get('data-name')
#                 if data_name:
#                     data_names.append(data_name)
            
#             print(data_names)  # This will print all the data-name attributes
#         else:
#             print("Section 'О квартире' not found.")
#     except Exception as e:
#         print(f"Failed to extract data from {link}. Error: {e}")

In [None]:
# from bs4 import BeautifulSoup
# import requests

# # Fetch the page content
# url = 'https://krisha.kz/a/show/687771355'
# response = requests.get(url)
# soup = BeautifulSoup(response.content, 'html.parser')

# # Define the desired data-name value
# desired_data_name = "flat.door"  # or "flat.balcony_g" or any other value

# # Find the dt element with the specified data-name
# dt_element = soup.find('dt', {"data-name": desired_data_name})

# # If the dt element is found, extract the dd value
# if dt_element:
#     dd_element = dt_element.find_next_sibling("dd")
#     if dd_element:
#         print(dd_element.get_text().strip())
# else:
#     print(f"No element with data-name='{desired_data_name}' found.")
