In [None]:
# Stage 1: Installing the necessary packages
!pip install cloudscraper
!pip install requests
!pip install requests-toolbelt
!pip install beautifulsoup4
!pip install selenium
!pip install pandas

In [3]:
# Stage 2: Importing all the necessary packages
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import tempfile

from bs4 import BeautifulSoup # To work with HTML

import time # Used for stopping the code for a bit

import cloudscraper # Collecting data from individual links
import json # Collecting data from json files
import pandas as pd # Putting this data in a dataframe

In [4]:
# Stage 3: Function for the link collector
def get_links(start_page, end_page):
    # Setting up the webdriver
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Simpler headless mode
    chrome_options.add_argument("--no-sandbox")  # Bypass OS security model
    chrome_options.add_argument("--disable-dev-shm-usage")  # Prevent memory issues
    chrome_options.add_argument(f"--user-data-dir={tempfile.mkdtemp()}")  # Unique temp profile
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    driver = webdriver.Chrome(options=chrome_options)
    driver.set_window_size(1280, 720)

    href_list = [] # initializing the link list
    hrefs = [] # Initializing the list used in the for loop

    for n in range(start_page, end_page+1): # Go over the number of pages
        print(f"fetching page {n}")
    # Opening the link with the webdriver
        driver.get(f"https://www.myhome.ge/s/iyideba-bina-Tbilisshi/?deal_types=1&real_estate_types=1&cities=1&urbans=38,39,40,41,42,43,44,45,46,47,101,28,30,48,106,111,121,29&districts=4&currency_id=1&CardView=2&page={n}")
        num_links = 0
        time_elapsed = 0
        timer_start = time.time()
        while num_links == 0 and time_elapsed < 30: # The link needs time to load, so we are retrying a few times until the link loads, and we get our data
            soup = BeautifulSoup(driver.page_source, 'html.parser') # Converting to beautiful soup to easily extract data
            links = soup.find_all('a', target="_blank") # Finding all <a></a>. Contains the links we need (And unnecessary stuff as well)
            hrefs = [link['href'] for link in links if 'href' in link.attrs and 'pr' in link['href']] # Only links (hrefs) with 'pr' are left
            num_links = len(hrefs) # How many links we managed to extract
            time_elapsed = time.time() - timer_start
        href_list.extend(hrefs) # Adding the links to the link list
        print(f"fetched page {n}. Remaining {end_page-n}")
        print(f"We currently have {len(href_list)} links")
        if time_elapsed > 29: # If you can't get links from a page after waiting 30 seconds, then it is the last page and we should stop
            print("We have reached the last available page")
            break
    return href_list

In [5]:
# Stage 4: Link scraper
def scrape_links(href_list):
    scraper = cloudscraper.create_scraper()
    print("Total links to scrape: ", len(href_list))
    link_num = 1
    error_count = 0
    df = 0
    for link in href_list:
        print(f"Scraping link {link_num} of {len(href_list)}")
        link_data = scraper.get(f"https://www.myhome.ge{link}").text
        soup = BeautifulSoup(link_data, 'html.parser')
        script_tag = soup.find('script', type='application/json')
        json_data = json.loads(script_tag.string)
        try:
              statement = json_data['props']['pageProps']['dehydratedState']['queries'][0]['state']['data']['data']['statement']
              row_data = {
                "price_total_1": statement["price"]["1"]["price_total"],
                "price_total_2": statement["price"]["2"]["price_total"],
                "price_total_3": statement["price"]["3"]["price_total"],
                "Area (sq m)": statement["area"],
                "area_type_id": statement["area_type_id"],
                "price_square_1": statement["price"]["1"]["price_square"],
                "price_square_2": statement["price"]["2"]["price_square"],
                "price_square_3": statement["price"]["3"]["price_square"],
                "owner_name": statement["owner_name"],
                "user_phone_number": statement["user_phone_number"],
                "ID": statement["id"],
                "dynamic_title": statement["dynamic_title"],
                "dynamic_slug": statement["dynamic_slug"],
                "created_at": statement["created_at"],
                "last_updated": statement["last_updated"],
                "views": statement["views"],
                "address": statement["address"],
                "Latitude": statement["lat"],
                "Longitude": statement["lng"],
                "point_coordinates": [str(statement["point"]["coordinates"])],
                "street_id": statement["street_id"],
                "City": statement["city_name"],
                "city_id": statement["city_id"],
                "District": statement["district_name"],
                "district_id": statement["district_id"],
                "Urban": statement["urban_name"],
                "urban_id": statement["urban_id"],
                "nearby_places": str(list(statement.get("nearby_places", {}).keys())) if statement.get("nearby_places") else "No nearby places",
                #"metro_station": statement["metro_station"],
                "metro_station_id": statement.get("metro_station")["id"] if statement.get("metro_station") else None,
                "metro_station_name": statement.get("metro_station")["display_name"] if statement.get("metro_station") else None,
                "room_type_id": statement["room_type_id"],
                "bedroom_type_id": statement["bedroom_type_id"],
                "Floor": statement["floor"],
                "Total Floors": statement["total_floors"],
                "comment": statement["comment"],
                "status_id": statement["status_id"],
                "condition": statement["condition"],
                "condition_id": statement["condition_id"],
                "project_type_id": statement["project_type_id"],
                "project_id": statement["project_id"],
                "project_uuid": statement["project_uuid"],
                "bathroom_type_id":statement["bathroom_type_id"],
                "build_year": statement["build_year"],
                "height": statement["height"],
                "heating_type_id": statement["heating_type_id"],
                "parking_type_id": statement["parking_type_id"],
                "hot_water_type": statement["hot_water_type"],
                "hot_water_type_id": statement["hot_water_type_id"],
                "material_type_id": statement["material_type_id"],
                "balconies": statement["balconies"],
                "balcony_area": statement["balcony_area"],
                "storeroom_type_id": statement["storeroom_type_id"],
                "storeroom_area": statement["storeroom_area"],
                "Parameter_ID": [str([param['id'] for param in statement['parameters']])],
                "Parameter_name": [str([param['key'] for param in statement['parameters']])],
                "Parameter_type": [str([param['type'] for param in statement['parameters']])],
                "deal_type_id": statement["deal_type_id"],
                "real_estate_type_id": statement["real_estate_type_id"],
                "yard_area": statement["yard_area"]}
              if df is 0: # On the first link create a new data frame
                  df = pd.DataFrame([row_data])
              else: # On the other links add to the previous data frame
                  new_df = pd.DataFrame([row_data])
                  df = pd.concat([df, new_df], ignore_index=True)
              print(f"Link {link_num} data has been stored")
        except IndexError:
            print("No data found")
            error_count += 1
            continue
        link_num += 1
    print(error_count, "links had errors")
    return df


  if df is 0: # On the first link create a new data frame


In [6]:
# Stage 5: Scraping and creating a dataset
df = scrape_links(get_links(1,2500)) # The upper limit can be high, as the code will stop on its own
df.to_csv('data.csv', index=False,encoding='utf-8-sig')

fetching page 1
fetched page 1. Remaining 2499
We currently have 20 links
fetching page 2
fetched page 2. Remaining 2498
We currently have 40 links
fetching page 3
fetched page 3. Remaining 2497
We currently have 60 links
fetching page 4
fetched page 4. Remaining 2496
We currently have 80 links
fetching page 5
fetched page 5. Remaining 2495
We currently have 100 links
fetching page 6
fetched page 6. Remaining 2494
We currently have 120 links
fetching page 7
fetched page 7. Remaining 2493
We currently have 140 links
fetching page 8
fetched page 8. Remaining 2492
We currently have 160 links
fetching page 9
fetched page 9. Remaining 2491
We currently have 180 links
fetching page 10
fetched page 10. Remaining 2490
We currently have 200 links
fetching page 11
fetched page 11. Remaining 2489
We currently have 220 links
fetching page 12
fetched page 12. Remaining 2488
We currently have 240 links
fetching page 13
fetched page 13. Remaining 2487
We currently have 260 links
fetching page 14
fetc

In [7]:
# Optional stage: Looking at the data
print(f"The data frame has {len(df)} rows")
print(f"The data frame has {len(df.columns)} columns")
df.head()

The data frame has 33081 rows
The data frame has 59 columns


Unnamed: 0,price_total_1,price_total_2,price_total_3,Area (sq m),area_type_id,price_square_1,price_square_2,price_square_3,owner_name,user_phone_number,...,balconies,balcony_area,storeroom_type_id,storeroom_area,Parameter_ID,Parameter_name,Parameter_type,deal_type_id,real_estate_type_id,yard_area
0,269418,97000,89252,75.0,1,3592,1293,1190,მაია,577151***,...,1.0,76.0,,,"[[2, 3, 1, 10, 6, 8, 43, 44, 45, 46, 19, 21, 2...","[['internet', 'tv', 'gas', 'furniture', 'eleva...","[['parameter', 'parameter', 'parameter', 'furn...",1,1,
1,555500,200000,184026,67.0,1,8291,2985,2747,გიორგი,511220***,...,,,,,"[[2, 3, 10, 6, 8, 43, 44, 45, 46, 19, 20, 24, ...","[['internet', 'tv', 'furniture', 'elevator', '...","[['parameter', 'parameter', 'furniture-equipme...",1,1,
2,277750,100000,92013,55.0,1,5050,1818,1673,გიორგი,511220***,...,,,,,"[[2, 3, 10, 6, 8, 43, 44, 45, 46, 19, 20, 24, ...","[['internet', 'tv', 'furniture', 'elevator', '...","[['parameter', 'parameter', 'furniture-equipme...",1,1,
3,269279,96950,89206,64.0,1,4207,1515,1394,ნიკოლოზი,555276***,...,1.0,1.0,5.0,1.0,"[[2, 3, 1, 10, 32, 6, 8, 34, 35, 43, 9, 40, 13...","[['internet', 'tv', 'gas', 'furniture', 'spa',...","[['parameter', 'parameter', 'parameter', 'furn...",1,1,
4,486063,175000,161022,102.0,1,4765,1716,1579,Natia,599776***,...,,,,,[[]],[[]],[[]],1,1,
