### **Import Packages**

In [4]:
# Import Packages
from bs4 import BeautifulSoup
from datetime import datetime 
import requests   
import pandas as pd 
import numpy as np
from tqdm import trange
pd.set_option('display.max_rows',1000)
import time
import random
from google.cloud import bigquery

### **Initialize Client**

In [5]:
# initialize client object
client = bigquery.Client()

### **Scrape Real Estate Listings in Nairobi, Kenya**

#### **1.1 Apartments for Rent in Nairobi**

In [7]:
sort_orders = [
    'propertytypes=houses,apartments-flats,townhouses',
    'propertytypes=houses,apartments-flats,townhouses&SortOrder=PriceDescending',
    'propertytypes=houses,apartments-flats,townhouses&SortOrder=AgeAscending',
    'propertytypes=houses,apartments-flats,townhouses&SortOrder=PropertyType',
    'propertytypes=houses,apartments-flats,townhouses&SortOrder=SizeAscending'
]

In [None]:
# Initialize DataFrame outside the loop
bigdata = pd.DataFrame()
selected_order = random.choice(sort_orders)

try:
    for page in trange(1,1500):
        counties = ['nairobi']
        furnished = ['true','false']
        rental_rates = ['day','month','week','year']
        
        for county in counties:
            for furnish in furnished:
                for rate in rental_rates:
                    
                    # specify website url 
                    url = 'https://www.property24.co.ke/property-to-rent-in-'+str(county)+'-p95?rentalterm='+str(rate)+'&isfurnished='+str(furnish)+'&' + str(selected_order) + '&Page=' + str(page)
                    req = requests.get(url, timeout=3600).text 
                    soup = BeautifulSoup(req,'lxml')
                    listings = soup.find_all('span',class_='p24_content')

                    for listing in listings:
                        try:
                            property_title = listing.find('span',class_='p24_propertyTitle').text.strip()
                            property_description = listing.find('span', class_='p24_excerpt').text.strip()
                            feature_details = listing.find_all('span', class_='p24_featureDetails')
                            features = str({feature['title']: feature.text.strip() for feature in feature_details if 'title' in feature.attrs})
                            property_availability = 'For Rent'
                            property_location = listing.find('span',class_='p24_location').text.strip()
                            property_address = listing.find('span',class_='p24_address').text.strip()
                            try:
                                floor_size = listing.find('span', class_='p24_size', title='Floor Size').text.strip()
                            except Exception as e:
                                floor_size = np.NAN
                            furnished = furnish
                            rental_rate = rate
                            property_price = listing.find('span',class_='p24_price').text.strip()
                            last_scraped = datetime.now()
                            
                            # Create DataFrame
                            data = pd.DataFrame({
                                'county':[county],
                                'property_title':[property_title],
                                'property_description':[property_description],
                                'features':[features],
                                'property_availability':[property_availability],
                                'property_location':[property_location],
                                'property_address':[property_address],
                                'floor_size':[floor_size],
                                'furnished':[furnished],
                                'rental_rate':[rental_rate],
                                'property_price':[property_price],
                                'last_scraped':[last_scraped]
                            })

                            # Append data to bigdata DataFrame
                            bigdata = pd.concat([bigdata,data],ignore_index=True)
                            
                        except Exception as e:
                            pass            

    # Handle Database Import Error
    table_id = 'project-adrian-julius-aluoch.cronjobs.real_estate_data'
    job = client.load_table_from_dataframe(bigdata,table_id)
    while job.state != 'DONE':
        time.sleep(4)
        job.reload()
        print(f"Data Upload Status : {job.state}")

except Exception as e:
    pass

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:51<00:00, 51.25s/it]


#### **1.2 Apartmensts for Sale in Nairobi**

In [None]:
# Initialize DataFrame outside the loop
bigdata = pd.DataFrame()
selected_order = random.choice(sort_orders)

try:
    for page in trange(1,1500):
        counties = ['nairobi']
        
        for county in counties:
            # specify website url 
            url = 'https://www.property24.co.ke/property-for-sale-in-'+str(county)+'-p95?' + str(selected_order) + '&Page=' + str(page)
            req = requests.get(url, timeout=3600).text 
            soup = BeautifulSoup(req,'lxml')
            listings = soup.find_all('span',class_='p24_content')

            for listing in listings:
                try:
                    property_title = listing.find('span',class_='p24_propertyTitle').text.strip()
                    property_description = listing.find('span', class_='p24_excerpt').text.strip()
                    feature_details = listing.find_all('span', class_='p24_featureDetails')
                    features = str({feature['title']: feature.text.strip() for feature in feature_details if 'title' in feature.attrs})
                    property_availability = 'For Sale'
                    property_location = listing.find('span',class_='p24_location').text.strip()
                    property_address = listing.find('span',class_='p24_address').text.strip()
                    try:
                        floor_size = listing.find('span',class_='p24_size').text.strip()
                    except Exception as e:
                        floor_size = np.NAN
                    property_price = listing.find('span',class_='p24_price').text.strip()
                    rental_rate = np.NAN
                    furnished = np.NAN
                    last_scraped = datetime.now()
                    
                    # Create DataFrame
                    data = pd.DataFrame({
                        'county':[county],
                        'property_title':[property_title],
                        'property_description':[property_description],
                        'features':[features],
                        'property_availability':[property_availability],
                        'property_location':[property_location],
                        'property_address':[property_address],
                        'floor_size':[floor_size],
                        'furnished':[furnished],
                        'rental_rate':[rental_rate],
                        'property_price':[property_price],
                        'last_scraped':[last_scraped]
                    })

                    # Append data to bigdata DataFrame
                    bigdata = pd.concat([bigdata,data],ignore_index=True)
                    
                except Exception as e:
                    pass   

    # Handle Database Import Error
    table_id = 'project-adrian-julius-aluoch.cronjobs.real_estate_data'
    job = client.load_table_from_dataframe(bigdata,table_id)
    while job.state != 'DONE':
        time.sleep(4)
        job.reload()
        print(f"Data Upload Status : {job.state}")

except Exception as e:
    pass

100%|██████████| 1/1 [00:03<00:00,  3.51s/it]


Data Upload Status : DONE


### **Basic Data Cleaning**

In [63]:
# Define SQL Query to Retrieve Real Estate Data from Google Cloud BigQuery
sql = (
       'SELECT *'
       'FROM `cronjobs.real_estate_data`'
       )

# Run SQL Query
data = client.query(sql, timeout=3600).to_dataframe()
print(f'Rows of Real Estate Data in Google BigQuery : {data.shape[0]:,.0f}\nCols of Real Estate Data in Google BigQuery : {data.shape[1]:,.0f}')

Rows of Real Estate Data in Google BigQuery : 68
Cols of Real Estate Data in Google BigQuery : 12


In [None]:
# Check Total Number of Duplicate Records
data['features'] = data['features'].astype(str)
data['last_scraped'] = pd.to_datetime(data['last_scraped'], unit = 'ns')

duplicated = data.duplicated(subset=[
       'county', 'property_title', 'property_description', 'features', 'property_availability',
       'property_location', 'property_address', 'floor_size', 
       'furnished', 'rental_rate', 'property_price'
                                    ]).sum()

# Remove Duplicate Records
data.drop_duplicates(subset=[
        'county', 'property_title', 'property_description', 'features', 'property_availability',
       'property_location', 'property_address', 'floor_size', 
       'furnished', 'rental_rate', 'property_price'
                            ],inplace=True)

# Display Initial & Final Number of Duplicate Records
print(f"Initial Shape of Dataset : {data.shape}\nTotal Duplicate Records : {duplicated:,.0f}\nFinal Shape of Dataset : {data.shape}")

Initial Shape of Dataset : (68, 12)
Total Duplicate Records : 0
Final Shape of Dataset : (68, 12)


In [62]:
# Drop Original Real Estate Table 
table_id = 'project-adrian-julius-aluoch.cronjobs.real_estate_data'
client.delete_table(table_id)

# Upload Final Real Estate Table
job = client.load_table_from_dataframe(data,table_id)
while job.state != 'DONE':
    time.sleep(1)
    job.reload()
    print(f'Real Estate Data Update : {job.state}')

Real Estate Data Update : RUNNING
Real Estate Data Update : DONE
