In [1]:
%pip install beautifulsoup4 pandas requests sqlalchemy psycopg2-binary --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests  # For making HTTP requests to websites
from bs4 import BeautifulSoup  # For parsing HTML content
import pandas as pd  # For creating and manipulating dataframes
from typing import List, Dict  # For type hints (makes code more readable)
import time  # For adding delays between requests (good web scraping etiquette)
import sqlalchemy  # For database interaction
from sqlalchemy import create_engine, Column, Integer, Float, String, DateTime, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy import inspect
from datetime import datetime

In [3]:
def scrape_pages(start_page: int, end_page: int) -> pd.DataFrame:
    """
    Scrapes house listings from buyrentkenya.com across multiple pages.
    
    WHAT THIS FUNCTION DOES:
    - Loops through multiple pages of house listings
    - Extracts key information: title, price, location, bedrooms, bathrooms, size
    - Handles errors gracefully if a page fails to load
    - Returns all data in a pandas DataFrame
    
    Args:
        start_page: The first page number to scrape (e.g., 1)
        end_page: The last page number to scrape (e.g., 4)
    
    Returns:
        DataFrame containing all scraped property data
    
    Example:
        df = scrape_pages(1, 3)  # Scrapes pages 1, 2, and 3
    """
    
    # BASE URL: The main website address we're scraping
    # We'll add page numbers to this later (e.g., ?page=1, ?page=2)
    base_url = 'https://www.buyrentkenya.com/houses-for-sale'
    
    # HEADERS: Tell the website we're a real browser, not a bot
    # Websites often block requests without proper headers
    # User-Agent identifies what browser/device is making the request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    
    # PROPERTIES LIST: Will store all the house data we extract
    # Each house will be a dictionary added to this list
    properties = []
    
    # LOOP THROUGH PAGES: Iterate from start_page to end_page
    # range(1, 5) gives us [1, 2, 3, 4]
    for page_num in range(start_page, end_page + 1):
        
        # BUILD THE URL: Add page number to base URL
        # Example: 'https://www.buyrentkenya.com/houses-for-sale?page=1'
        url = f'{base_url}?page={page_num}'
        
        # LOG PROGRESS: Print which page we're currently scraping
        print(f"üîç Scraping page {page_num}: {url}")
        
        try:
            # MAKE HTTP REQUEST: Ask the website for the page content
            # timeout=10 means give up after 10 seconds if no response
            response = requests.get(url, headers=headers, timeout=10)
            
            # CHECK IF REQUEST WAS SUCCESSFUL
            # Status code 200 means "OK" - page loaded successfully
            # Other codes: 404 = Not Found, 500 = Server Error, etc.
            if response.status_code != 200:
                print(f"‚ö†Ô∏è  Failed to retrieve page {page_num}. "
                      f"Status code: {response.status_code}")
                continue  # Skip to next page
            
            # PARSE HTML: Convert raw HTML into a searchable structure
            # 'html.parser' is the parsing engine (built into Python)
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # FIND ALL LISTINGS: Look for all house listing cards on the page
            # Each listing is in a <div> tag with class="listing-card"
            listings = soup.find_all('div', class_='listing-card')
            
            # LOG HOW MANY LISTINGS FOUND
            print(f"   ‚úì Found {len(listings)} listings on page {page_num}")
            
            # LOOP THROUGH EACH LISTING: Extract data from each house
            for listing in listings:
                
                # --- EXTRACT TITLE ---
                # Find the <h2> tag that contains the property title
                title_tag = listing.find('h2')
                # If tag exists, get text and remove extra spaces; otherwise use 'No title'
                title = title_tag.get_text(strip=True) if title_tag else 'No title'
                
                # --- EXTRACT PRICE ---
                # Find the <p> tag with specific class that contains the price
                price_tag = listing.find('p', class_='text-xl font-bold leading-7 text-grey-900')
                price = price_tag.get_text(strip=True) if price_tag else 'No price'
                
                # --- EXTRACT LOCATION ---
                # Find the <p> tag with specific class for location
                location_tag = listing.find('p', class_='ml-1 truncate text-sm font-normal capitalize text-grey-650')
                location = location_tag.get_text(strip=True) if location_tag else 'No location'
                
                # --- EXTRACT BEDROOMS, BATHROOMS, SIZE ---
                # These are in a scrollable list (swiper slides)
                # Initialize with 'N/A' in case they're not found
                bedrooms = bathrooms = size = 'N/A'
                
                # Find the div containing the scrollable list
                swiper_div = listing.find('div', class_='scrollable-list')
                
                if swiper_div:
                    # Find all individual slides within the scrollable list
                    slides = swiper_div.find_all('div', class_='swiper-slide')
                    
                    # Loop through each slide to identify what it contains
                    for slide in slides:
                        text = slide.get_text(strip=True)
                        
                        # Check what type of information this slide has
                        if 'Bedroom' in text:
                            bedrooms = text
                        elif 'Bathroom' in text:
                            bathrooms = text
                        elif 'm¬≤' in text:  # Square meters symbol
                            size = text
                
                # --- STORE THE DATA ---
                # Create a dictionary with all extracted information
                # Append it to our properties list
                properties.append({
                    'Title': title,
                    'Price': price,
                    'Location': location,
                    'Bedrooms': bedrooms,
                    'Bathrooms': bathrooms,
                    'Size': size
                })
            
            # BE POLITE: Wait 1 second before requesting next page
            # This prevents overwhelming the server (good web scraping etiquette)
            if page_num < end_page:  # Don't wait after the last page
                time.sleep(1)
        
        except requests.RequestException as e:
            # HANDLE ERRORS: If something goes wrong (no internet, timeout, etc.)
            print(f"‚ùå Error scraping page {page_num}: {str(e)}")
            continue  # Skip to next page
    
    # CONVERT TO DATAFRAME: Turn our list of dictionaries into a pandas DataFrame
    # DataFrames are like Excel spreadsheets - easy to analyze and manipulate
    df = pd.DataFrame(properties)
    
    # LOG FINAL RESULT
    print(f"\n‚úÖ Scraping complete! Total properties extracted: {len(df)}")
    
    return df

In [4]:
print("=" * 60)
print("STARTING WEB SCRAPING PROCESS")
print("=" * 60)

# Call the function and store results in df_all_pages
df_all_pages = scrape_pages(start_page=1, end_page=4)

# Display first 5 rows to verify data was scraped correctly
print("\n" + "=" * 60)
print("PREVIEW OF SCRAPED DATA")
print("=" * 60)
df_all_pages.head()

STARTING WEB SCRAPING PROCESS
üîç Scraping page 1: https://www.buyrentkenya.com/houses-for-sale?page=1
‚ö†Ô∏è  Failed to retrieve page 1. Status code: 404
üîç Scraping page 2: https://www.buyrentkenya.com/houses-for-sale?page=2
   ‚úì Found 25 listings on page 2
üîç Scraping page 3: https://www.buyrentkenya.com/houses-for-sale?page=3
   ‚úì Found 25 listings on page 3
üîç Scraping page 4: https://www.buyrentkenya.com/houses-for-sale?page=4
   ‚úì Found 25 listings on page 4

‚úÖ Scraping complete! Total properties extracted: 75

PREVIEW OF SCRAPED DATA


Unnamed: 0,Title,Price,Location,Bedrooms,Bathrooms,Size
0,3 Bed House with En Suite in Ruiru,"KSh 10,500,000",No location,3 Bedrooms,,
1,5 Bed House with En Suite at Ruiru,"KSh 22,000,000",No location,5 Bedrooms,6 Bathrooms,
2,4 Bed House with En Suite in Mtwapa,"KSh 85,000,000",No location,4 Bedrooms,5 Bathrooms,450 m¬≤
3,5 Bed Villa with En Suite in Lavington,"KSh 78,000,000",No location,5 Bedrooms,6 Bathrooms,
4,4 Bed House with En Suite in Loresho,"KSh 40,000,000",No location,4 Bedrooms,5 Bathrooms,


In [5]:
print("\nüìä DATASET INFORMATION:")
print(f"Total rows: {len(df_all_pages)}")
print(f"Total columns: {len(df_all_pages.columns)}")
print(f"\nColumn names: {list(df_all_pages.columns)}")
print(f"\nData types:\n{df_all_pages.dtypes}")
print(f"\nMissing values:\n{df_all_pages.isnull().sum()}")

# Show some sample data
print("\nüìã SAMPLE DATA:")
df_all_pages.head(10)


üìä DATASET INFORMATION:
Total rows: 75
Total columns: 6

Column names: ['Title', 'Price', 'Location', 'Bedrooms', 'Bathrooms', 'Size']

Data types:
Title        object
Price        object
Location     object
Bedrooms     object
Bathrooms    object
Size         object
dtype: object

Missing values:
Title        0
Price        0
Location     0
Bedrooms     0
Bathrooms    0
Size         0
dtype: int64

üìã SAMPLE DATA:


Unnamed: 0,Title,Price,Location,Bedrooms,Bathrooms,Size
0,3 Bed House with En Suite in Ruiru,"KSh 10,500,000",No location,3 Bedrooms,,
1,5 Bed House with En Suite at Ruiru,"KSh 22,000,000",No location,5 Bedrooms,6 Bathrooms,
2,4 Bed House with En Suite in Mtwapa,"KSh 85,000,000",No location,4 Bedrooms,5 Bathrooms,450 m¬≤
3,5 Bed Villa with En Suite in Lavington,"KSh 78,000,000",No location,5 Bedrooms,6 Bathrooms,
4,4 Bed House with En Suite in Loresho,"KSh 40,000,000",No location,4 Bedrooms,5 Bathrooms,
5,6 Bed House with En Suite in Gikambura,"KSh 25,000,000",No location,6 Bedrooms,,
6,5 Bed House with Staff Quarters in Muthaiga,"KSh 104,900,000",No location,5 Bedrooms,3 Bathrooms,
7,4 Bed House with En Suite in Ruiru,"KSh 14,500,000",No location,4 Bedrooms,5 Bathrooms,
8,4 Bed Villa with En Suite at Tigoni Limuru Kia...,"KSh 32,000,000",No location,4 Bedrooms,5 Bathrooms,
9,4 Bed House with En Suite at Mtwapa,"KSh 11,500,000",No location,4 Bedrooms,4 Bathrooms,


In [6]:
import re  # Regular expressions - for pattern matching and text extraction
import numpy as np  # Numerical operations and handling missing values

In [7]:
def clean_price(price_str: str) -> float:
    """
    Converts price string to numeric value.
    
    PROBLEM: Prices are strings like "KSh 20,000,000" or "No price"
    SOLUTION: Extract only the numbers and convert to float
    
    Examples:
        "KSh 20,000,000" ‚Üí 20000000.0
        "KSh 6,500,000" ‚Üí 6500000.0
        "No price" ‚Üí NaN (Not a Number)
        "N/A" ‚Üí NaN
    
    Args:
        price_str: The raw price string from website
    
    Returns:
        Numeric price value or NaN if invalid
    """
    
    # CHECK FOR INVALID DATA: If price is missing or invalid
    if not price_str or price_str in ['No price', 'N/A', '']:
        return np.nan  # Return NaN (pandas way of saying "missing data")
    
    try:
        # STEP 1: Remove "KSh" and any spaces
        # Example: "KSh 20,000,000" ‚Üí "20,000,000"
        clean_str = price_str.replace('KSh', '').strip()
        
        # STEP 2: Remove commas (used as thousands separators)
        # Example: "20,000,000" ‚Üí "20000000"
        clean_str = clean_str.replace(',', '')
        
        # STEP 3: Convert to float (decimal number)
        # Example: "20000000" ‚Üí 20000000.0
        return float(clean_str)
    
    except (ValueError, AttributeError):
        # If conversion fails (e.g., text that can't be converted to number)
        return np.nan

In [8]:
def extract_number_from_text(text: str) -> float:
    """
    Extracts the numeric value from text like "4 Bedrooms" or "5 Bathrooms".
    
    PROBLEM: Bedrooms/Bathrooms are strings like "4 Bedrooms"
    SOLUTION: Extract just the number part (returns float to allow NaN)
    
    Examples:
        "4 Bedrooms" ‚Üí 4.0
        "5 Bathrooms" ‚Üí 5.0
        "N/A" ‚Üí NaN
        "Studio" ‚Üí 0.0 (studios have 0 separate bedrooms)
    
    Args:
        text: The raw text containing a number
    
    Returns:
        The extracted number as float or NaN if not found
    """
    
    # CHECK FOR INVALID DATA
    if not text or text in ['N/A', 'No data', '']:
        return np.nan
    
    # SPECIAL CASE: Studio apartments (0 bedrooms)
    if 'studio' in text.lower():
        return 0.0
    
    try:
        # USE REGEX TO FIND NUMBERS
        # \d+ means "one or more digits"
        # re.search() finds the first match in the string
        match = re.search(r'\d+', text)
        
        if match:
            # match.group() returns the matched text
            # Example: from "4 Bedrooms", extracts "4"
            return float(match.group())
        else:
            return np.nan
    
    except (ValueError, AttributeError):
        return np.nan

In [9]:
def clean_size(size_str: str) -> float:
    """
    Extracts numeric size value from text like "150 m¬≤".
    
    PROBLEM: Size is string like "150 m¬≤" or "N/A"
    SOLUTION: Extract just the number (square meters)
    
    Examples:
        "150 m¬≤" ‚Üí 150.0
        "2,500 m¬≤" ‚Üí 2500.0
        "N/A" ‚Üí NaN
    
    Args:
        size_str: The raw size string
    
    Returns:
        Numeric size in square meters or NaN
    """
    
    # CHECK FOR INVALID DATA
    if not size_str or size_str in ['N/A', 'No size', '']:
        return np.nan
    
    try:
        # STEP 1: Remove "m¬≤" and any spaces
        clean_str = size_str.replace('m¬≤', '').replace('m2', '').strip()
        
        # STEP 2: Remove commas
        clean_str = clean_str.replace(',', '')
        
        # STEP 3: Convert to float
        return float(clean_str)
    
    except (ValueError, AttributeError):
        return np.nan

In [10]:
def clean_location(location_str: str) -> str:
    """
    Standardizes location text.
    
    PROBLEM: Locations might have inconsistent formatting
    SOLUTION: Clean up extra spaces, standardize capitalization
    
    Examples:
        "  Westlands  " ‚Üí "Westlands"
        "KAREN" ‚Üí "Karen"
    
    Args:
        location_str: The raw location string
    
    Returns:
        Cleaned location string
    """
    
    # CHECK FOR INVALID DATA
    if not location_str or location_str in ['No location', 'N/A', '']:
        return 'Unknown'
    
    # STEP 1: Remove extra spaces
    # STEP 2: Capitalize first letter of each word (title case)
    # Example: "westlands area" ‚Üí "Westlands Area"
    return location_str.strip().title()

In [11]:
def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Applies all cleaning functions to the entire dataframe.
    
    This is the MAIN cleaning function that:
    1. Creates a copy of data (doesn't modify original)
    2. Applies all cleaning functions to appropriate columns
    3. Adds new cleaned columns
    4. Removes rows with critical missing data
    
    Args:
        df: Raw dataframe from web scraping
    
    Returns:
        Cleaned dataframe ready for database storage
    """
    
    print("\nüßπ STARTING DATA CLEANING PROCESS...")
    print("=" * 60)
    
    # CREATE A COPY: Don't modify the original dataframe
    df_clean = df.copy()
    
    # RECORD INITIAL SIZE
    initial_rows = len(df_clean)
    print(f"Initial number of records: {initial_rows}")
    
    # --- CLEAN PRICE COLUMN ---
    print("\n1Ô∏è‚É£ Cleaning Price column...")
    # Apply clean_price() function to every row in Price column
    df_clean['Price_Numeric'] = df_clean['Price'].apply(clean_price)
    # Count how many valid prices we have
    valid_prices = df_clean['Price_Numeric'].notna().sum()
    print(f"   ‚úì Converted {valid_prices}/{initial_rows} prices to numeric")
    
    # --- CLEAN BEDROOMS COLUMN ---
    print("\n2Ô∏è‚É£ Cleaning Bedrooms column...")
    df_clean['Bedrooms_Numeric'] = df_clean['Bedrooms'].apply(extract_number_from_text)
    valid_bedrooms = df_clean['Bedrooms_Numeric'].notna().sum()
    print(f"   ‚úì Extracted {valid_bedrooms}/{initial_rows} bedroom counts")
    
    # --- CLEAN BATHROOMS COLUMN ---
    print("\n3Ô∏è‚É£ Cleaning Bathrooms column...")
    df_clean['Bathrooms_Numeric'] = df_clean['Bathrooms'].apply(extract_number_from_text)
    valid_bathrooms = df_clean['Bathrooms_Numeric'].notna().sum()
    print(f"   ‚úì Extracted {valid_bathrooms}/{initial_rows} bathroom counts")
    
    # --- CLEAN SIZE COLUMN ---
    print("\n4Ô∏è‚É£ Cleaning Size column...")
    df_clean['Size_SqM'] = df_clean['Size'].apply(clean_size)
    valid_sizes = df_clean['Size_SqM'].notna().sum()
    print(f"   ‚úì Extracted {valid_sizes}/{initial_rows} size values")
    
    # --- CLEAN LOCATION COLUMN ---
    print("\n5Ô∏è‚É£ Cleaning Location column...")
    df_clean['Location_Clean'] = df_clean['Location'].apply(clean_location)
    print(f"   ‚úì Standardized all location names")
    
    # --- REMOVE ROWS WITH MISSING CRITICAL DATA ---
    print("\n6Ô∏è‚É£ Removing incomplete records...")
    # We consider Price and Bedrooms as CRITICAL fields
    # If either is missing, the record is not useful
    df_clean = df_clean.dropna(subset=['Price_Numeric', 'Bedrooms_Numeric'])
    final_rows = len(df_clean)
    removed_rows = initial_rows - final_rows
    print(f"   ‚úì Removed {removed_rows} records with missing critical data")
    print(f"   ‚úì Final dataset: {final_rows} records")
    
    # --- ADD METADATA COLUMNS ---
    print("\n7Ô∏è‚É£ Adding metadata columns...")
    # Add timestamp of when data was scraped
    df_clean['Scraped_Date'] = pd.Timestamp.now()
    # Add data source identifier
    df_clean['Source'] = 'buyrentkenya.com'
    print(f"   ‚úì Added Scraped_Date and Source columns")
    
    print("\n" + "=" * 60)
    print("‚úÖ DATA CLEANING COMPLETE!")
    print("=" * 60)
    
    return df_clean

In [12]:
df_cleaned = clean_dataframe(df_all_pages)

# Display the cleaned data
print("\nüìä CLEANED DATA PREVIEW:")
print("\nFirst 5 rows:")
df_cleaned.head()


üßπ STARTING DATA CLEANING PROCESS...
Initial number of records: 75

1Ô∏è‚É£ Cleaning Price column...
   ‚úì Converted 74/75 prices to numeric

2Ô∏è‚É£ Cleaning Bedrooms column...
   ‚úì Extracted 75/75 bedroom counts

3Ô∏è‚É£ Cleaning Bathrooms column...
   ‚úì Extracted 69/75 bathroom counts

4Ô∏è‚É£ Cleaning Size column...
   ‚úì Extracted 18/75 size values

5Ô∏è‚É£ Cleaning Location column...
   ‚úì Standardized all location names

6Ô∏è‚É£ Removing incomplete records...
   ‚úì Removed 1 records with missing critical data
   ‚úì Final dataset: 74 records

7Ô∏è‚É£ Adding metadata columns...
   ‚úì Added Scraped_Date and Source columns

‚úÖ DATA CLEANING COMPLETE!

üìä CLEANED DATA PREVIEW:

First 5 rows:


Unnamed: 0,Title,Price,Location,Bedrooms,Bathrooms,Size,Price_Numeric,Bedrooms_Numeric,Bathrooms_Numeric,Size_SqM,Location_Clean,Scraped_Date,Source
0,3 Bed House with En Suite in Ruiru,"KSh 10,500,000",No location,3 Bedrooms,,,10500000.0,3.0,,,Unknown,2025-12-07 18:45:56.670087,buyrentkenya.com
1,5 Bed House with En Suite at Ruiru,"KSh 22,000,000",No location,5 Bedrooms,6 Bathrooms,,22000000.0,5.0,6.0,,Unknown,2025-12-07 18:45:56.670087,buyrentkenya.com
2,4 Bed House with En Suite in Mtwapa,"KSh 85,000,000",No location,4 Bedrooms,5 Bathrooms,450 m¬≤,85000000.0,4.0,5.0,450.0,Unknown,2025-12-07 18:45:56.670087,buyrentkenya.com
3,5 Bed Villa with En Suite in Lavington,"KSh 78,000,000",No location,5 Bedrooms,6 Bathrooms,,78000000.0,5.0,6.0,,Unknown,2025-12-07 18:45:56.670087,buyrentkenya.com
4,4 Bed House with En Suite in Loresho,"KSh 40,000,000",No location,4 Bedrooms,5 Bathrooms,,40000000.0,4.0,5.0,,Unknown,2025-12-07 18:45:56.670087,buyrentkenya.com


In [13]:
print("\nüîç COMPARISON: RAW vs CLEANED DATA")
print("=" * 60)

print("\nüìã RAW DATA (first row):")
print(df_all_pages.iloc[0])

print("\n\n‚ú® CLEANED DATA (first row):")
print(df_cleaned.iloc[0])

print("\n\nüìà DATA QUALITY SUMMARY:")
print(f"Raw data columns: {len(df_all_pages.columns)}")
print(f"Cleaned data columns: {len(df_cleaned.columns)}")
print(f"\nNew columns added: {set(df_cleaned.columns) - set(df_all_pages.columns)}")


üîç COMPARISON: RAW vs CLEANED DATA

üìã RAW DATA (first row):
Title        3 Bed House with En Suite in Ruiru
Price                            KSh 10,500,000
Location                            No location
Bedrooms                             3 Bedrooms
Bathrooms                                   N/A
Size                                        N/A
Name: 0, dtype: object


‚ú® CLEANED DATA (first row):
Title                3 Bed House with En Suite in Ruiru
Price                                    KSh 10,500,000
Location                                    No location
Bedrooms                                     3 Bedrooms
Bathrooms                                           N/A
Size                                                N/A
Price_Numeric                                10500000.0
Bedrooms_Numeric                                    3.0
Bathrooms_Numeric                                   NaN
Size_SqM                                            NaN
Location_Clean                 

In [None]:
"""
DATABASE CONNECTION STRING FORMAT:
postgresql://username:password@host:port/database_name

Let's break this down:
- postgresql:// ‚Üí The database type (PostgreSQL)
- username ‚Üí Your PostgreSQL username (e.g., 'postgres')
- password ‚Üí Your PostgreSQL password (e.g., '1234')
- host ‚Üí Where database is running (e.g., 'localhost' for your computer)
- port ‚Üí PostgreSQL port (default is 5432)
- database_name ‚Üí Name of the database (e.g., 'house_prices')

EXAMPLE:
postgresql://postgres:1234@localhost:5432/house_prices

SECURITY NOTE: Never hardcode passwords in production code!
For learning purposes, it's okay, but in real projects use environment variables.
"""

# Let's define our connection parameters separately (easier to modify)
DB_USERNAME = 'postgres'      # Change this to your PostgreSQL username
DB_PASSWORD = '7510'          # Change this to your PostgreSQL password
DB_HOST = 'localhost'         # 'localhost' means database is on your computer
DB_PORT = '5432'              # Default PostgreSQL port
DB_NAME = 'house_prices'      # Name of database (we'll create this)

# Build the connection string
DATABASE_URL = f"postgresql://{DB_USERNAME}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"

print("üîó Database Connection String Built:")
print(f"   Connecting to: {DB_HOST}:{DB_PORT}")
print(f"   Database: {DB_NAME}")
print(f"   Username: {DB_USERNAME}")
print("   (Password hidden for security)")

In [None]:
"""
WHAT IS AN ENGINE?
Think of the engine as a "phone line" to your database.
- It manages connections
- It translates Python commands to SQL
- It handles the communication between Python and PostgreSQL

echo=True: Shows SQL commands being executed (helpful for learning!)
"""

print("\nüîß Creating Database Engine...")

try:
    # Create the engine
    engine = create_engine(DATABASE_URL, echo=True)
    
    # Test the connection
    with engine.connect() as connection:
        print("‚úÖ Successfully connected to PostgreSQL database!")
        
except Exception as e:
    print(f"‚ùå Failed to connect to database: {e}")
    print("\nüí° TROUBLESHOOTING TIPS:")
    print("   1. Is PostgreSQL running? Check with: sudo service postgresql status")
    print("   2. Does the database exist? Create it with: createdb house_prices")
    print("   3. Are username/password correct?")
    print("   4. Is the port correct? (Default is 5432)")

In [None]:
"""
WHAT IS A SCHEMA?
A schema is like a blueprint for a table. It defines:
- What columns exist
- What type of data each column holds (Integer, String, Float, etc.)
- Which columns are required vs optional
- Which column is the primary key (unique identifier)

We use SQLAlchemy ORM (Object-Relational Mapping):
- We define tables as Python classes
- SQLAlchemy converts them to actual SQL tables
"""

# Create base class for all our models
Base = declarative_base()

class HouseProperty(Base):
    """
    This class represents the 'properties' table in our database.
    Each attribute becomes a column in the table.
    """
    
    # TABLE NAME: What the table will be called in PostgreSQL
    __tablename__ = 'properties'
    
    # --- DEFINE COLUMNS ---
    
    # PRIMARY KEY: Unique identifier for each property
    # autoincrement=True means PostgreSQL automatically assigns numbers (1, 2, 3, ...)
    id = Column(Integer, primary_key=True, autoincrement=True)
    
    # PROPERTY DETAILS (Original Text Data)
    title = Column(Text, nullable=False)  # nullable=False means this field is REQUIRED
    price_text = Column(String(50))       # Store original price string
    location = Column(String(200))
    bedrooms_text = Column(String(50))
    bathrooms_text = Column(String(50))
    size_text = Column(String(50))
    
    # CLEANED NUMERIC DATA (For Analysis)
    price_numeric = Column(Float)         # Cleaned price as number
    bedrooms_numeric = Column(Integer)    # Number of bedrooms
    bathrooms_numeric = Column(Integer)   # Number of bathrooms
    size_sqm = Column(Float)             # Size in square meters
    location_clean = Column(String(200))
    
    # METADATA (Tracking Information)
    source = Column(String(100))          # Which website (e.g., 'buyrentkenya.com')
    scraped_date = Column(DateTime)       # When was this data collected
    inserted_date = Column(DateTime, default=datetime.utcnow)  # When added to database
    
    def __repr__(self):
        """
        This defines how the object is displayed when you print it.
        Helpful for debugging!
        """
        return f"<Property(id={self.id}, title='{self.title[:30]}...', price={self.price_numeric})>"

print("\nüìã Database Schema Defined:")
print(f"   Table Name: {HouseProperty.__tablename__}")
print(f"   Columns: {len(HouseProperty.__table__.columns)}")
print("\n   Column Details:")
for column in HouseProperty.__table__.columns:
    print(f"      - {column.name}: {column.type}")

In [None]:
"""
This step actually creates the table in your PostgreSQL database.
If the table already exists, it won't create it again (safe to run multiple times).
"""

print("\nüèóÔ∏è  Creating table in database...")

try:
    # Create all tables defined in Base
    Base.metadata.create_all(engine)
    print("‚úÖ Table 'properties' created successfully!")
    
    # Verify table exists
    inspector = inspect(engine)
    if 'properties' in inspector.get_table_names():
        print("‚úÖ Verified: Table exists in database")
        
        # Show table structure
        columns = inspector.get_columns('properties')
        print(f"\n   Table has {len(columns)} columns:")
        for col in columns:
            print(f"      - {col['name']}: {col['type']}")
    
except Exception as e:
    print(f"‚ùå Error creating table: {e}")

In [None]:
"""
We need to convert our pandas DataFrame to a format SQLAlchemy understands.
We'll map DataFrame columns to our table columns.
"""

def prepare_data_for_db(df: pd.DataFrame) -> list:
    """
    Converts DataFrame rows to HouseProperty objects.
    
    WHY DO THIS?
    - SQLAlchemy works with Python objects, not DataFrames
    - Each row becomes a HouseProperty instance
    - This gives us more control and type safety
    
    Args:
        df: Cleaned DataFrame from our scraping
    
    Returns:
        List of HouseProperty objects ready for insertion
    """
    
    print(f"\nüì¶ Preparing {len(df)} records for database insertion...")
    
    property_objects = []
    
    for index, row in df.iterrows():
        """
        iterrows() loops through each row in the DataFrame
        index: Row number (0, 1, 2, ...)
        row: The actual data in that row
        """
        
        # Create a HouseProperty object for this row
        property_obj = HouseProperty(
            # Original text data
            title=row['Title'],
            price_text=row['Price'],
            location=row['Location'],
            bedrooms_text=row['Bedrooms'],
            bathrooms_text=row['Bathrooms'],
            size_text=row['Size'],
            
            # Cleaned numeric data
            price_numeric=row['Price_Numeric'] if pd.notna(row['Price_Numeric']) else None,
            bedrooms_numeric=int(row['Bedrooms_Numeric']) if pd.notna(row['Bedrooms_Numeric']) else None,
            bathrooms_numeric=int(row['Bathrooms_Numeric']) if pd.notna(row['Bathrooms_Numeric']) else None,
            size_sqm=row['Size_SqM'] if pd.notna(row['Size_SqM']) else None,
            location_clean=row['Location_Clean'],
            
            # Metadata
            source=row['Source'],
            scraped_date=row['Scraped_Date']
        )
        
        property_objects.append(property_obj)
    
    print(f"‚úÖ Prepared {len(property_objects)} property objects")
    
    return property_objects

# Convert our cleaned data to database objects
property_records = prepare_data_for_db(df_cleaned)

# Show example of first record
print("\nüìã Example Property Object:")
print(property_records[0])

In [None]:
"""
UNDERSTANDING SESSIONS:
A session is like a "workspace" for database operations.
- You add/modify data in the session
- session.commit() saves all changes to database
- session.rollback() cancels all changes if something goes wrong

WHY USE SESSIONS?
- Safety: If something fails, changes aren't saved
- Efficiency: Multiple operations batched together
- Transactions: All-or-nothing (either all records saved or none)
"""

def insert_data_to_db(property_objects: list, engine):
    """
    Inserts property data into the database.
    
    Args:
        property_objects: List of HouseProperty objects
        engine: Database engine
    
    Returns:
        Number of records successfully inserted
    """
    
    # Create a session factory
    Session = sessionmaker(bind=engine)
    session = Session()
    
    print(f"\nüíæ Inserting {len(property_objects)} records into database...")
    
    try:
        # Add all objects to session
        session.add_all(property_objects)
        
        # Commit transaction (save to database)
        session.commit()
        
        print(f"‚úÖ Successfully inserted {len(property_objects)} records!")
        
        return len(property_objects)
    
    except Exception as e:
        # If anything goes wrong, rollback (undo changes)
        session.rollback()
        print(f"‚ùå Error inserting data: {e}")
        return 0
    
    finally:
        # Always close the session when done
        session.close()

# Insert the data
records_inserted = insert_data_to_db(property_records, engine)

print(f"\nüìä DATABASE INSERTION SUMMARY:")
print(f"   Total records processed: {len(df_cleaned)}")
print(f"   Successfully inserted: {records_inserted}")

In [None]:
"""
Let's read data back from the database to verify it was stored correctly.
This is like a "sanity check" - did everything work?
"""

print("\nüîç VERIFYING DATA IN DATABASE...")
print("=" * 60)

# Query: Read first 5 records from database
query = """
    SELECT 
        id,
        title,
        price_numeric,
        bedrooms_numeric,
        bathrooms_numeric,
        location_clean,
        scraped_date
    FROM properties
    ORDER BY id
    LIMIT 5
"""

# Execute query and load into DataFrame
df_from_db = pd.read_sql(query, engine)

print(f"\n‚úÖ Successfully read {len(df_from_db)} records from database")
print("\nüìã SAMPLE DATA FROM DATABASE:")
print(df_from_db)

# Count total records in database
count_query = "SELECT COUNT(*) as total FROM properties"
total_count = pd.read_sql(count_query, engine)
print(f"\nüìà Total records in database: {total_count['total'].iloc[0]}")

In [None]:
"""
Let's get some basic statistics from our database.
This helps us understand the data we've collected.
"""

print("\nüìä DATABASE STATISTICS")
print("=" * 60)

# Average price
avg_price_query = """
    SELECT AVG(price_numeric) as avg_price
    FROM properties
    WHERE price_numeric IS NOT NULL
"""
avg_price = pd.read_sql(avg_price_query, engine)
print(f"\nüí∞ Average House Price: KSh {avg_price['avg_price'].iloc[0]:,.2f}")

# Price range
price_range_query = """
    SELECT 
        MIN(price_numeric) as min_price,
        MAX(price_numeric) as max_price
    FROM properties
    WHERE price_numeric IS NOT NULL
"""
price_range = pd.read_sql(price_range_query, engine)
print(f"   Cheapest: KSh {price_range['min_price'].iloc[0]:,.2f}")
print(f"   Most Expensive: KSh {price_range['max_price'].iloc[0]:,.2f}")

# Bedroom distribution
bedroom_dist_query = """
    SELECT 
        bedrooms_numeric,
        COUNT(*) as count
    FROM properties
    WHERE bedrooms_numeric IS NOT NULL
    GROUP BY bedrooms_numeric
    ORDER BY bedrooms_numeric
"""
bedroom_dist = pd.read_sql(bedroom_dist_query, engine)
print(f"\nüõèÔ∏è  Bedroom Distribution:")
for _, row in bedroom_dist.iterrows():
    print(f"   {row['bedrooms_numeric']} bedrooms: {row['count']} properties")

# Location distribution
location_dist_query = """
    SELECT 
        location_clean,
        COUNT(*) as count
    FROM properties
    GROUP BY location_clean
    ORDER BY count DESC
    LIMIT 5
"""
location_dist = pd.read_sql(location_dist_query, engine)
print(f"\nüìç Top 5 Locations:")
for _, row in location_dist.iterrows():
    print(f"   {row['location_clean']}: {row['count']} properties")

print("\n" + "=" * 60)
print("‚úÖ STEP 3: DATABASE STORAGE COMPLETE!")
print("=" * 60)