<a href="https://colab.research.google.com/github/cshravankumar/Rentals/blob/main/Newport.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import auth
auth.authenticate_user()
!pip install selenium-wire
from seleniumwire import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
import csv
import re

# Build the Chrome options: each flag below is passed to the browser at launch
chrome_options = Options()
for chrome_flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(chrome_flag)

# Listing page to scrape
url = "https://www.newportrentals.com/apartments-jersey-city-for-rent/"

# Start the browser session with those options and navigate to the page
driver = webdriver.Chrome(options=chrome_options)
driver.get(url)

# Function to scroll and load all apartments
def load_all_apartments():
    """Click the "load more units" button until it stops becoming clickable.

    Uses the module-level ``driver``. Each round waits up to 10 seconds for
    the button, clicks it through JavaScript, then pauses 2 seconds so the
    newly requested units can render. A ``TimeoutException`` raised during a
    round is taken to mean everything is loaded and ends the loop.
    """
    load_more_locator = (
        By.CSS_SELECTOR,
        "a.button.button--hollow.button--load-more-units",
    )
    more_to_load = True
    while more_to_load:
        try:
            button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(load_more_locator)
            )
            # Click via script rather than a native click on the element
            driver.execute_script("arguments[0].click();", button)
            time.sleep(2)  # Give the new batch time to appear
        except TimeoutException:
            # Button never became clickable: nothing further to load
            more_to_load = False

from google.cloud import storage
from datetime import datetime

def write_to_gcs(bucket_name, apartment_data):
    """Write the apartment rows as a date-stamped CSV object in a GCS bucket.

    Args:
        bucket_name: Name of the destination Cloud Storage bucket.
        apartment_data: Iterable of dicts keyed by the column names below.

    The object is named ``apartments_<YYYY-MM-DD>.csv`` using the current
    local date, and always starts with a header row.
    """
    # Column order of the CSV; must cover every key present in the rows
    columns = ['Building', 'Unit', 'Address', 'Bedrooms', 'Bathrooms', 'Square Feet', 'Rent', 'Availability']

    # Date-stamped object name, e.g. apartments_2024-09-30.csv
    today = datetime.now().strftime("%Y-%m-%d")
    object_name = f"apartments_{today}.csv"

    target_blob = storage.Client().bucket(bucket_name).blob(object_name)

    # Stream the CSV text straight into the blob
    with target_blob.open("w", newline='', encoding='utf-8') as out:
        sheet = csv.DictWriter(out, fieldnames=columns)
        sheet.writeheader()
        sheet.writerows(apartment_data)



# Expand the listing by clicking "load more" until it is exhausted
load_all_apartments()

# Fixed pause so the page can settle before it is read
time.sleep(10)

# Report which page was actually loaded
page_title = driver.title
print(f"Page title: {page_title}")

# Collect one element per listed unit
apartments = driver.find_elements(By.CSS_SELECTOR, "div.unit-list-item")
apartment_count = len(apartments)
print(f"\nFound {apartment_count} apartments")

# Function to parse apartment text
def parse_apartment_text(text):
    """Split the visible text of one listing card into its fields.

    The first three lines are taken as building, unit and address. The other
    fields are located by pattern anywhere in the text.

    Args:
        text: Newline-separated text of a single listing element.

    Returns:
        A tuple ``(building, unit, address, bedroom, bathroom, sqft, price,
        availability)`` of strings. A field that cannot be determined is
        reported as ``"Not found"``; a studio is reported as ``"Studio"``.
    """
    missing = "Not found"
    lines = text.split('\n')

    # Pad short cards so a missing line yields "Not found" instead of raising
    # IndexError and aborting the whole scrape.
    building, unit, address = (lines[:3] + [missing] * 3)[:3]

    bedroom_match = re.search(r'(\d+)\s*Bedroom', text)
    if bedroom_match:
        bedroom = bedroom_match.group(1)
    elif "Studio" in text:
        bedroom = "Studio"
    else:
        bedroom = missing

    # Allow half baths ("1.5 Bathroom"); a plain \d+ would capture only "5".
    bathroom_match = re.search(r'(\d+(?:\.\d+)?)\s*Bathroom', text)
    bathroom = bathroom_match.group(1) if bathroom_match else missing

    # Accept a thousands separator ("1,250 Sq Ft"); a plain \d+ would capture
    # only "250". The comma is dropped so the value stays digits-only.
    sqft_match = re.search(r'(\d{1,3}(?:,\d{3})+|\d+)\s*Sq\s*Ft', text)
    sqft = sqft_match.group(1).replace(',', '') if sqft_match else missing

    # Any number of comma groups, so "$1,234,567" is not cut short.
    price_match = re.search(r'\$\d+(?:,\d+)*', text)
    price = price_match.group(0) if price_match else missing

    # Availability is only recognised on the final line of the card.
    availability = lines[-1] if "Available" in lines[-1] else missing

    return building, unit, address, bedroom, bathroom, sqft, price, availability

# Turn every listing element into one row dict, in page order
row_keys = ('Building', 'Unit', 'Address', 'Bedrooms', 'Bathrooms', 'Square Feet', 'Rent', 'Availability')
apartment_data = []
for position, listing in enumerate(apartments, 1):
    print(f"\nApartment {position}:")

    # parse_apartment_text returns its fields in the same order as row_keys
    parsed_fields = parse_apartment_text(listing.text)
    apartment_data.append(dict(zip(row_keys, parsed_fields)))

# Save data to CSV (only if apartments were found)
if apartment_data:
    csv_filename = 'apartments.csv'
    csv_headers = ['Building', 'Unit', 'Address', 'Bedrooms', 'Bathrooms', 'Square Feet', 'Rent', 'Availability']

    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
        writer.writeheader()
        for apartment in apartment_data:
            writer.writerow(apartment)

    print(f"\nData saved to {csv_filename}")

    # Display the first few rows of the CSV
    !head -n 5 {csv_filename}
else:
    print("\nNo apartment data was collected.")

bucket_name = "apartmentrentals-bucketsampling"  # Replace with your bucket name
#file_name = "apartments.csv"
write_to_gcs(bucket_name, apartment_data)


# Close the browser
driver.quit()

Collecting selenium-wire
  Downloading selenium_wire-5.1.0-py3-none-any.whl.metadata (49 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/49.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting brotli>=1.0.9 (from selenium-wire)
  Downloading Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.5 kB)
Collecting kaitaistruct>=0.7 (from selenium-wire)
  Downloading kaitaistruct-0.10-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting selenium>=4.0.0 (from selenium-wire)
  Downloading selenium-4.25.0-py3-none-any.whl.metadata (7.1 kB)
Collecting wsproto>=0.14 (from selenium-wire)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting zstandard>=0.14.1 (from selenium-wire)
  Downloading zstandard-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.m

RefreshError: ("Failed to retrieve http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/?recursive=true from the Google Compute Engine metadata service. Status: 404 Response:\nb''", <google.auth.transport.requests._Response object at 0x7fa72e5e9c00>)

In [None]:
# Write the packages reported by `pip freeze` into requirements.txt
pip freeze > requirements.txt