# This is the apartments.com scraping notebook!
## This notebook does not sort through or clean the data. 

In [1]:
import os
import random
import re
import time
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Input desired locations in the cell below

In [2]:
# State abbreviation(s) to search in; kept as a list.
state_initials = ['nj']

# Towns to search; add or remove entries as needed.
search_locations = [
    'somerset',
    'hillsborough',
    'raritan',
    'manville',
    'bound brook',
    'new brunswick',
    'franklin township',
]

In [None]:
def get_firefox_useragent_list():
    """Download the list of Firefox user-agent strings from GitHub.

    Returns:
        list[str]: one user-agent string per entry, stripped of surrounding
        whitespace, with blank lines removed.

    Raises:
        requests.exceptions.RequestException: if the request fails, times
            out, or the server answers with an HTTP error status.
    """
    ua_page = requests.get(
        'https://raw.githubusercontent.com/tamimibrahim17/List-of-user-agents/master/Firefox.txt',
        timeout=30,
    )
    # Fail loudly instead of treating an error page's text as user agents.
    ua_page.raise_for_status()
    # Drop blank lines (e.g. the one produced by a trailing newline) so a
    # random pick can never be an empty User-Agent string.
    return [line.strip() for line in ua_page.text.splitlines() if line.strip()]

def hit_page(url, user_agent, timeout=30):
    """Fetch a page and parse it into a BeautifulSoup tree.

    Args:
        url: address of the page to request.
        user_agent: value sent in the 'User-Agent' request header.
        timeout: seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on a stalled connection.

    Returns:
        BeautifulSoup: the response body parsed with 'html.parser'.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    page = requests.get(url, headers={'User-Agent': user_agent}, timeout=timeout)
    return BeautifulSoup(page.content, 'html.parser')

def get_residence_info(residence_info):
    """Pull the building-level details out of a residence page header.

    Args:
        residence_info: parsed element that contains the property name,
            address container and (optionally) the review rating.

    Returns:
        tuple: (residence_name, residence_address, rating). The address is
        the text of the first four spans in the address container joined
        with spaces; rating is None when no 'reviewRating' span exists.
    """
    name_tag = residence_info.find(id='propertyName')
    residence_name = name_tag.text.strip()

    address_container = residence_info.find('div', class_='propertyAddressContainer')
    address_spans = address_container.find_all('span')[:4]
    residence_address = ' '.join(span.text for span in address_spans)

    # Not every residence has a rating, so this lookup may come back empty.
    rating_tag = residence_info.find('span', class_='reviewRating')
    rating = None if rating_tag is None else rating_tag.text.strip()

    return residence_name, residence_address, rating

def _text_or_none(tag):
    """Return the text of a parsed tag, or None when the tag was not found."""
    return None if tag is None else tag.text


def get_apartment_information(apartment_model):
    """Get the information for one apartment model listed in a residence.

    Any piece of information whose element is missing from the markup is
    returned as None instead of raising, so one incomplete listing does not
    abort the whole scrape.

    Args:
        apartment_model: parsed element for a single apartment model.

    Returns:
        tuple: (model_name, price, beds, baths, square_footage, details,
        availability), each a string or None.
    """
    model_name = _text_or_none(apartment_model.find('span', class_='modelName'))
    price = _text_or_none(apartment_model.find('span', class_='rentLabel'))

    details_wrapper = apartment_model.find('span', class_='detailsTextWrapper')
    if details_wrapper is None:
        room_details = []
    else:
        room_details = [room.text for room in details_wrapper.find_all('span')]
    # Some listings carry fewer than three detail spans; pad with None so
    # the positional unpacking below cannot raise IndexError.
    beds, baths, square_footage = (room_details + [None] * 3)[:3]

    details = _text_or_none(
        apartment_model.find('span', class_='detailsTextWrapper leaseDepositLabel')
    )
    availability = _text_or_none(apartment_model.find('div', class_='availability'))
    return model_name, price, beds, baths, square_footage, details, availability



In [None]:
# """Search for the info on each apartment for each residence that we find when searching through our locations."""
""" 
    Initialize variables and set booleans
    For each search_location in the set of search_locations:
        Select a random user agent id
        Go to the apartments.com results for the search 
        Pull out the set of residences returned by the search
        For each residence (apartment building) in the set of residences:
            Find the url for the dedicated residence page
            Check to see if it's something we've seen before
            If we've seen it, continue to the next, otherwise add it to our list and process it
            Go to the dedicated page
            Pull out the information on the residence itself
            Pull out the list of apartment options within each residence
            For each apartment in the apartments listed:
                Pull out the details (price, square footage, etc.)
                Append them to our list of apartment details

Collect everything
Write csv

Fin
"""
# Initialize and set variables
talk = True  # True prints progress updates
visited_links = set()  # residence URLs already scraped (searches can overlap)
all_info = []  # one row per apartment model found
counter = 0  # number of search locations processed so far
user_agent_list = get_firefox_useragent_list()

# state_initials may be a single string or a list of strings. Interpolating a
# list straight into the URL would produce "...-['nj']/", so normalise to a
# list and build one URL per state.
states = [state_initials] if isinstance(state_initials, str) else list(state_initials)

# Iterate through our search locations to get the data for each area.
for search_location in search_locations:

    if talk:
        print(f"Searching {search_location} for apartments")

    # Spaces are not valid in a URL path; join the words with hyphens
    # (e.g. "bound brook" -> "bound-brook").
    # NOTE(review): assumes apartments.com uses hyphenated slugs -- confirm.
    location_slug = '-'.join(search_location.split())

    # For each location, use a random user agent from our list.
    user_agent = random.choice(user_agent_list)

    # Collect the residences returned by the search page of every state.
    residences = []
    for state in states:
        search_url = f'https://www.apartments.com/{location_slug}-{state}/'
        try:
            soup = hit_page(search_url, user_agent=user_agent)
        except requests.exceptions.RequestException as err:
            # Keep what we have scraped so far instead of losing everything.
            print(f"Could not load {search_url}: {err}")
            continue

        search_results = soup.find(id='placards')
        if search_results is None:
            # No results container (blocked request, unknown location, ...).
            print(f"No search results found at {search_url}; skipping")
            continue
        residences.extend(search_results.find_all('li', class_='mortar-wrapper'))

    if talk:
        print("We found", len(residences), "different residences in", search_location)

    for residence in residences:

        # Find the link to the dedicated residence page; skip entries without one.
        property_link = residence.find(class_='property-link')
        if property_link is None or not property_link.get('href'):
            continue
        link_to_building = property_link['href']

        # Skip residences we've already seen; otherwise remember this one
        # and proceed to pull its information.
        if link_to_building in visited_links:
            continue
        visited_links.add(link_to_building)

        # Wait before hitting the residence page.
        time.sleep(5)
        try:
            residence_page = hit_page(link_to_building, user_agent=user_agent)
        except requests.exceptions.RequestException as err:
            print(f"Could not load {link_to_building}: {err}")
            continue

        # Get the name of the building, its address, and the apartments.com rating
        property_info = residence_page.find('div', class_='profilePropertyInfoWrapper')
        if property_info is None:
            print(f"No property information found at {link_to_building}; skipping")
            continue
        # NOTE(review): rating is collected but not written to the output
        # columns below -- confirm whether it should be added.
        residence_name, residence_address, rating = get_residence_info(property_info)

        # Make an array of the apartment options on the page
        apartment_options = residence_page.find_all(id='priceGridModelWrapper')

        # For each apartment in our list of options, get the important info
        for apartment in apartment_options:
            model_name, price, beds, baths, square_footage, details, availability = get_apartment_information(apartment)

            # Combine all of our information about each option into one array
            info = [
                residence_name,
                model_name,
                beds,
                baths,
                square_footage,
                price,
                details,
                availability,
                residence_address,
                link_to_building,
                search_location,
            ]
            if talk:
                print("We found the", model_name, "at the", residence_name)

            # Append our array of info about the apartment to our array of arrays
            all_info.append(info)

    counter += 1

# Convert our array of arrays into a pandas frame
df = pd.DataFrame(
    all_info,
    columns=[
        'residence_name',
        'model_name',
        'beds',
        'baths',
        'square_footage',
        'price',
        'details',
        'availability',
        'residence_address',
        'link',
        'search_location',
    ],
)

todays_date = datetime.today().strftime("%Y-%m-%d")

# Choose between saving it in data/scraped_data/ or just data/
# output_dir = 'data/scraped_data'
output_dir = 'data'
# Create the folder first so a long scrape is not lost to a missing directory.
os.makedirs(output_dir, exist_ok=True)
df.to_csv(os.path.join(output_dir, todays_date + '_apartments.csv'), index=False)

# Show us what we found
df
    