# Web Scraping Real-Estate Data from Zillow
## *A Data Engineering Exercise*
### Scope:
* New York, NY Search Query
* Apartments Only</br>


*This project is for research purposes only. None of the data is being scraped is being sold or utilized for commercial purposes in any way*

# Getting The Data

In [10]:
import requests
from bs4 import BeautifulSoup

header = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"}
response = requests.get("https://www.zillow.com/new-york-ny/apartments/", headers=header).text
soup = BeautifulSoup(response, 'html.parser')
apts = soup.find_all("div", {"class": "StyledPropertyCardDataWrapper-c11n-8-85-1__sc-1omp4c3-0 jVBMsP property-card-data"})

In [11]:
print(apts[0])

<div class="StyledPropertyCardDataWrapper-c11n-8-85-1__sc-1omp4c3-0 jVBMsP property-card-data"><a class="StyledPropertyCardDataArea-c11n-8-85-1__sc-yipmu-0 gdfTyO property-card-link" data-test="property-card-link" href="https://www.zillow.com/apartments/arverne-ny/the-tides-at-arverne-by-the-sea/BLWH7Z/" tabindex="0"><address data-test="property-card-addr">The Tides At Arverne By The Sea, 190 Beach 69th St, Arverne, NY 11692</address></a><div class="StyledPropertyCardDataArea-c11n-8-85-1__sc-yipmu-0 cWiizR"></div><div class="StyledPropertyCardDataArea-c11n-8-85-1__sc-yipmu-0 bqsBln"><span data-test="property-card-price">$4,200+/mo</span></div><div class="StyledPropertyCardDataArea-c11n-8-85-1__sc-yipmu-0 gxlfal"><ul class="StyledPropertyCardHomeDetailsList-c11n-8-85-1__sc-1xvdaej-0 dmDolk"><li><b>2</b> <abbr>bds</abbr></li><li><b>2</b> <abbr>ba</abbr></li><li><b>906</b> <abbr>sqft</abbr></li></ul> <!-- -->- Apartment for rent</div></div>


## Parse soup data to extract relevent info then append into a master list

In [12]:
list = []
obj = {}

for a in range(0, len(apts)):
    try:
        obj["pricing"] = apts[a].find("div", {"class": "StyledPropertyCardDataArea-c11n-8-85-1__sc-yipmu-0 bqsBln"}).text
    except: obj["pricing"] = None
    try:
        obj["address"] = apts[a].find("a", {"class": "StyledPropertyCardDataArea-c11n-8-85-1__sc-yipmu-0 gdfTyO property-card-link"}).text
    except:
        obj["address"] = None
    try:
        obj["space"] = apts[a].find("div", {"class": "StyledPropertyCardDataArea-c11n-8-85-1__sc-yipmu-0 gxlfal"}).text
    except:
        obj["space"] = None
    list.append(obj)
    obj = {}

In [13]:
print(len(list))
for i in list:
    print(i)

9
{'pricing': '$4,200+/mo', 'address': 'The Tides At Arverne By The Sea, 190 Beach 69th St, Arverne, NY 11692', 'space': '2 bds2 ba906 sqft - Apartment for rent'}
{'pricing': '$3,391/mo', 'address': 'RiverTrace at Port Imperial, 11 Avenue At Port Rte #718, West New York, NJ 07093', 'space': '1 bd1 ba765 sqft - Apartment for rent'}
{'pricing': '$4,109+ Studio', 'address': 'Oriana | 420 E 54th St, New York, NY', 'space': '$5,326+ 1 bd$6,636+ 2 bds$11,672+ 3 bds '}
{'pricing': '$3,575+ Studio', 'address': 'Waterside | 25 Waterside Plz, New York, NY', 'space': '$3,995+ 1 bd$5,525+ 2 bds$7,875+ 3 bds '}
{'pricing': '$3,245+ Studio', 'address': 'THE BELLSLIP | 1 Bell Slip, Brooklyn, NY', 'space': '$4,723+ 1 bd$6,600+ 2 bds '}
{'pricing': '$6,720+ 2 bds', 'address': '15 Cliff | 15 Cliff St, New York, NY', 'space': ' '}
{'pricing': '$3,815+ Studio', 'address': 'AVA High Line | 525 W 28th St, New York, NY', 'space': '$4,445+ 1 bd$7,155+ 2 bds '}
{'pricing': '$4,953+ Studio', 'address': 'The Che

## Putting it all together
Request and parse 50 pages of results, and then write the results to a local json file

In [2]:
import requests
from bs4 import BeautifulSoup
import json

finalList = []

header = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"}
for page in range(1,51):
    try:
        response = requests.get(f"https://www.zillow.com/new-york-ny/apartments/{page}_p", headers=header).text
        soup = BeautifulSoup(response, 'html.parser')
        apts = soup.find_all("div", {"class": "StyledPropertyCardDataWrapper-c11n-8-85-1__sc-1omp4c3-0 jVBMsP property-card-data"})

        list = []
        obj = {}

        for a in range(0, len(apts)):
            try:
                obj["pricing"] = apts[a].find("div", {"class": "StyledPropertyCardDataArea-c11n-8-85-1__sc-yipmu-0 bqsBln"}).text
            except: obj["pricing"] = None
            try:
                obj["address"] = apts[a].find("a", {"class": "StyledPropertyCardDataArea-c11n-8-85-1__sc-yipmu-0 gdfTyO property-card-link"}).text
            except:
                obj["address"] = None
            try:
                obj["space"] = apts[a].find("div", {"class": "StyledPropertyCardDataArea-c11n-8-85-1__sc-yipmu-0 gxlfal"}).text
            except:
                obj["space"] = None
            list.append(obj)
            obj = {}

        finalList.append(list)
    except:
        pass
    
with open("./output.json", "w+", encoding='utf-8') as file:
    json.dump(finalList, file, ensure_ascii=False, indent=4)

# Data Processing I
* Read from json
* Flatten list of lists into single list
* Create dataframe and save to csv

In [55]:
import pandas as pd
import json
with open("./output.json", "r") as file:
    data = json.load(file)
data_flat = [item for sublist in data for item in sublist]
df = pd.DataFrame(data_flat)
df.to_csv("./unprocessed.csv")
df.head()

Unnamed: 0,pricing,address,space
0,"$3,595+ Studio","Atlas New York | 66 W 38th St, New York, NY","$4,475+ 1 bd"
1,"$3,725+ Studio","Gotham West | 550 W 45th St, New York, NY","$4,495+ 1 bd$7,195+ 2 bds"
2,"$3,445+ Studio","The Ashland | 250 Ashland Pl, Brooklyn, NY","$4,488+ 1 bd$8,071+ 3 bds"
3,"$2,615+ 1 bd","One Archer | 92 160th St, Jamaica, NY","$3,460+ 2 bds"
4,"$3,435/mo","Gateway, 389 S End Ave #2P, New York, NY 10280",Studio 1 ba580 sqft - Apartment for rent


# Data Issues
 This data still requires more processing as it is very messy</br>
* *(Zillow likely does this on purpose to make the data more difficult to collect and process)*</br>
* Some entries include only one offering, while others may contain an offerring for studio, one-bed, two-bed, etc </br>
* Some entries are also missing sqft information
* There are also many different formats for address (Some separate complex name with "|" and some don't)

# Data Processing II

In [114]:
temp = {}
lines = []

for index, row in df.iterrows():

    if row['space'] == " ":
        temp['Layout'] = str(row['pricing']).split("+ ", 1)[1]
        temp['Rent'] = int(str(row['pricing']).split("+ ", 1)[0][1:].replace(",",""))
        temp['Address'] = row['address']

    if '/mo' in row['pricing']:
        temp['Rent'] = int(str(row['pricing']).split("/", 1)[0][1:].replace(",",""))
        temp['Layout'] = str(row['space']).split("ba", 1)[0][:-2]
        temp['Address'] = row['address']
    
    if '+' in row['pricing']:
        spaces = str(row['space']).split("$")
        n = len(spaces)
        for i in range(n):
            if i == 0:
                temp['Address'] = row['address']
                temp['Rent'] = int(str(row['pricing']).split("+", 1)[0][1:].replace(",",""))
                temp['Layout'] = str(row['pricing']).split("+ ", 1)[1].rstrip()
                lines.append(temp)
                temp = {}
            else:
                temp['Address'] = row['address']
                temp['Rent'] = int(spaces[i].split("+", 1)[0].replace(",",""))
                temp['Layout'] = spaces[i].split("+", 1)[1][1:].rstrip()
                lines.append(temp)
                temp = {}

    temp = {}

df2 = pd.DataFrame(lines)
df2.to_csv('./processed.csv')


In [120]:
df2.groupby('Layout', as_index=False)['Rent'].mean().round(0)

Unnamed: 0,Layout,Rent
0,1 bd,4035.0
1,1 bd,3509.0
2,2 bds,4632.0
3,2 bds,6224.0
4,3 bds,7600.0
5,3 bds,6855.0
6,4 bds,3650.0
7,4 bds,8900.0
8,5 bds,2900.0
9,Studio,3047.0


# References
1. https://medium.com/pipeline-a-data-engineering-resource/scrape-clean-and-store-zillow-apartment-data-etl-pipeline-907858b67e98
2. https://stackoverflow.com/questions/952914/how-do-i-make-a-flat-list-out-of-a-list-of-lists
