In [53]:
import requests
import json
import time
import datetime
import pandas as pd
import numpy as np
import random
from lxml.html import fromstring
from itertools import cycle
import traceback

In [2]:
# Segment boundaries: the territory is split into equal horizontal slices,
# stored as a JSON object that is loaded here into `bounds`.
with open('../boundaries/new_york.json') as boundaries_file:
    bounds = json.load(boundaries_file)

In [29]:
# make link from template, bounds and request_id 
def make_top_url(bounds, request_id):
    """Assemble the search-page-state URL for one map segment.

    Parameters
    ----------
    bounds : mapping
        Must provide 'west', 'east', 'south' and 'north'; the values are
        substituted into the "mapBounds" part of the query.
    request_id : int or str
        Anything accepted by int(). Only what is left after removing whole
        tens (the last decimal digit for non-negative ids) is appended as the
        requestId parameter.

    Returns
    -------
    str
        The complete URL.
    """
    # Format the bounds first so a missing key is reported before a bad id.
    map_bounds = '"west":{west},"east":{east},"south":{south},"north":{north}'.format(**bounds)

    number = int(request_id)
    # Truncating division: keeps the sign of the id, unlike the % operator.
    last_digit = number - 10 * int(number / 10)

    url_pieces = [
        'https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState={"pagination":{},"mapBounds":{',
        map_bounds,
        '},"isMapVisible":true,"filterState":{"isForSaleByAgent":{"value":false},"isForSaleByOwner":{"value":false},"isNewConstruction":{"value":false},"isForSaleForeclosure":{"value":false},"isComingSoon":{"value":false},"isAuction":{"value":false},"isPreMarketForeclosure":{"value":false},"isPreMarketPreForeclosure":{"value":false},"isForRent":{"value":true},"isAllHomes":{"value":true},"enableSchools":{"value":true}},"isListVisible":true,"mapZoom":9}&wants={"cat1":["listResults","mapResults"],"schoolResults":["schoolResults"]}&requestId=',
        str(last_digit),
    ]
    return ''.join(url_pieces)

In [54]:
# Request headers.
# Pool of desktop browser User-Agent strings (Safari, Firefox and Chrome on
# macOS / Windows); the scraping loop picks one at random for every segment.
user_agent_list = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
]


# Static headers passed to every requests.get call. 'User-Agent' is not set
# here: the scraping loop writes it into this dict before each request.
headers = {
    'Host': 'www.zillow.com',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0'
}

In [64]:
def transform_data(raw_details):
    """Convert one school record (a dict) into a single-row DataFrame.

    Every expected field that is absent from ``raw_details`` becomes NaN, so
    all returned frames share the same 16 columns in the same order.

    Parameters
    ----------
    raw_details : dict
        One element of the ``schools`` list in the search response.

    Returns
    -------
    pandas.DataFrame
        One row with index ``[0]``.

    Notes
    -----
    * Latitude/longitude are read from the nested ``location`` dict. A
      missing or non-dict ``location``, or one lacking ``lat``/``lon``,
      yields NaN instead of raising.
    * ``attendance_zones`` is flattened to a comma-separated string; a
      missing or ``None`` value yields NaN.
    * The longitude column is named ``'lan'`` (sic); the name is kept so the
      exported CSV schema does not change.
    """
    location = raw_details.get('location')
    if not isinstance(location, dict):
        # Absent or null location: fall back to NaN for both coordinates.
        location = {}

    zones = raw_details.get('attendance_zones')
    if zones is None:
        zones_text = np.nan
    else:
        zones_text = ','.join(str(zone) for zone in zones)

    # Insertion order defines the column order of the resulting frame.
    record = {
        'school_id': raw_details.get('school_id', np.nan),
        'name': raw_details.get('name', np.nan),
        'lat': location.get('lat', np.nan),
        'lan': location.get('lon', np.nan),
        'attendance_zones': zones_text,
    }
    passthrough_fields = (
        'gs_rating', 'enrollment', 'students_per_teacher', 'link', 'grades',
        'is_elementary', 'is_middle', 'is_high',
        'is_public', 'is_private', 'is_charter',
    )
    for field in passthrough_fields:
        record[field] = raw_details.get(field, np.nan)

    return pd.DataFrame(record, index=[0])

In [65]:
# Scrape the school results of every segment, log progress, and export the
# combined table to CSV.
RETRY_DELAY_SEC = 60      # pause before the single retry of a non-200 response
REQUEST_DELAY_SEC = 5     # pause between consecutive segments
REQUEST_TIMEOUT_SEC = 30  # give up on a stalled connection instead of hanging

school_frames = []    # one single-row frame per school; concatenated once below
broken_segments = []  # segments that could not be fetched or parsed

# The context manager guarantees the log is flushed and closed even if the
# loop is interrupted.
with open('output/logs.txt', 'w') as logs:
    logs.write('Scrapping schools {}\n'.format(datetime.date.today().strftime('%Y-%m-%d')))

    for segment in bounds.keys():

        headers['User-Agent'] = random.choice(user_agent_list)
        url_top = make_top_url(bounds[segment], segment)

        try:
            response_top = requests.get(url_top, headers=headers, timeout=REQUEST_TIMEOUT_SEC)

            if response_top.status_code != 200:
                # One retry after a longer pause.
                time.sleep(RETRY_DELAY_SEC)
                response_top = requests.get(url_top, headers=headers, timeout=REQUEST_TIMEOUT_SEC)

            if response_top.status_code != 200:
                broken_segments.append(segment)

            else:
                # Parse the body once instead of on every access.
                school_results = response_top.json()['schoolResults']
                schools = school_results.get('schools') or []

                # 'total' may be absent; the received count never depends on it.
                total_results = school_results.get('total', 'unknown')
                received_results = len(schools)

                log_message = 'Segment: {}, Total objects: {}, Received objects: {}'.format(
                    segment, total_results, received_results)
                logs.write(log_message + '\n')
                print(log_message)

                # Convert the whole segment before storing anything, so a
                # segment reported as broken never contributes partial rows.
                segment_frames = [transform_data(school) for school in schools]
                school_frames.extend(segment_frames)

        except Exception:
            # Network failure, invalid JSON or unexpected payload: record the
            # segment and the traceback instead of dropping it silently.
            # KeyboardInterrupt is deliberately not caught.
            broken_segments.append(segment)
            logs.write('Segment: {}, failed with error:\n{}\n'.format(segment, traceback.format_exc()))

        # Always pause between segments, including after a failure.
        time.sleep(REQUEST_DELAY_SEC)

    if len(broken_segments) > 0:
        log_message = 'Broken Segments: {}\n\n'.format(', '.join(str(s) for s in broken_segments))
        logs.write(log_message)


# Single concatenation: DataFrame.append was removed in pandas 2.0 and is
# quadratic when called in a loop. pd.concat rejects an empty list, hence the
# fallback to an empty frame.
df_all = pd.concat(school_frames, ignore_index=True) if school_frames else pd.DataFrame()

df_all.to_csv('output/schools.csv', index=False)

Segment: 1, Total objects: 10, Received objects: 10
Segment: 2, Total objects: 3, Received objects: 3
Segment: 3, Total objects: 2, Received objects: 2
Segment: 4, Total objects: 5, Received objects: 5
Segment: 5, Total objects: 3, Received objects: 3
Segment: 6, Total objects: 3, Received objects: 3
Segment: 7, Total objects: 5, Received objects: 5
Segment: 8, Total objects: 5, Received objects: 5
Segment: 9, Total objects: 4, Received objects: 4
Segment: 10, Total objects: 2, Received objects: 2
Segment: 11, Total objects: 3, Received objects: 3
Segment: 12, Total objects: 3, Received objects: 3
Segment: 13, Total objects: 3, Received objects: 3
Segment: 14, Total objects: 6, Received objects: 6
Segment: 15, Total objects: 8, Received objects: 8
Segment: 16, Total objects: 9, Received objects: 9
Segment: 17, Total objects: 3, Received objects: 3
Segment: 18, Total objects: 3, Received objects: 3
Segment: 19, Total objects: 4, Received objects: 4
Segment: 20, Total objects: 4, Receive