## SI507 Final Project : Data Checkpoint
#### Author: Deji Suolang

This Python script contains the code for data collection: retrieving data from the Census API and web scraping results from Zillow's website.

In [None]:
import pandas as pd
import numpy as np
import matplotlib as plot
import requests
import json
import os
import time
import sys
import regex as re
import lxml
import numbers
from bs4 import BeautifulSoup
import csv

### Data Source 1: Census API 

In [2]:
# Caching configuration: name of the on-disk JSON cache file, plus an
# (initially empty) module-level dictionary.
# NOTE(review): CACHE_DICT is not referenced by any code visible in this notebook.
CACHE_FILENAME = "cache.json"
CACHE_DICT = dict()

In [33]:
def open_cache():
    ''' opens the cache file if it exists and returns its decoded JSON content.

    if the cache file doesn't exist, cannot be read, or does not contain
    valid JSON, a new empty cache dictionary is returned instead.

    Parameters
    ----------
    None

    Returns
    -------
    The opened cache: dict
        The decoded content of CACHE_FILENAME (whatever JSON value was
        stored there), or an empty dict when the file is unusable.
    '''
    try:
        # The context manager closes the file even if decoding fails.
        with open(CACHE_FILENAME, 'r') as cache_file:
            return json.load(cache_file)
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: invalid JSON or
        # undecodable bytes (json.JSONDecodeError and UnicodeDecodeError
        # are both ValueError subclasses).
        return {}


# converts the dictionary to JSON and saves it
def save_cache(cache_dict):
    ''' saves the current state of the cache to disk

    The object is serialized before the file is opened, so a
    serialization error does not truncate an existing cache file.

    Parameters
    ----------
    cache_dict: dict
        The dictionary (or any JSON-serializable object) to save
    Returns
    -------
    None
    '''
    dumped_json_cache = json.dumps(cache_dict)
    # The context manager guarantees the handle is closed even if the write fails.
    with open(CACHE_FILENAME, "w") as cache_file:
        cache_file.write(dumped_json_cache)

In [34]:
def get_census_data(url):
    '''retrieves Census API data, preferring the local cache file.

    If the cache file (CACHE_FILENAME) can be read and parsed as JSON, its
    content is returned and no request is made. Otherwise the URL is
    fetched, the JSON response is written to the cache via save_cache,
    and the decoded data is returned.

    NOTE(review): the cache is not keyed by URL -- any readable cache file
    is returned regardless of which URL is requested.

    Parameters
    ----------
    url: str
        The Census API request URL

    Returns
    -------
    The decoded JSON data (from the cache file or from the API response)

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or returns an HTTP error status
    ValueError
        If the response body is not valid JSON
    '''
    try:
        with open(CACHE_FILENAME, 'r') as cache_file:
            return json.load(cache_file)
    except (OSError, ValueError):
        # Missing, unreadable, or corrupt cache: fall through to the network.
        pass

    response = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors so an error payload is never cached.
    response.raise_for_status()
    census_data = json.loads(response.text)
    save_cache(census_data)
    return census_data

def clean_census_data(census_data):
    '''builds a lookup dictionary from the raw Census rows.

    For every row, the lower-cased value at position 3 becomes a key and
    the value at position 1 becomes its value. After each row the key
    'county' is (re)set to the label 'median_income'.

    NOTE(review): rows sharing the same position-3 value overwrite each
    other, so only the last such row is kept -- confirm this is intended.

    Parameters
    ----------
    census_data: iterable of sequences
        Rows with at least four elements; element 3 must be a string

    Returns
    -------
    dict
        Mapping of lower-cased row[3] to row[1], plus the 'county' label
        entry when at least one row was processed
    '''
    income_lookup = {}

    for row in census_data:
        income_lookup.update({
            row[3].lower(): row[1],
            # listed second so it wins if the row's own key is 'county'
            'county': 'median_income',
        })

    return income_lookup

In [35]:
if __name__ == "__main__":
    # Calling Census functions: fetch (or load from cache) the raw rows,
    # reduce them to a key -> value dictionary, and write that as CSV lines.
    # NOTE(review): the API key is embedded in this URL; consider loading it
    # from configuration instead of keeping it in the source.
    census_url = "https://api.census.gov/data/2019/acs/acs5?get=NAME,B19013_001E&for=tract:*&in=state:26&key=dfdb2a1eda26b816e1c7d71114a341b40b1b40b4"
    census_data = get_census_data(census_url)
    census_data = clean_census_data(census_data)
    with open('census_data.csv', 'w') as f:
        # writelines consumes the generator directly; no throwaway list of
        # write() return values is built.
        f.writelines(
            '{0},{1}\n'.format(key, value) for key, value in census_data.items()
        )

### Data Source 2: Zillow Web Scraping

In [49]:
# Browser-like request headers passed to every request made by makesoup.
# NOTE(review): 'br' is advertised in accept-encoding; presumably the HTTP
# stack needs brotli support to decode such responses -- confirm.
headers = {
    "accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8"
    ),
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-US,en;q=0.8",
    "upgrade-insecure-requests": "1",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/61.0.3163.100 Safari/537.36"
    ),
}

# We are interested in the homes for sale in the state of Michigan
base_url = "https://www.zillow.com/homes/for_sale/"
state = "mi"

def makesoup(data):
    '''downloads one page and parses it with BeautifulSoup.

    Parameters
    ----------
    data: str
        The URL to request (sent with the module-level `headers`)

    Returns
    -------
    BeautifulSoup
        The response body parsed with the 'html.parser' backend
    '''
    with requests.Session() as session:
        response = session.get(data, headers=headers)
        return BeautifulSoup(response.content, 'html.parser')

def get_zillow_results(data):
    '''obtain useful variables from zillow scraping results

    Collects the elements matching the listing price, address and details
    selectors from one parsed page and returns them as a new DataFrame.

    A fresh DataFrame is built on every call. Writing into a shared
    module-level frame would tie every later page to the row count of the
    first page and fail as soon as a page yields a different number of
    results.

    Parameters
    ----------
    data: BeautifulSoup (or any object offering a compatible find_all)
        One parsed results page

    Returns
    -------
    pandas.DataFrame
        Columns 'prices', 'address' and 'bed_num', one row per matched
        element; cells hold the matched elements as returned by find_all

    Raises
    ------
    ValueError
        Raised by pandas if the three selectors match different numbers
        of elements on the same page
    '''
    price = list(data.find_all(class_='list-card-price'))
    address = list(data.find_all(class_='list-card-addr'))
    bed_num = list(data.find_all("ul", class_="list-card-details"))

    # create dataframe columns out of variables (column order preserved)
    page_df = pd.DataFrame()
    page_df['prices'] = price
    page_df['address'] = address
    page_df['bed_num'] = bed_num
    return page_df

def get_url_list(base_url, state, num_pages=25):
    '''builds the list of paginated search-result URLs for one state.

    The first page is `base_url + state + '/'`; page k (k >= 2) appends
    `'<k>_p/'` to that URL.

    Parameters
    ----------
    base_url: str
        Search URL prefix, expected to end with '/'
    state: str
        State path segment (e.g. 'mi')
    num_pages: int
        Total number of result pages to generate, default 25;
        values below 1 produce an empty list

    Returns
    -------
    list of str
        The page URLs in page order
    '''
    first_page = '{0}{1}/'.format(base_url, state)
    return [
        first_page if page == 1 else '{0}{1}_p/'.format(first_page, page)
        for page in range(1, num_pages + 1)
    ]

def create_soup_list(url_list):
    '''downloads and parses every URL, in order.

    Parameters
    ----------
    url_list: iterable of str
        The URLs to fetch; each one is passed to makesoup

    Returns
    -------
    list
        One makesoup result per URL, in the same order
    '''
    return [makesoup(page_url) for page_url in url_list]

def create_dataframe_list(soup_list):
    '''turns each parsed page into a DataFrame.

    Parameters
    ----------
    soup_list: iterable
        Parsed pages; each one is passed to get_zillow_results

    Returns
    -------
    list
        One get_zillow_results result per page, in the same order
    '''
    return [get_zillow_results(page) for page in soup_list]

In [51]:
# Build the list of result-page URLs, then download and parse each one.
# NOTE: create_soup_list requests every URL in turn, so this cell hits the network.
url_list = get_url_list(base_url, state)
soup_list = create_soup_list(url_list)

In [52]:
# Module-level frame; it must exist before the call below if
# get_zillow_results refers to the global name `df`.
df = pd.DataFrame()
# Placeholder; rebound when the per-page frames are combined.
zillow_df = pd.DataFrame()
# One DataFrame per parsed results page.
df_list = create_dataframe_list(soup_list)

In [59]:
# Stack the per-page frames into one dataframe and renumber the rows
# from 0 (the old per-page index is discarded).
zillow_df = pd.concat(df_list).reset_index(drop=True)
print(zillow_df.shape)
zillow_df.head()

(225, 3)


Unnamed: 0,prices,address,bed_num
0,"[$400,000]","[31008 Applewood Ln, Farmington, MI 48331]","[[4, [ , , bds]], [3, [ , , ba]], [2,944, [ ..."
1,"[$425,000]","[5372 Hauser Way, West Bloomfield, MI 48323]","[[4, [ , , bds]], [4, [ , , ba]], [3,213, [ ..."
2,"[$460,000]","[11526 Toledo Ave, Manitou Beach, MI 49253]","[[3, [ , , bds]], [2, [ , , ba]], [1,708, [ ..."
3,"[$579,900]","[6479 Island Lake Dr, East Lansing, MI 48823]","[[4, [ , , bds]], [4, [ , , ba]], [4,117, [ ..."
4,"[$75,000]","[26 Maple Ct, Muskegon, MI 49445]","[[3, [ , , bds]], [2, [ , , ba]], [1,512, [ ..."


In [60]:
# Render every cell as text, then remove anything that looks like an
# HTML tag (<...>) so only the visible text remains.
zillow_df = zillow_df.applymap(str)
zillow_df = zillow_df.applymap(lambda x: re.sub(r'<[^<]+?>', '', x))
# Everything before the first '-' stays in bed_num; the rest becomes home_type.
zillow_df[['bed_num', 'home_type']] = zillow_df.bed_num.str.split('-', n=1, expand=True)
# Split from the right, at most twice: the last two comma-separated parts are
# city and "state zip". Any extra commas stay in the street part instead of
# producing a fourth column that would break this three-column assignment.
zillow_df[['address', 'city', 'zip_code']] = zillow_df.address.str.rsplit(",", n=2, expand=True)
zillow_df.head()

In [61]:
# separate bed_num column into bed, bath, and sq_feet
zillow_df[['bed_num', 'baths', 'sqft']] = zillow_df.bed_num.str.split(' ',n=2, expand=True)
zillow_df['sqft'] = zillow_df.sqft.str.replace(",", "")
# extract only the digits from the columns
zillow_df['bed_num'] = zillow_df.bed_num.str.extract('(\d+)')
zillow_df['baths'] = zillow_df.baths.str.extract('(\d+)')
zillow_df['sqft'] = zillow_df.sqft.str.extract('(\d+)')
# convert columns to float
zillow_df['bed_num'] = zillow_df['bed_num'].astype('float')
zillow_df['baths'] = zillow_df['baths'].astype('float')
zillow_df['sqft'] = zillow_df['sqft'].astype('float')

In [62]:
# Split the "state zip" text on whitespace into two columns, then store the
# zip code as an integer.
# NOTE(review): astype('int') raises if any zipcode is missing -- confirm
# every listing carries one.
state_and_zip = zillow_df['zip_code'].str.split(expand=True)
zillow_df[['state', 'zipcode']] = state_and_zip
zillow_df['zipcode'] = zillow_df['zipcode'].astype('int')

In [63]:
# add crosswalk between zipcode and county
zip_county_crosswalk = pd.read_csv('zip_county_crosswalk.csv')
# merge two dataset in order to add county infiormation
zillow_df = zillow_df.merge(zip_county_crosswalk, how='inner', on='zipcode')

In [64]:
# Write the listings to disk; with pandas' default settings the DataFrame
# index is written as the first (unnamed) column.
zillow_df.to_csv('zillow_data.csv')