This File Cleans the pulled data from the Apify API

# Date of Interest

In [91]:
date = '2024-10-09'

In [92]:
filename = f'dataset_zillow-scraper_{date}.csv'

# Import Packages

In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import yaml
import datetime

# Load the configuration file
with open(r'C:/Users/Dev/Documents/Real Estate Data/config/config.yaml', 'r') as file:
    config = yaml.safe_load(file)

# Directories

In [94]:
# Imported Data Directory
raw_dir = config['data']['zillow_data']['raw']['listings']['zillow_search_scraper']

# Cleaned Data Directory
processed_dir = config['data']['zillow_data']['processed']['listings']['zillow_search_scraper']

# Files

In [95]:
# Imported Data 
raw_file = filename

# Cleaned Data
processed_file = f'{date}_zillow_cleaned.csv'

# Functions

In [96]:
def convert_sqft_to_acres(df):
    # Conversion factor from square feet to acres
    sqft_to_acres = 1 / 43560

    # Using the .loc accessor to modify rows where the Lot Area Unit is 'sqft'
    # Convert and round the area to three decimal places
    df.loc[df['lotAreaUnit'] == 'sqft', 'lotAreaValue'] = (df['lotAreaValue'] * sqft_to_acres).round(3)
    df.loc[df['lotAreaUnit'] == 'sqft', 'lotAreaUnit'] = 'acres'

    df.loc[df['lotAreaUnit'] == 'acres', 'lotAreaValue'] = (df['lotAreaValue']).round(3)
    return df



def convert_unix_timestamp(df):
    """
    Convert Unix timestamps in milliseconds in the 'datePriceChanged' column to human-readable dates.
    Handles NaN values gracefully.

    Parameters:
    df (pd.DataFrame): DataFrame containing the 'datePriceChanged' column

    Returns:
    pd.DataFrame: DataFrame with 'datePriceChanged' column converted to readable dates
    """
    def convert_timestamp(timestamp_ms):
        if pd.isna(timestamp_ms):
            return timestamp_ms
        # Convert milliseconds to seconds
        timestamp_s = timestamp_ms / 1000
        # Convert to a datetime object and format
        return datetime.datetime.fromtimestamp(timestamp_s).strftime('%Y-%m-%d')

    # Apply conversion to the 'datePriceChanged' column
    df['datePriceChanged'] = df['datePriceChanged'].apply(convert_timestamp)
    
    return df


def create_web_url(row):
    # Concatenate the address components
    address_string = f"{row['streetAddress']} {row['city']} {row['state']} {row['zipcode']}"
    # Replace spaces with hyphens
    web_url = address_string.replace(' ', '-')
    # Append zipid with suffix
    web_url_with_suffix = f"{web_url}/{row['zipID']}_zpid/"
    return web_url_with_suffix

### Load Data Frame

In [97]:
file_path = os.path.join(raw_dir, raw_file)
df = pd.read_csv(file_path, low_memory=False)

In [98]:
#df

# Clean Data Process

### Determine which columns to keep

In [99]:
columns_to_keep = [

    'addressCity',
    'addressState',
    'addressStreet',
    'addressZipcode',
    'area',
    'baths',
    'beds',
    'brokerName',
    'builderName',
    'detailUrl',
    'flexFieldText',
    'flexFieldType',	
    'hdpData/homeInfo/datePriceChanged',	
    'hdpData/homeInfo/daysOnZillow',	
    'hdpData/homeInfo/homeStatus',	
    'hdpData/homeInfo/homeStatusForHDP',
    'hdpData/homeInfo/homeType',	
    'hdpData/homeInfo/isNonOwnerOccupied',	
    'hdpData/homeInfo/isPreforeclosureAuction',	
    'hdpData/homeInfo/isPremierBuilder',	
    'hdpData/homeInfo/isUnmappable',	
    'hdpData/homeInfo/latitude',	
    'hdpData/homeInfo/listing_sub_type/is_FSBA',	
    'hdpData/homeInfo/listing_sub_type/is_newHome',	
    'hdpData/homeInfo/livingArea',	
    'hdpData/homeInfo/longitude',	
    'hdpData/homeInfo/lotAreaUnit',	
    'hdpData/homeInfo/lotAreaValue',	
    'hdpData/homeInfo/newConstructionType',	
    'hdpData/homeInfo/price',
    'hdpData/homeInfo/priceChange',	
    'hdpData/homeInfo/priceForHDP',	
    'hdpData/homeInfo/priceReduction',	
    'hdpData/homeInfo/rentZestimate',	
    'hdpData/homeInfo/shouldHighlight',	
    'hdpData/homeInfo/taxAssessedValue',	
    'hdpData/homeInfo/timeOnZillow',	
    'hdpData/homeInfo/zestimate',
    
    'isFeaturedListing',	
    'isHomeRec',	
    'isPropertyResultCDP',		
    'isShowcaseListing',	
    'isUndisclosedAddress',	
    'isUserClaimingOwner',	
    'isUserConfirmedClaim',	
    'isZillowOwned',		
    'list',	
    #'lotAreaString',
    'marketingStatusSimplifiedCd',	
    'pgapt',	
    'rawHomeStatusCd',	
    'relaxed',	
    'sgapt',	
    'shouldShowZestimateAsPrice',	
    'statusText',	
    'statusType',	
    'unformattedPrice',	
    'id',

    'carouselPhotos/0/url',
    'carouselPhotos/1/url',
    'carouselPhotos/2/url',
    'carouselPhotos/3/url',	
    'carouselPhotos/4/url',
    'carouselPhotos/5/url',
    'carouselPhotos/6/url',
    'carouselPhotos/7/url',
    'carouselPhotos/8/url',
    'carouselPhotos/9/url',
    'carouselPhotos/10/url',
    'carouselPhotos/11/url',
    'carouselPhotos/12/url',
    'carouselPhotos/13/url',
    'carouselPhotos/14/url',
    'carouselPhotos/15/url',
    'carouselPhotos/16/url',
    'carouselPhotos/17/url',
    'carouselPhotos/18/url',
    'carouselPhotos/19/url',

]

df[columns_to_keep]
df = df[columns_to_keep]

### Rename Columns

In [100]:
column_rename = {

    'id': 'zipID',
    'addressCity': 'city',
    'addressState': 'state',
    'addressStreet': 'streetAddress',
    'addressZipcode': 'zipcode',
    'baths': 'baths',
    'beds': 'beds',
    'brokerName': 'brokerName',
    'builderName': 'builderName',
    'detailUrl': 'url',
    'flexFieldText': 'flexFieldText',
    'flexFieldType': 'flexFieldType',	
    'hdpData/homeInfo/datePriceChanged': 'datePriceChanged',	
    'hdpData/homeInfo/daysOnZillow': 'daysOnZillow',	
    'hdpData/homeInfo/homeStatus': 'homeStatus',	
    'hdpData/homeInfo/homeStatusForHDP': 'homeStatusForHDP',
    'hdpData/homeInfo/homeType': 'homeType',	
    'hdpData/homeInfo/isNonOwnerOccupied': 'isNonOwnerOccupied',	
    'hdpData/homeInfo/isPreforeclosureAuction': 'isPreforeclosureAuction',	
    'hdpData/homeInfo/isPremierBuilder': 'isPremierBuilder',	
    'hdpData/homeInfo/isUnmappable': 'isUnmappable',	
    'hdpData/homeInfo/latitude': 'latitude',	
    'hdpData/homeInfo/listing_sub_type/is_FSBA': 'is_FSBA',	
    'hdpData/homeInfo/listing_sub_type/is_newHome': 'is_newHome',	
    'hdpData/homeInfo/livingArea': 'sqft',	
    'hdpData/homeInfo/longitude': 'longitude',	
    'hdpData/homeInfo/lotAreaUnit': 'lotAreaUnit',	
    'hdpData/homeInfo/lotAreaValue': 'lotAreaValue',	
    'hdpData/homeInfo/newConstructionType': 'newConstructionType',	
    'hdpData/homeInfo/price': 'price',
    'hdpData/homeInfo/priceChange': 'priceChange',	
    'hdpData/homeInfo/priceForHDP': 'priceForHDP',	
    'hdpData/homeInfo/priceReduction': 'priceReduction',	
    'hdpData/homeInfo/rentZestimate': 'rentZestimate',	
    'hdpData/homeInfo/shouldHighlight': 'shouldHighlight',	
    'hdpData/homeInfo/taxAssessedValue': 'taxAssessedValue',	
    'hdpData/homeInfo/timeOnZillow': 'timeOnZillow',	
    'hdpData/homeInfo/zestimate': 'zestimate',
    'marketingStatusSimplifiedCd': 'listingType',
    'carouselPhotos/0/url': 'photo1',
    'carouselPhotos/1/url': 'photo2',
    'carouselPhotos/2/url': 'photo3',
    'carouselPhotos/3/url': 'photo4',	
    'carouselPhotos/4/url': 'photo5',
    'carouselPhotos/5/url': 'photo6',
    'carouselPhotos/6/url': 'photo7',
    'carouselPhotos/7/url': 'photo8',
    'carouselPhotos/8/url': 'photo9',
    'carouselPhotos/9/url': 'photo10',
    'carouselPhotos/10/url': 'photo11',
    'carouselPhotos/11/url': 'photo12',
    'carouselPhotos/12/url': 'photo13',
    'carouselPhotos/13/url': 'photo14',
    'carouselPhotos/14/url': 'photo15',
    'carouselPhotos/15/url': 'photo16',
    'carouselPhotos/16/url': 'photo17',
    'carouselPhotos/17/url': 'photo18',
    'carouselPhotos/18/url': 'photo19',
    'carouselPhotos/19/url': 'photo20',

}

# Rename columns
df = df.rename(columns=column_rename)

### Convert Values

In [101]:
# Converts Lot Sqft to Acres
df = convert_sqft_to_acres(df)

# Converts 'datePriceChanged' column
df = convert_unix_timestamp(df)

### Reoder Columns

In [102]:
columns_to_keep = [

        # Basic Property Information
        'zipID',
        'city',
        'state',
        'streetAddress',
        'zipcode',
        'latitude',
        'longitude',
        
        # Property Details
        'beds',
        'baths',
        'homeType',
        'sqft',
        'lotAreaValue',
        
        # Pricing Information
        'price',
        'priceChange',
        'datePriceChanged',
        'zestimate',
        'rentZestimate',
        'taxAssessedValue',
        
        # Listing Details
        'listingType',
        'daysOnZillow',
        'statusText',
        'statusType',
        'isFeaturedListing',
        'isShowcaseListing',
        
        # Construction and Broker Information
        'brokerName',
        'builderName',
        'newConstructionType',
        'isPremierBuilder',
        'is_newHome',
        
        # Miscellaneous
        'flexFieldText',
        'flexFieldType',
      
        
        # Deep Links
        'url',
        #'photo1',
        #'photo2',
        #'photo3',
        #'photo4',	
        #'photo5',
        #'photo6',
        #'photo7',
        #'photo8',
        #'photo9',
        #'photo10',
        #'photo11',
        #'photo12',
        #'photo13',
        #'photo14',
        #'photo15',
        #'photo16',
        #'photo17',
        #'photo18',
        #'photo19',
        #'photo20',

]

df[columns_to_keep]
df = df[columns_to_keep]

df.rename(columns={'lotAreaValue': 'acreage'}, inplace=True)

### Call the function to create the web URL

In [103]:
df['webURL'] = df.apply(create_web_url, axis=1)

In [104]:
df

Unnamed: 0,zipID,city,state,streetAddress,zipcode,latitude,longitude,beds,baths,homeType,...,isShowcaseListing,brokerName,builderName,newConstructionType,isPremierBuilder,is_newHome,flexFieldText,flexFieldType,url,webURL
0,351078673,Huffman,TX,"1647 Plan, Woodland Lakes",77336.0,30.030027,-95.112015,3.0,3.0,SINGLE_FAMILY,...,False,,Cyrene Homes,BUILDER_PLAN,True,True,Lake breezes,homeInsight,https://www.zillow.com/community/woodland-lake...,"1647-Plan,-Woodland-Lakes-Huffman-TX-77336.0/3..."
1,28020741,Houston,TX,10407 Royal Oaks Dr,77016.0,29.862747,-95.302640,3.0,1.0,SINGLE_FAMILY,...,True,Corcoran Prestige Realty,,,False,,Showcase,showcase,https://www.zillow.com/homedetails/10407-Royal...,10407-Royal-Oaks-Dr-Houston-TX-77016.0/2802074...
2,28270108,Houston,TX,8218 Windy Creek Dr,77040.0,29.890194,-95.544760,3.0,2.0,SINGLE_FAMILY,...,False,NB Elite Realty,,,False,,High ceilings,homeInsight,https://www.zillow.com/homedetails/8218-Windy-...,8218-Windy-Creek-Dr-Houston-TX-77040.0/2827010...
3,28276236,Houston,TX,13102 Creekside Park Dr,77082.0,29.724209,-95.619800,3.0,3.0,SINGLE_FAMILY,...,False,"Compass RE Texas, LLC - Houston",,,False,,"Price cut: $15,000 (Oct 02)",priceCut,https://www.zillow.com/homedetails/13102-Creek...,13102-Creekside-Park-Dr-Houston-TX-77082.0/282...
4,84020947,Houston,TX,15415 Liberty Prairie Ct,77049.0,29.835371,-95.164170,5.0,4.0,SINGLE_FAMILY,...,False,Torres Real Estate Group,,,False,,Beautifully remodeled bathroom,homeInsight,https://www.zillow.com/homedetails/15415-Liber...,15415-Liberty-Prairie-Ct-Houston-TX-77049.0/84...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9808,2079939074,Houston,TX,5003 Higgins St #A,77033.0,29.654604,-95.348470,3.0,3.0,SINGLE_FAMILY,...,False,Roam Brokerage 752023613,,,False,,"Price cut: $8,000 (Sep 30)",priceCut,https://www.zillow.com/homedetails/5003-Higgin...,5003-Higgins-St-#A-Houston-TX-77033.0/20799390...
9809,27982729,Houston,TX,3014 Elpyco St,77051.0,29.645168,-95.379196,3.0,2.0,SINGLE_FAMILY,...,False,REALM Real Estate Professionals - North Houston,,,False,,Original hardwood flooring,homeInsight,https://www.zillow.com/homedetails/3014-Elpyco...,3014-Elpyco-St-Houston-TX-77051.0/27982729_zpid/
9810,2055940257,Houston,TX,5617 Madden Ln,77048.0,29.633030,-95.335106,3.0,2.0,SINGLE_FAMILY,...,False,Meritage Homes Realty,Meritage Homes,BUILDER_SPEC,True,True,Dark gray tweed carpet,homeInsight,https://www.zillow.com/homedetails/5617-Madden...,5617-Madden-Ln-Houston-TX-77048.0/2055940257_z...
9811,2060332592,Houston,TX,2915 Newington Ln,77047.0,29.620047,-95.376785,3.0,2.0,SINGLE_FAMILY,...,False,Collective Realty Co.,,,False,,Modern elegance,homeInsight,https://www.zillow.com/homedetails/2915-Newing...,2915-Newington-Ln-Houston-TX-77047.0/206033259...


# Remove Unintential Duplicates

In [105]:
# Replace "House for Sale" with "Home for Sale" in the 'statusText' column
df['statusText'] = df['statusText'].replace('House for Sale', 'Home for Sale')

In [106]:
df

Unnamed: 0,zipID,city,state,streetAddress,zipcode,latitude,longitude,beds,baths,homeType,...,isShowcaseListing,brokerName,builderName,newConstructionType,isPremierBuilder,is_newHome,flexFieldText,flexFieldType,url,webURL
0,351078673,Huffman,TX,"1647 Plan, Woodland Lakes",77336.0,30.030027,-95.112015,3.0,3.0,SINGLE_FAMILY,...,False,,Cyrene Homes,BUILDER_PLAN,True,True,Lake breezes,homeInsight,https://www.zillow.com/community/woodland-lake...,"1647-Plan,-Woodland-Lakes-Huffman-TX-77336.0/3..."
1,28020741,Houston,TX,10407 Royal Oaks Dr,77016.0,29.862747,-95.302640,3.0,1.0,SINGLE_FAMILY,...,True,Corcoran Prestige Realty,,,False,,Showcase,showcase,https://www.zillow.com/homedetails/10407-Royal...,10407-Royal-Oaks-Dr-Houston-TX-77016.0/2802074...
2,28270108,Houston,TX,8218 Windy Creek Dr,77040.0,29.890194,-95.544760,3.0,2.0,SINGLE_FAMILY,...,False,NB Elite Realty,,,False,,High ceilings,homeInsight,https://www.zillow.com/homedetails/8218-Windy-...,8218-Windy-Creek-Dr-Houston-TX-77040.0/2827010...
3,28276236,Houston,TX,13102 Creekside Park Dr,77082.0,29.724209,-95.619800,3.0,3.0,SINGLE_FAMILY,...,False,"Compass RE Texas, LLC - Houston",,,False,,"Price cut: $15,000 (Oct 02)",priceCut,https://www.zillow.com/homedetails/13102-Creek...,13102-Creekside-Park-Dr-Houston-TX-77082.0/282...
4,84020947,Houston,TX,15415 Liberty Prairie Ct,77049.0,29.835371,-95.164170,5.0,4.0,SINGLE_FAMILY,...,False,Torres Real Estate Group,,,False,,Beautifully remodeled bathroom,homeInsight,https://www.zillow.com/homedetails/15415-Liber...,15415-Liberty-Prairie-Ct-Houston-TX-77049.0/84...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9808,2079939074,Houston,TX,5003 Higgins St #A,77033.0,29.654604,-95.348470,3.0,3.0,SINGLE_FAMILY,...,False,Roam Brokerage 752023613,,,False,,"Price cut: $8,000 (Sep 30)",priceCut,https://www.zillow.com/homedetails/5003-Higgin...,5003-Higgins-St-#A-Houston-TX-77033.0/20799390...
9809,27982729,Houston,TX,3014 Elpyco St,77051.0,29.645168,-95.379196,3.0,2.0,SINGLE_FAMILY,...,False,REALM Real Estate Professionals - North Houston,,,False,,Original hardwood flooring,homeInsight,https://www.zillow.com/homedetails/3014-Elpyco...,3014-Elpyco-St-Houston-TX-77051.0/27982729_zpid/
9810,2055940257,Houston,TX,5617 Madden Ln,77048.0,29.633030,-95.335106,3.0,2.0,SINGLE_FAMILY,...,False,Meritage Homes Realty,Meritage Homes,BUILDER_SPEC,True,True,Dark gray tweed carpet,homeInsight,https://www.zillow.com/homedetails/5617-Madden...,5617-Madden-Ln-Houston-TX-77048.0/2055940257_z...
9811,2060332592,Houston,TX,2915 Newington Ln,77047.0,29.620047,-95.376785,3.0,2.0,SINGLE_FAMILY,...,False,Collective Realty Co.,,,False,,Modern elegance,homeInsight,https://www.zillow.com/homedetails/2915-Newing...,2915-Newington-Ln-Houston-TX-77047.0/206033259...


# Save File

In [107]:
# Define the file path
csv_file_path = os.path.join(processed_dir, f'{date}_zillow_house_listings.csv')
df.to_csv(csv_file_path, index=False)