In [29]:
import pandas as pd

## Load the Raw Data

In [30]:
# Load the CSV file into a DataFrame
df = pd.read_csv('listings_raw_sale.csv')

# Show the first few rows of the DataFrame to understand its structure
df

Unnamed: 0,zpid,rawHomeStatusCd,marketingStatusSimplifiedCd,imgSrc,hasImage,detailUrl,statusType,statusText,price,priceLabel,...,brokerName,lotAreaString,streetViewURL,streetViewMetadataURL,communityName,builderName,hdpData.homeInfo.providerListingID,plid,isCdpResult,isComingSoonCommunity
0,26683152.0,ForSale,For Sale by Agent,https://photos.zillowstatic.com/fp/fd47d82cf21...,True,/homedetails/4044-Buena-Vista-St-APT-218-Dalla...,FOR_SALE,Condo for sale,"$325,000",$325K,...,,,,,,,,,,
1,26809367.0,ForSale,For Sale by Agent,https://photos.zillowstatic.com/fp/540e65ee165...,True,/homedetails/3223-Whitehall-Dr-Dallas-TX-75229...,FOR_SALE,House for sale,"$525,000",$525K,...,,,,,,,,,,
2,26903542.0,ForSale,For Sale by Agent,https://photos.zillowstatic.com/fp/5f815fecdae...,True,/homedetails/10777-Strait-Ln-Dallas-TX-75229/2...,FOR_SALE,House for sale,"$40,000,000",$40.0M,...,,,,,,,,,,
3,26813671.0,ForSale,For Sale by Agent,https://photos.zillowstatic.com/fp/190d19818f2...,True,/homedetails/5525-Glen-Forest-Ln-Dallas-TX-752...,FOR_SALE,House for sale,"$300,000",$300K,...,,,,,,,,,,
4,96527896.0,ForSale,For Sale by Agent,https://photos.zillowstatic.com/fp/47f73fa0d00...,True,/homedetails/4002-Agnes-St-Dallas-TX-75210/965...,FOR_SALE,House for sale,"$300,000",$300K,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,26730486.0,ForSale,For Sale by Agent,https://photos.zillowstatic.com/fp/68d68964e4e...,True,/homedetails/1614-Engle-Ave-Dallas-TX-75224/26...,FOR_SALE,House for sale,"$349,900",$350K,...,,,,,,,,,,
497,26788275.0,ForSale,For Sale by Agent,https://photos.zillowstatic.com/fp/36f53b487e7...,True,/homedetails/3519-Nogales-Dr-Dallas-TX-75220/2...,FOR_SALE,Multi-family home for sale,"$328,500",$329K,...,,,,,,,,,,
498,26702531.0,ForSale,For Sale by Agent,https://photos.zillowstatic.com/fp/039eaac5895...,True,/homedetails/6240-Marquita-Ave-Dallas-TX-75214...,FOR_SALE,House for sale,"$925,000",$925K,...,,,,,,,,,,
499,26825874.0,ForSale,For Sale by Agent,https://photos.zillowstatic.com/fp/9da46d2cf9b...,True,/homedetails/1627-Heather-Glen-Dr-Dallas-TX-75...,FOR_SALE,House for sale,"$300,000",$300K,...,,,,,,,,,,


## Functions

In [31]:
def convert_sqft_to_acres(df):
    # Conversion factor from square feet to acres
    sqft_to_acres = 1 / 43560

    # Using the .loc accessor to modify rows where the Lot Area Unit is 'sqft'
    # Convert and round the area to three decimal places
    df.loc[df['lotareaunit'] == 'sqft', 'lotareavalue'] = (df['lotareavalue'] * sqft_to_acres).round(3)
    df.loc[df['lotareaunit'] == 'sqft', 'lotareaunit'] = 'acres'

    df.loc[df['lotareaunit'] == 'acres', 'lotareavalue'] = (df['lotareavalue']).round(3)
   
    
    return df

## Columns to Keep in Data Frame

In [32]:
df.columns

Index(['zpid', 'rawHomeStatusCd', 'marketingStatusSimplifiedCd', 'imgSrc',
       'hasImage', 'detailUrl', 'statusType', 'statusText', 'price',
       'priceLabel', 'address', 'beds', 'baths', 'area', 'flexFieldText',
       'flexFieldType', 'isUserClaimingOwner', 'isUserConfirmedClaim', 'pgapt',
       'sgapt', 'shouldShowZestimateAsPrice', 'has3DModel', 'hasVideo',
       'isHomeRec', 'hasAdditionalAttributions', 'isFeaturedListing',
       'isShowcaseListing', 'listingType', 'isFavorite', 'visited',
       'timeOnZillow', 'latLong.latitude', 'latLong.longitude',
       'hdpData.homeInfo.zpid', 'hdpData.homeInfo.streetAddress',
       'hdpData.homeInfo.zipcode', 'hdpData.homeInfo.city',
       'hdpData.homeInfo.state', 'hdpData.homeInfo.latitude',
       'hdpData.homeInfo.longitude', 'hdpData.homeInfo.price',
       'hdpData.homeInfo.bathrooms', 'hdpData.homeInfo.bedrooms',
       'hdpData.homeInfo.livingArea', 'hdpData.homeInfo.homeType',
       'hdpData.homeInfo.homeStatus', 'hdpDa

In [33]:
# Pick the Columns I want to Keep
columns_to_keep = [

    'zpid', 
    'marketingStatusSimplifiedCd', 
    'statusText', 
    'address', 
    'beds', 
    'baths', 
    'area', 
    'flexFieldText',
    'flexFieldType', 
    'timeOnZillow', 
    'latLong.latitude', 
    'latLong.longitude',
    'hdpData.homeInfo.streetAddress',
    'hdpData.homeInfo.zipcode', 
    'hdpData.homeInfo.city',
    'hdpData.homeInfo.state', 
    'hdpData.homeInfo.price',
    'hdpData.homeInfo.livingArea', 
    'hdpData.homeInfo.homeType',
    'hdpData.homeInfo.taxAssessedValue',
    'hdpData.homeInfo.lotAreaValue', 
    'hdpData.homeInfo.lotAreaUnit',
    'hdpData.homeInfo.datePriceChanged', 
    'hdpData.homeInfo.zestimate',
    'hdpData.homeInfo.rentZestimate',
    'hdpData.homeInfo.priceChange', 
    'detailUrl', 
    'streetViewURL', 
    'streetViewMetadataURL',
     ]

df[columns_to_keep]
df = df[columns_to_keep]

## Rename Columns

In [34]:
column_rename = {

    'zpid' : 'zipid', 
    'marketingStatusSimplifiedCd' : 'sellertype', 
    'statusText' : 'marketstatus', 
    'address' : 'fulladdress', 
    'beds' : 'beds', 
    'baths' : 'baths', 
    'area' : 'sqft', 
    'flexFieldText' : 'update',
    'flexFieldType' : 'updatetype', 
    'timeOnZillow' : 'timeonzillow', 
    'latLong.latitude' : 'latitude', 
    'latLong.longitude' : 'longitude',
    'hdpData.homeInfo.streetAddress' : 'streetaddress',
    'hdpData.homeInfo.zipcode' : 'zipcode', 
    'hdpData.homeInfo.city' : 'city',
    'hdpData.homeInfo.state' : 'state', 
    'hdpData.homeInfo.price' : 'price',
    'hdpData.homeInfo.livingArea' : 'livingarea', 
    'hdpData.homeInfo.homeType' : 'hometype',
    'hdpData.homeInfo.taxAssessedValue' : 'taxassessedvalue',
    'hdpData.homeInfo.lotAreaValue' : 'lotareavalue', 
    'hdpData.homeInfo.lotAreaUnit' : 'lotareaunit',
    'hdpData.homeInfo.datePriceChanged' : 'datepricechange', 
    'hdpData.homeInfo.zestimate' : 'zestimate',
    'hdpData.homeInfo.rentZestimate' : 'rentzestimate',
    'hdpData.homeInfo.priceChange' : 'pricechange', 
    'detailUrl' : 'url', 
    'streetViewURL' : 'streetviewurl', 
    'streetViewMetadataURL' : 'streetviewmetadataurl',


}

# Rename columns
df = df.rename(columns=column_rename)

## Convert Sqft to Acres

In [35]:
df = convert_sqft_to_acres(df)

## Convert Datatypes

In [36]:
columns_to_convert = ['zipid', 'timeonzillow', 'zipcode', 'price', 'livingarea', 'taxassessedvalue', 'zestimate', 'rentzestimate', ]

df[columns_to_convert] = df[columns_to_convert].fillna(0).astype(int)


df[columns_to_convert] = df[columns_to_convert].astype(int)

In [37]:
df = df[df['marketstatus'] != 'For Rent']
df

Unnamed: 0,zipid,sellertype,marketstatus,fulladdress,beds,baths,sqft,update,updatetype,timeonzillow,...,taxassessedvalue,lotareavalue,lotareaunit,datepricechange,zestimate,rentzestimate,pricechange,url,streetviewurl,streetviewmetadataurl
0,26683152,For Sale by Agent,Condo for sale,"4044 Buena Vista St APT 218, Dallas, TX 75204",2.0,3.0,1085.0,Showcase,showcase,1208443000,...,254980,0.861,acres,,0,0,,/homedetails/4044-Buena-Vista-St-APT-218-Dalla...,,
1,26809367,For Sale by Agent,House for sale,"3223 Whitehall Dr, Dallas, TX 75229",4.0,3.0,1988.0,Showcase,showcase,370715000,...,475730,0.298,acres,1.713856e+12,535200,3300,-17000.0,/homedetails/3223-Whitehall-Dr-Dallas-TX-75229...,,
2,26903542,For Sale by Agent,House for sale,"10777 Strait Ln, Dallas, TX 75229",6.0,11.0,17679.0,13 days on Zillow,daysOnZillow,1172569000,...,10448460,3.304,acres,,36954700,69738,,/homedetails/10777-Strait-Ln-Dallas-TX-75229/2...,,
3,26813671,For Sale by Agent,House for sale,"5525 Glen Forest Ln, Dallas, TX 75241",4.0,4.0,2910.0,265 days on Zillow,daysOnZillow,-2147483648,...,490320,2.118,acres,1.717225e+12,283300,2829,-99000.0,/homedetails/5525-Glen-Forest-Ln-Dallas-TX-752...,,
4,96527896,For Sale by Agent,House for sale,"4002 Agnes St, Dallas, TX 75210",4.0,3.0,1653.0,Showcase,showcase,-2147483648,...,243480,0.130,acres,,0,2365,,/homedetails/4002-Agnes-St-Dallas-TX-75210/965...,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,26730486,For Sale by Agent,House for sale,"1614 Engle Ave, Dallas, TX 75224",3.0,2.0,1308.0,5 hours ago,timeOnInfo,18094000,...,180250,0.175,acres,,351500,2100,,/homedetails/1614-Engle-Ave-Dallas-TX-75224/26...,,
497,26788275,For Sale by Agent,Multi-family home for sale,"3519 Nogales Dr, Dallas, TX 75220",4.0,3.0,1858.0,98 days on Zillow,daysOnZillow,-2147483648,...,197400,0.103,acres,1.714028e+12,0,0,-500.0,/homedetails/3519-Nogales-Dr-Dallas-TX-75220/2...,,
498,26702531,For Sale by Agent,House for sale,"6240 Marquita Ave, Dallas, TX 75214",3.0,3.0,2601.0,23 days on Zillow,daysOnZillow,2068957000,...,737380,0.185,acres,1.717225e+12,893500,6035,-50000.0,/homedetails/6240-Marquita-Ave-Dallas-TX-75214...,,
499,26825874,For Sale by Agent,House for sale,"1627 Heather Glen Dr, Dallas, TX 75232",3.0,2.0,1632.0,30 days on Zillow,daysOnZillow,-2147483648,...,89770,0.183,acres,1.713424e+12,287900,1988,100.0,/homedetails/1627-Heather-Glen-Dr-Dallas-TX-75...,,


## Download Data Frame as a CSV

In [38]:
df.to_csv('recently_listed_houses.csv')