In [1]:
import pandas as pd

In [2]:
# Load the CSV file into a DataFrame
df = pd.read_csv('listings_raw_sold.csv')

## Functions

In [3]:
def convert_sqft_to_acres(df):
    # Conversion factor from square feet to acres
    sqft_to_acres = 1 / 43560

    # Using the .loc accessor to modify rows where the Lot Area Unit is 'sqft'
    # Convert and round the area to three decimal places
    df.loc[df['lotareaunit'] == 'sqft', 'lotareavalue'] = (df['lotareavalue'] * sqft_to_acres).round(3)
    df.loc[df['lotareaunit'] == 'sqft', 'lotareaunit'] = 'acres'

    df.loc[df['lotareaunit'] == 'acres', 'lotareavalue'] = (df['lotareavalue']).round(3)
    
    return df

## Initial Data Frame

In [4]:
df

Unnamed: 0,zpid,rawHomeStatusCd,marketingStatusSimplifiedCd,imgSrc,hasImage,detailUrl,statusType,statusText,price,priceLabel,...,hdpData.homeInfo.priceChange,streetViewURL,streetViewMetadataURL,hdpData.homeInfo.listing_sub_type.is_FSBO,hdpData.homeInfo.priceForHDP,hdpData.homeInfo.videoCount,availabilityDate,hdpData.homeInfo.isRentalWithBasePrice,hdpData.homeInfo.listing_sub_type.is_FSBA,plid
0,26684051.0,RecentlySold,RecentChange,https://photos.zillowstatic.com/fp/31c63c72d40...,True,/homedetails/5538-Junius-St-Dallas-TX-75214/26...,SOLD,Sold,,$--,...,,,,,,,,,,
1,26807390.0,RecentlySold,RecentChange,https://photos.zillowstatic.com/fp/b84e3e47076...,True,/homedetails/3619-Norcross-Ln-Dallas-TX-75229/...,SOLD,Sold,,$--,...,,,,,,,,,,
2,26867404.0,RecentlySold,RecentChange,https://photos.zillowstatic.com/fp/a215304d0f6...,True,/homedetails/9545-Covemeadow-Dr-Dallas-TX-7523...,SOLD,Sold,,$--,...,,,,,,,,,,
3,26873685.0,RecentlySold,RecentChange,https://photos.zillowstatic.com/fp/810a0add243...,True,/homedetails/6879-Anglebluff-Cir-Dallas-TX-752...,SOLD,Sold,,$--,...,,,,,,,,,,
4,303983624.0,RecentlySold,RecentChange,https://photos.zillowstatic.com/fp/2c74fc22dc3...,True,/homedetails/639-Aspen-Valley-Ln-Dallas-TX-752...,SOLD,Sold,,$--,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,26760452.0,RecentlySold,RecentChange,https://photos.zillowstatic.com/fp/a0c077602d5...,True,/homedetails/5761-Greenbrier-Dr-Dallas-TX-7520...,SOLD,Sold,,$--,...,,,,,,,,,,
496,26806884.0,RecentlySold,RecentChange,https://photos.zillowstatic.com/fp/8f60edbed24...,True,/homedetails/10315-Gooding-Dr-Dallas-TX-75229/...,SOLD,Sold,,$--,...,,,,,,,,,,
497,26817284.0,RecentlySold,RecentChange,https://photos.zillowstatic.com/fp/9552ab333f8...,True,/homedetails/9625-Silver-Meadow-Dr-Dallas-TX-7...,SOLD,Sold,,$--,...,,,,,,,,,,
498,26822652.0,RecentlySold,RecentChange,https://photos.zillowstatic.com/fp/02d02bd99fe...,True,/homedetails/6525-Leaning-Oaks-St-Dallas-TX-75...,SOLD,Sold,,$--,...,,,,,,,,,,


## Columns to Keep in Data Frame

In [5]:
df.columns

Index(['zpid', 'rawHomeStatusCd', 'marketingStatusSimplifiedCd', 'imgSrc',
       'hasImage', 'detailUrl', 'statusType', 'statusText', 'price',
       'priceLabel', 'address', 'beds', 'baths', 'area', 'isUserClaimingOwner',
       'isUserConfirmedClaim', 'pgapt', 'sgapt', 'shouldShowZestimateAsPrice',
       'has3DModel', 'hasVideo', 'isHomeRec', 'hasAdditionalAttributions',
       'isFeaturedListing', 'isShowcaseListing', 'listingType', 'isFavorite',
       'visited', 'timeOnZillow', 'latLong.latitude', 'latLong.longitude',
       'variableData.type', 'variableData.text', 'hdpData.homeInfo.zpid',
       'hdpData.homeInfo.streetAddress', 'hdpData.homeInfo.zipcode',
       'hdpData.homeInfo.city', 'hdpData.homeInfo.state',
       'hdpData.homeInfo.latitude', 'hdpData.homeInfo.longitude',
       'hdpData.homeInfo.dateSold', 'hdpData.homeInfo.bathrooms',
       'hdpData.homeInfo.bedrooms', 'hdpData.homeInfo.livingArea',
       'hdpData.homeInfo.homeType', 'hdpData.homeInfo.homeStatus',
  

In [6]:
# Pick Columns I Want to Keep
columns_to_keep = [

    'zpid', 
    'statusText', 
    'address', 
    'beds', 
    'baths', 
    'area', 
    'timeOnZillow', 
    'latLong.latitude', 
    'latLong.longitude',
    'hdpData.homeInfo.streetAddress', 
    'hdpData.homeInfo.zipcode',
    'hdpData.homeInfo.city', 
    'hdpData.homeInfo.state',
    'hdpData.homeInfo.dateSold', 
    'hdpData.homeInfo.livingArea',
    'hdpData.homeInfo.homeType', 
    'hdpData.homeInfo.zestimate',
    'hdpData.homeInfo.rentZestimate', 
    'hdpData.homeInfo.taxAssessedValue', 
    'hdpData.homeInfo.lotAreaValue',
    'hdpData.homeInfo.lotAreaUnit',
    'detailUrl',

     ]

df[columns_to_keep]
df = df[columns_to_keep]

## Rename Columns

In [7]:
column_rename = {

    'zpid' : 'zipid', 
    'statusText' : 'status', 
    'address' : 'fulladdress', 
    'beds' : 'beds', 
    'baths' : 'baths', 
    'area' : 'sqft', 
    'timeOnZillow' : 'timeonzillow', 
    'latLong.latitude' : 'latitude', 
    'latLong.longitude' : 'longitude',
    'hdpData.homeInfo.streetAddress' : 'streetaddress', 
    'hdpData.homeInfo.zipcode' : 'zipcode',
    'hdpData.homeInfo.city' : 'city', 
    'hdpData.homeInfo.state' : 'state',
    'hdpData.homeInfo.dateSold'  : 'datesold', 
    'hdpData.homeInfo.livingArea' : 'livingarea',
    'hdpData.homeInfo.homeType' : 'HomeType', 
    'hdpData.homeInfo.zestimate' : 'zestimate',
    'hdpData.homeInfo.rentZestimate' : 'rentzestimate', 
    'hdpData.homeInfo.taxAssessedValue' : 'taxassessedvalue', 
    'hdpData.homeInfo.lotAreaValue' : 'lotareavalue',
    'hdpData.homeInfo.lotAreaUnit' : 'lotareaunit',
    'detailUrl' : 'url',


}

# Rename columns
df = df.rename(columns=column_rename)

## Convert to Sqft to Acres

In [8]:
df = convert_sqft_to_acres(df)

## Convert Datatypes

In [9]:
columns_to_convert = ['zipid', 'sqft', 'datesold', 'livingarea', 'zestimate', 'rentzestimate', 'taxassessedvalue', 'zipcode']

df[columns_to_convert] = df[columns_to_convert].fillna(0).astype(int)

df[columns_to_convert] = df[columns_to_convert].astype(int)

## Filter out "For Rent"

In [10]:
df = df[df['status'] != 'For Rent']

## Final Cleaned Data Frame

In [11]:
df

Unnamed: 0,zipid,status,fulladdress,beds,baths,sqft,timeonzillow,latitude,longitude,streetaddress,...,state,datesold,livingarea,HomeType,zestimate,rentzestimate,taxassessedvalue,lotareavalue,lotareaunit,url
0,26684051,Sold,"5538 Junius St, Dallas, TX 75214",2.0,3.0,1966,69571000,32.804660,-96.76064,5538 Junius St,...,TX,-2147483648,1966,SINGLE_FAMILY,627200,3611,537170,0.215,acres,/homedetails/5538-Junius-St-Dallas-TX-75214/26...
1,26807390,Sold,"3619 Norcross Ln, Dallas, TX 75229",3.0,2.0,1899,69571000,32.890760,-96.86020,3619 Norcross Ln,...,TX,-2147483648,1899,SINGLE_FAMILY,684800,3264,573280,0.267,acres,/homedetails/3619-Norcross-Ln-Dallas-TX-75229/...
2,26867404,Sold,"9545 Covemeadow Dr, Dallas, TX 75238",5.0,5.0,4679,69571000,32.881622,-96.72469,9545 Covemeadow Dr,...,TX,-2147483648,4679,SINGLE_FAMILY,1551500,9688,540000,0.260,acres,/homedetails/9545-Covemeadow-Dr-Dallas-TX-7523...
3,26873685,Sold,"6879 Anglebluff Cir, Dallas, TX 75248",3.0,3.0,2136,69571000,32.966454,-96.78907,6879 Anglebluff Cir,...,TX,-2147483648,2136,TOWNHOUSE,418200,3300,364980,0.125,acres,/homedetails/6879-Anglebluff-Cir-Dallas-TX-752...
4,303983624,Sold,"639 Aspen Valley Ln, Dallas, TX 75208",3.0,3.0,2592,69571000,32.750940,-96.85299,639 Aspen Valley Ln,...,TX,-2147483648,2592,SINGLE_FAMILY,690000,3999,686750,0.075,acres,/homedetails/639-Aspen-Valley-Ln-Dallas-TX-752...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,26760452,Sold,"5761 Greenbrier Dr, Dallas, TX 75209",4.0,3.0,2694,1538371000,32.858150,-96.81281,5761 Greenbrier Dr,...,TX,-2147483648,2694,SINGLE_FAMILY,1665800,9910,1156550,0.275,acres,/homedetails/5761-Greenbrier-Dr-Dallas-TX-7520...
496,26806884,Sold,"10315 Gooding Dr, Dallas, TX 75229",4.0,3.0,2917,1538371000,32.887215,-96.84372,10315 Gooding Dr,...,TX,-2147483648,2917,SINGLE_FAMILY,976300,5518,624000,0.442,acres,/homedetails/10315-Gooding-Dr-Dallas-TX-75229/...
497,26817284,Sold,"9625 Silver Meadow Dr, Dallas, TX 75217",4.0,2.0,1300,1538371000,32.723980,-96.65588,9625 Silver Meadow Dr,...,TX,-2147483648,1300,SINGLE_FAMILY,275100,1898,115410,0.237,acres,/homedetails/9625-Silver-Meadow-Dr-Dallas-TX-7...
498,26822652,Sold,"6525 Leaning Oaks St, Dallas, TX 75241",3.0,1.0,1260,1538371000,32.658287,-96.76967,6525 Leaning Oaks St,...,TX,-2147483648,1260,SINGLE_FAMILY,185300,1749,190800,0.160,acres,/homedetails/6525-Leaning-Oaks-St-Dallas-TX-75...


## Save Data Frame as a CSV

In [12]:
df.to_csv('recently_sold_houses.csv')