In [1]:
import numpy as np
import pandas as pd
from keys import all_craigslist_listings_path

# destination CSV for the cleaned listings (handed to df.to_csv in the save step)
output_path = "data/craigslist_listings_cleaned.csv"

## Load the data

In [2]:
def to_float(string_value):
    """Convert a raw CSV field to a float, treating blank fields as missing.

    Args:
        string_value: the cell text as a str.

    Returns:
        The parsed float, or NaN when the text is empty or whitespace-only.

    Raises:
        ValueError: if the stripped text is non-empty but not a valid number.
    """
    string_value = string_value.strip()
    # the builtin float is used instead of np.float: that alias (which was just
    # the builtin) was deprecated in NumPy 1.20 and removed in NumPy 1.24
    return float(string_value) if string_value else np.nan

# per-column parsers for read_csv: text columns are kept as str, while numeric
# columns go through to_float so that empty fields become NaN
converters = dict.fromkeys(
    ['neighborhood', 'title', 'pid', 'date', 'link', 'sourcepage'], str)
converters.update(dict.fromkeys(
    ['price', 'bedrooms', 'sqft', 'longitude', 'latitude'], to_float))

In [3]:
%%time
# read the full combined data set with the per-column converters, then shorten
# the price/coordinate column names
all_listings = (
    pd.read_csv(all_craigslist_listings_path, converters=converters)
      .rename(columns={'price': 'rent', 'longitude': 'lng', 'latitude': 'lat'})
)
print(len(all_listings))

10958372
Wall time: 1min 40s


## Filter the data

In [4]:
# keep only the first row seen for each PID
df = all_listings[~all_listings.duplicated(subset='pid', keep='first')]
df.shape[0]

5480435

In [5]:
# keep a row only when both of its coordinates are present
mask = df['lng'].notnull() & df['lat'].notnull()
df = df.loc[mask]
df.shape[0]

2335741

In [6]:
# cut-off percentiles used to derive the accepted rent and sqft ranges
lower_pctl, upper_pctl = 0.002, 0.998

# interpolation='lower'/'higher' makes each bound an observed data value rather
# than a value interpolated between two observations
lower_rent = df['rent'].quantile(lower_pctl, interpolation='lower')
upper_rent = df['rent'].quantile(upper_pctl, interpolation='higher')

lower_sqft = df['sqft'].quantile(lower_pctl, interpolation='lower')
upper_sqft = df['sqft'].quantile(upper_pctl, interpolation='higher')

print('valid rent range: {:.0f} - {:.0f}'.format(lower_rent, upper_rent))
print('valid sqft range: {:.0f} - {:.0f}'.format(lower_sqft, upper_sqft))

valid rent range: 100 - 10000
valid sqft range: 220 - 5100


In [7]:
# keep a row when its rent is missing or strictly between the rent bounds, and
# likewise for sqft; the comparisons are strict, so rows sitting exactly on a
# bound are dropped
# NOTE(review): confirm that excluding the bound values themselves is intended
rent_mask = df['rent'].isnull() | (df['rent'].gt(lower_rent) & df['rent'].lt(upper_rent))
sqft_mask = df['sqft'].isnull() | (df['sqft'].gt(lower_sqft) & df['sqft'].lt(upper_sqft))
df = pd.DataFrame(df[rent_mask & sqft_mask])
df.shape[0]

2321070

In [8]:
# drop repeat listings: a row is removed when it has both rent and bedrooms
# filled in AND an earlier row already carries the same rent, sqft, bedrooms
# and neighborhood (only rent and bedrooms are null-checked here)
not_null = df['rent'].notnull() & df['bedrooms'].notnull()
duplicated = df.duplicated(subset=['rent', 'sqft', 'bedrooms', 'neighborhood'])
df = df[~not_null | ~duplicated]
df.shape[0]

1393426

## Create and clean up features

In [9]:
# extract the subdomain/region from the link column, e.g.
# 'http://santabarbara.craigslist.org/...' -> 'santabarbara'
#  - raw string with escaped dots, so '.' only matches a literal dot
#  - 'https?' so links using either scheme are recognised
#  - assign() builds a new frame instead of writing a column into the filtered
#    frame from the previous step, which can raise SettingWithCopyWarning
df = df.assign(
    region=df['link'].str.extract(r'https?://(.*)\.craigslist\.org', expand=False)
)

In [10]:
# discard the columns that are no longer needed, then put the remaining
# columns into a fixed order
df = (
    df.drop(columns=['title', 'neighborhood', 'link', 'sourcepage', 'pid'])
      .reindex(columns=['date', 'region', 'bedrooms', 'rent', 'sqft', 'lat', 'lng'])
)

In [11]:
# preview the first five rows of the cleaned frame
df.head(5)

Unnamed: 0,date,region,bedrooms,rent,sqft,lat,lng
1,2014-05-11,santabarbara,3.0,3500.0,1200.0,34.399757,-119.726987
2,2014-05-11,santabarbara,2.0,850.0,882.0,34.411019,-119.855845
4,2014-05-11,santabarbara,1.0,1290.0,,34.410415,-119.85433
5,2014-05-11,santabarbara,1.0,1660.0,,34.44146,-119.754324
6,2014-05-11,santabarbara,2.0,3200.0,,34.410416,-119.85453


## Save to disk

In [12]:
%%time
# write the cleaned listings as UTF-8 CSV without the index column, then echo
# the destination path
df.to_csv(output_path, encoding='utf-8', index=False)
print(output_path)

data/craigslist_listings_cleaned.csv
Wall time: 9.78 s
