In [1]:
################################################################################                                                                                                                                                              
################################################################################
#
# FILE: loopnet-rent-process.ipynb
#
# BY: Dmitry Sedov 
#
# CREATED: Sun May 10 2020
#
# DESC: This code processes the rent data downloaded from loopnet to put it into a 
#       single dataset.
#
# EXEC:
#      
################################################################################
################################################################################

In [2]:
################################################################################

import pandas as pd
import sqlalchemy as db

import numpy as np

import json
import os
import re

################################################################################

In [3]:
############################### Constants ######################################

output_folder_path = '/home/user/projects/urban/data/input/Rent/loopnet/retail'
clean_rent_folder_path = '/home/user/projects/urban/data/output/rent'

################################################################################

In [4]:
# List with all files in the output directory
files = [f for f in os.listdir(output_folder_path) if os.path.isfile(os.path.join(output_folder_path, f))]

In [5]:
digit_pattern = re.compile(r'\d')
datasets = []
# Iterate through files, building tables with rent offers
for f in files:
    zip_tables = []
    zip_code = f.split('-')[1]
    # Read tables with listings
    tables = pd.read_html(os.path.join(output_folder_path, f),
                          attrs = {'class': 'listingAttributes'})
    for t in tables:
        t = t.transpose()
        t.columns = [n.strip(':') for n in t.iloc[0]]
        t = t.drop(t.index[0]).reset_index(drop = True)
        zip_tables.append(t)
    zip_table = pd.concat(zip_tables)
    zip_table.dropna(subset = ['Price'], inplace = True)
    zip_table['zip_code'] = zip_code
    mask = zip_table['Price'].apply(lambda x: bool(re.search(digit_pattern, x)))
    zip_table = zip_table[mask]
    datasets.append(zip_table)

In [6]:
loopnet_data = pd.concat(datasets).reset_index(drop = True)

In [7]:
loopnet_data.shape

(47023, 9)

In [8]:
loopnet_data

Unnamed: 0,Status,Price,Property Type,Sub-Type,Spaces,Space Available,Building Size,zip_code,Lot Size
0,For Lease,$2.92 SF/Mo,Multifamily,Retail,1 Space,"1,478 - 10,752 SF","35,220 SF",07666,
1,For Lease,$2.55 SF/Mo,Retail,Office/Retail,1 Space,785 SF,"3,200 SF",07666,
2,For Lease,$2.75 SF/Mo,Retail,Retail,4 Spaces,"1,516 - 6,331 SF","6,500 SF",07666,
3,For Lease,$2.25 SF/Mo,Retail,Retail,1 Space,"1,200 SF","9,600 SF",07666,
4,For Lease,$2.11 SF/Mo,Retail,Retail,1 Space,475 SF,"2,907 SF",07666,
...,...,...,...,...,...,...,...,...,...
47018,For Lease,$1.54 SF/Mo,Retail,Retail,1 Space,"3,785 SF","20,000 SF",63143,
47019,For Lease,$1.00 SF/Mo,Industrial,Retail,2 Spaces,"4,000 SF","70,000 SF",63143,
47020,For Lease,$1.67 SF/Mo,,Retail,1 Space,"1,200 SF","10,595 SF",30068,
47021,For Lease,$2.17 - $2.92 SF/Mo,,Retail,3 Spaces,"1,000 - 7,580 SF","70,314 SF",30068,


In [9]:
# Check that all prices are in $ per SF per Month
sf_mo_pattern = re.compile(r'SF/Mo')
test = loopnet_data['Price'].apply(lambda x: bool(re.search(sf_mo_pattern, x)))
assert test.sum() == loopnet_data.shape[0]

sf_pattern = re.compile(r'SF')
test = loopnet_data['Space Available'].apply(lambda x: bool(re.search(sf_pattern, x)))
assert test.sum() == loopnet_data.shape[0]

In [14]:
# Parse the prices
numbers = re.compile(r'(\d+)(\.\d+)?')
loopnet_data['rate'] = loopnet_data['Price'].apply(lambda row: np.mean([float(''.join(x)) for x in numbers.findall(row)]))
# Parse the sizes
sq_footage = re.compile(r'\d+')
loopnet_data['footage'] = loopnet_data['Space Available'
                                      ].apply(lambda row: np.mean([float(x) 
                                                                   for x in sq_footage.findall(row.replace(',',''))]))

In [23]:
loopnet_data['source'] = 'loopnet'

In [19]:
loopnet_data.rename(columns = {'Property Type': 'type', 'Sub-Type': 'subtype'}, 
                    inplace = True)

In [None]:
loopnet_data[['zip_code', 'type', 'subtype', 'rate', 'footage']]

In [83]:
loopnet_by_zip = loopnet_data.groupby('zip_code').aggregate({'rate': np.median})

In [93]:
loopnet_by_zip.quantile(q = 0.9)

rate    2.5
Name: 0.9, dtype: float64

In [24]:
loopnet_data[['source',
              'zip_code', 
              'type', 
              'subtype', 
              'rate', 
              'footage']
            ].to_csv(os.path.join(clean_rent_folder_path,
                                  'loopnet_listings.csv'),
                     index = False)