In [None]:
################################################################################                                                                                                                                                              
################################################################################
#
# FILE: commercialexchange-retail-rent-process.ipynb
#
# BY: Dmitry Sedov 
#
# CREATED: Sun Apr 12 2020
#
# DESC: This code processes rent data downloaded from commercialexchange.com.
#
# EXEC:
#      
################################################################################
################################################################################

In [None]:
############################### Libraries ######################################

import json
import os
import re
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as db
import us
%matplotlib inline

# R integration
%load_ext rpy2.ipython

################################################################################

In [None]:
######################### Constants and settings ###############################

# Folder with the downloaded listing files that the loading cell parses as JSON
output_folder_path = '/home/user/projects/urban/data/input/Rent/commercialexchange/retail/results'
# Folder that receives the cleaned listings CSV
clean_rent_folder_path = '/home/user/projects/urban/data/output/rent'

# Let long frames / series print up to 999 rows before pandas truncates them
pd.set_option('display.max_rows', 999)

################################################################################

In [None]:
def _parse_listing(raw):
    """Flatten one parsed listing JSON object into a flat record.

    Only the first entry of ``listedSpaces`` and the first entry of that
    space's ``askingRent`` list are read.

    Args:
        raw: the object returned by ``json.load`` for one listing file.

    Returns:
        A dict with keys pid, zip_code, size, price_min, price_max, period and
        price_size, or None when the listing has no postal code, no listed
        space, no lease, or no asking rent and should be skipped.

    Raises:
        KeyError: if 'id', 'listedSpaces', or the rent's 'price', 'amount' or
            'period' entries are missing; those are not treated as optional.
    """
    pid = raw['id']

    # TypeError covers a null 'location' / 'address' object, not just an absent key
    try:
        zip_code = raw['location']['address']['postalCode']
    except (KeyError, TypeError):
        return None

    spaces = raw['listedSpaces']
    if not spaces:
        return None
    first_space = spaces[0]

    # One guard for: no lease key, null/empty lease, no askingRent key, empty askingRent
    lease = first_space.get('lease')
    if not lease or not lease.get('askingRent'):
        return None

    # Available size is optional; listings without it are kept with size = None
    try:
        size = first_space['space']['size']['available']
    except (KeyError, TypeError):
        size = None

    price = lease['askingRent'][0]['price']
    amount = price['amount']
    return {'pid': pid,
            'zip_code': zip_code,
            'size': size,
            'price_min': amount['minimum']['amount'] if 'minimum' in amount else None,
            'price_max': amount['maximum']['amount'] if 'maximum' in amount else None,
            'period': price['period'],
            'price_size': price.get('size')}


# Get all regular files; sorted so that repeated runs build the rows in the same order
files = sorted(f for f in os.listdir(output_folder_path)
               if os.path.isfile(os.path.join(output_folder_path, f)))

# Import the data from files
datasets = []
for f in files:
    # JSON is read as UTF-8 explicitly instead of relying on the locale default
    with open(os.path.join(output_folder_path, f), 'r', encoding='utf-8') as the_file:
        try:
            raw = json.load(the_file)
        except (json.JSONDecodeError, UnicodeDecodeError):
            # Name the offending file before letting the error propagate
            print(f)
            raise
    part = _parse_listing(raw)
    if part is not None:
        datasets.append(part)

In [None]:
# One row per parsed listing record, with exact duplicate rows removed
# (the surviving rows keep their original index labels)
data = pd.DataFrame(datasets).drop_duplicates()

In [None]:
# Preview the first five parsed listings
data.head(n=5)

In [None]:
# Distinct billing periods present; the unification step only special-cases 'ANNUAL'
data.loc[:, 'period'].unique()

In [None]:
# Distinct price bases present; the unification step only special-cases 'TOTAL'
data.loc[:, 'price_size'].unique()

In [None]:
# Unify the price / rent data (vectorized; no row-wise apply)

# Midpoint of the quoted range. mean(axis=1) skips missing bounds, so a listing with
# only a minimum or only a maximum uses that single value, and a listing with neither
# gets NaN without the "mean of empty slice" warning np.nanmean would emit.
price_bounds = data[['price_min', 'price_max']].apply(pd.to_numeric)
data['price'] = price_bounds.mean(axis = 1)

# Prices quoted per year are divided by 12; every other period is left unchanged
monthly_price = data['price'].where(data['period'] != 'ANNUAL', data['price'] / 12)

# Prices quoted for the whole space ('TOTAL') are divided by the available size.
# A missing or non-positive size gives NaN instead of a division by zero.
available_size = pd.to_numeric(data['size'])
available_size = available_size.where(available_size > 0)
data['rate'] = monthly_price.where(data['price_size'] != 'TOTAL',
                                   monthly_price / available_size)

In [None]:
# Discard listings whose unified rate is 500 or more.
# Rows with a missing rate compare False and are therefore kept.
outlier_rows = data.index[(data['rate'] >= 500).values]
data.drop(index = outlier_rows, inplace = True)

In [None]:
# 'size' becomes 'footage'; every row is tagged with the property type
# (the requests were made for retail listings) and with the data source
data = (
    data
    .rename(columns = {'size': 'footage'})
    .assign(type = 'Retail', source = 'commercialexchange')
)

In [None]:
# Preview the first five rows after rate unification and labelling
data.head(n=5)

In [None]:
# Columns written to the cleaned listings file, in output order
listing_columns = ['source', 'zip_code', 'type', 'rate', 'footage']
listings_csv_path = os.path.join(clean_rent_folder_path, 'commercialexchange_listings.csv')

# The DataFrame index is written as the first (unnamed) CSV column
data[listing_columns].to_csv(listings_csv_path)

In [None]:
# Mean listing rate per zip code. The listings frame has no 'rent' column (the unified
# price is stored in 'rate'), so the mean of 'rate' is computed and exposed as 'rent',
# the column name the merge and plotting cells expect. zip_code stays the index here.
subdata = data.groupby('zip_code')['rate'].mean().to_frame('rent')

In [None]:
# (rows, columns) of the aggregate: one row per zip_code group
subdata.shape

In [None]:
# Move the zip_code group index back into a regular column so it can be used as a merge key
subdata = subdata.reset_index()

In [None]:
######################### Database connections #################################

# Connect to the database via SQLAlchemy.
# The URL template is filled from the standard PostgreSQL environment variables
# (PGUSER, PGPASSWORD, PGHOST, PGDATABASE) so no credentials live in the notebook;
# a missing variable fails immediately with a KeyError naming it.
# User and password are URL-quoted because characters such as '@' or '/' would
# otherwise break the URL.
engine = db.create_engine(
    'postgresql://{user}:{user_pass}@{host}/{dataname2}'.format(
        user = quote_plus(os.environ['PGUSER']),
        user_pass = quote_plus(os.environ['PGPASSWORD']),
        host = os.environ['PGHOST'],
        dataname2 = os.environ['PGDATABASE'],
    )
)
connection = engine.connect()

################################################################################

In [None]:
# Average raw visit count per zip code, over restaurants joined to their visits row
# for year = 2018, month = 10. The LEFT JOIN keeps restaurants without such a row,
# but the outer WHERE then drops every row whose raw_visit_counts is NULL.
zip_restaurants_table_statement = """
WITH month_restaurants AS (
    SELECT
        r.sname_place_id,
        r.zip_code,
        v.raw_visit_counts
    FROM
        restaurants AS r
    LEFT JOIN
        visits AS v
    ON  
        r.sname_place_id = v.sname_place_id
    AND
        v.year = 2018 
    AND
        v.month = 10
) 
SELECT 
    zip_code,
    AVG(raw_visit_counts) AS avg_rest_visits
FROM
    month_restaurants
WHERE 
    raw_visit_counts IS NOT NULL
GROUP BY 
    zip_code;
"""

# Run the query through the engine and load the result as a DataFrame
zip_restaurants_table = pd.read_sql(sql = zip_restaurants_table_statement,
                                    con = engine)

In [None]:
####################### Close database connections #############################

# Close the connection that was checked out right after the engine was created:
# engine.dispose() only releases pooled connections, not ones still checked out.
connection.close()
engine.dispose()

################################################################################

In [None]:
# Keep only zip codes present in both tables; validate raises if zip_code
# is duplicated on either side
merged = zip_restaurants_table.merge(
    subdata,
    on = 'zip_code',
    how = 'inner',
    validate = 'one_to_one',
)

In [None]:
# Scatter of zip-level mean rent against mean restaurant visits, with a fitted regression line
sns.regplot(data = merged, x = 'avg_rest_visits', y = 'rent')

In [None]:
# Copy the merged DataFrame into the R session provided by the rpy2 extension
%Rpush merged

In [None]:
%%R
library(ggplot2)
library(dplyr)

# Replace values lying more than H = 1 * IQR below the first quartile or above the
# third quartile with NA. Returns a vector of the same length as x.
#   x     : numeric vector
#   na.rm : passed to quantile() and IQR(); TRUE ignores existing NAs
#   ...   : extra arguments forwarded to quantile()
remove_outliers <- function(x, na.rm = TRUE, ...) {
    qnt <- quantile(x, probs = c(.25, .75), na.rm = na.rm, ...)
    H <- 1 * IQR(x, na.rm = na.rm)
    y <- x
    y[x < (qnt[1] - H)] <- NA
    y[x > (qnt[2] + H)] <- NA
    y
}

# Blank out outliers in both plotted variables (columns are overwritten in place).
# The function is passed directly because the funs() wrapper is deprecated in dplyr.
merged <- merged %>%
    mutate_at(vars(avg_rest_visits, rent), remove_outliers)

# Smoothed trend plus the mean rent within each of 30 bins of avg_rest_visits.
# NOTE(review): fun.y is the older ggplot2 argument name (renamed to `fun` in 3.3.0);
# kept here so the cell still runs on older installs - confirm the installed version.
# NOTE(review): no colour/fill aesthetic is mapped in aes(), so the two brewer scales
# appear to have no visible effect.
ggplot(data = merged,
       aes(x = avg_rest_visits,
           y = rent)) +
    geom_smooth() +
    stat_summary_bin(fun.y = 'mean', bins = 30, size = 1, geom = 'point') +
    scale_color_brewer(palette = 'Set1') +
    scale_fill_brewer(palette = 'Set1')