In [None]:
################################################################################                                                                                                                                                              
################################################################################
#
# FILE: crexi-rent-process.ipynb
#
# BY: Dmitry Sedov 
#
# CREATED: Fri Mar 27 2020
#
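# DESC: This code processes the rent data downloaded from Crexi into a single
#       listings dataset, aggregates it to mean rent by zip code, and compares
#       that with average restaurant visits per zip code from the database.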
#
# EXEC:
#      
################################################################################
################################################################################

In [None]:
############################### Libraries ######################################

import json
import os
import re
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as db
%matplotlib inline

# R integration
%load_ext rpy2.ipython

################################################################################

In [None]:
############################### Options ########################################

# Use seaborn's 'tab10' palette as the default color cycle for Python plots.
sns.set_palette('tab10')

################################################################################

In [None]:
############################### Constants ######################################

# Folder holding the raw Crexi retail listing files read by this notebook
# (despite its name, this is an input location here).
output_folder_path = '/home/user/projects/urban/data/input/Rent/Crexi/retail'
# Folder where the cleaned listings CSV is written.
clean_rent_folder_path = '/home/user/projects/urban/data/output/rent'

################################################################################

In [None]:
# Names of the regular files (sub-directories excluded) in the raw listings
# folder. os.listdir returns entries in arbitrary order, so the names are
# sorted to keep the row order of every downstream dataset reproducible.
files = sorted(
    entry
    for entry in os.listdir(output_folder_path)
    if os.path.isfile(os.path.join(output_folder_path, entry))
)

In [None]:
def _read_listings(file_name):
    """Load one raw listings file into a DataFrame tagged with its zip code.

    Parameters
    ----------
    file_name : str
        Name of a JSON file inside ``output_folder_path``. The file must hold
        an object with a 'Data' key whose value pandas can turn into a frame.

    Returns
    -------
    pandas.DataFrame
        The records under 'Data' plus a 'zip_code' column.
    """
    file_path = os.path.join(output_folder_path, file_name)
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(file_path, 'r', encoding='utf-8') as the_file:
        payload = json.load(the_file)
    listings = pd.DataFrame(payload['Data'])
    # assumes characters 3-7 of the file name are the zip code - TODO confirm
    # against the naming scheme of the download step
    listings['zip_code'] = file_name[3:8]
    return listings


# One frame per raw file. Using a helper keeps per-file temporaries out of the
# notebook namespace (the name `data` is later used for the combined frame).
datasets = [_read_listings(f) for f in files]

In [None]:
# Stack the per-file frames, keep only listings that report a monthly rate,
# drop exact duplicate rows, and retain the three columns used downstream.
data = pd.concat(datasets)
subdata = (
    data[data['RateMonthly'].notnull()]
    .drop_duplicates()
    .loc[:, ['zip_code', 'Description', 'RateMonthly']]
    .reset_index(drop=True)
)

In [None]:
# Sanity check: (rows, columns) of the filtered listings.
subdata.shape

In [None]:
# Clean the rent data
numbers = re.compile(r'(\d+)(\.\d+)?')
# A comma between a digit and a group of exactly three digits is a thousands
# separator ("1,250.50"). It has to be removed before matching, otherwise
# "1,250" is read as the two numbers 1 and 250 and averaged to 125.5.
thousands_separator = re.compile(r'(?<=\d),(?=\d{3}(?!\d))')


def parse_monthly_rate(raw_rate):
    """Return the mean of all numbers found in a rate string.

    A range such as "1,000 - 1,500" therefore yields its midpoint. Returns
    NaN when the text contains no number at all.
    """
    text = thousands_separator.sub('', str(raw_rate))
    values = [float(''.join(parts)) for parts in numbers.findall(text)]
    # np.mean([]) would also give NaN, but with a RuntimeWarning per row.
    return np.mean(values) if values else np.nan


subdata['rate'] = subdata['RateMonthly'].apply(parse_monthly_rate)

In [None]:
# Get the type
def extract_first(x):
    """Return the first '|'-separated field of a description, trimmed.

    Parameters
    ----------
    x : str or any
        Listing description. Objects without a ``split`` method (for example
        NaN or None for a missing description) are accepted.

    Returns
    -------
    str
        The text before the first '|' with surrounding whitespace removed,
        or '' when ``x`` is not a string.
    """
    try:
        first_field = x.split('|')[0]
    except AttributeError:
        return ''
    # Trim the padding next to the separator so that one property type is not
    # recorded under several whitespace variants ("Retail" vs "Retail ").
    return first_field.strip()
# Property type: the leading field of each listing description.
subdata['type'] = subdata['Description'].map(extract_first)

# Raw string: '\|', '\d' and '\s' are regex escapes, not Python string escapes.
# Matches a '|', a run of digits/commas/whitespace, the literal 'sq. ft.', and
# any following text up to the next '|' or '$' character ('$' is a literal
# inside the character class, not an end-of-string anchor).
sq_ft_pattern = re.compile(r'\|[\d,\s]+sq\. ft\.[^\|$]*')
def extract_footage(x):
    """Return the first square-footage field found in a description.

    Parameters
    ----------
    x : any
        Listing description; it is converted with ``str`` first, so missing
        values such as NaN are accepted.

    Returns
    -------
    str
        The first match of ``sq_ft_pattern`` with leading/trailing spaces and
        '|' characters removed, or '' when nothing matches.
    """
    match = sq_ft_pattern.search(str(x))
    if match is None:
        return ''
    return match.group(0).strip(' |')
# Raw footage text per listing (for example '1,200 sq. ft.'), or '' when the
# description has no footage field.
footage_text = subdata['Description'].apply(extract_footage)

sq_footage = re.compile(r'\d+')


def footage_to_number(text):
    """Return the mean of the integers in a footage string, NaN if none.

    Commas are removed first so that '1,200' is read as 1200.
    NOTE(review): every run of digits in the text is averaged, including any
    digits in text that follows 'sq. ft.' within the same field - confirm
    this is intended.
    """
    values = [float(digits) for digits in sq_footage.findall(text.replace(',', ''))]
    # np.mean([]) also yields NaN but emits a RuntimeWarning for every listing
    # without a footage field, so return NaN explicitly.
    return np.mean(values) if values else np.nan


subdata['footage'] = footage_text.apply(footage_to_number)

subdata['source'] = 'crexi'

In [None]:
# Preview the cleaned listing-level columns.
subdata[['source', 'zip_code', 'type', 'rate', 'footage']]

In [None]:
# Write the cleaned listing-level dataset (row index included) to CSV.
listing_columns = ['source', 'zip_code', 'type', 'rate', 'footage']
listings_csv_path = os.path.join(clean_rent_folder_path, 'crexi_listings.csv')
subdata[listing_columns].to_csv(listings_csv_path)

In [None]:
# Mean parsed rate per zip code; zip_code becomes the index of the result.
zip_rent = subdata.groupby('zip_code')[['rate']].mean()

In [None]:
# Move zip_code from the group index into a regular column.
# Guarded so that re-running this cell is a no-op instead of adding a
# spurious 'index' column on the second run.
if zip_rent.index.name == 'zip_code':
    zip_rent.reset_index(inplace = True)

In [None]:
# Display the zip-level mean rent table.
zip_rent

In [None]:
######################### Database connections #################################

# Connect to the database via SQLalchemy.
# The URL is a template whose placeholders must be filled in before use; the
# values are read from the standard libpq environment variables so that no
# credentials are stored in the notebook. A missing variable raises KeyError.
# User name and password are URL-quoted because characters such as '@' or '/'
# would otherwise corrupt the URL.
engine = db.create_engine(
    'postgresql://{user}:{user_pass}@{host}/{dataname2}'.format(
        user=quote_plus(os.environ['PGUSER']),
        user_pass=quote_plus(os.environ['PGPASSWORD']),
        host=os.environ['PGHOST'],
        dataname2=os.environ['PGDATABASE'],
    )
)
connection = engine.connect()

################################################################################

In [None]:
# Query the average restaurant visit count per zip code for October 2018.
# The CTE pairs every restaurant with its visits row for year 2018, month 10
# (LEFT JOIN, so restaurants without such a row get a NULL count); the outer
# query then drops the NULL counts and averages raw_visit_counts by zip_code.
# The result is only read into a DataFrame; no database table is created.
zip_restaurants_table_statement = """
WITH month_restaurants AS (
    SELECT
        r.sname_place_id,
        r.zip_code,
        v.raw_visit_counts
    FROM
        restaurants AS r
    LEFT JOIN
        visits AS v
    ON  
        r.sname_place_id = v.sname_place_id
    AND
        v.year = 2018 
    AND
        v.month = 10
) 
SELECT 
    zip_code,
    AVG(raw_visit_counts) AS avg_rest_visits
FROM
    month_restaurants
WHERE 
    raw_visit_counts IS NOT NULL
GROUP BY 
    zip_code;
"""

# Run the query through the engine and load the rows into a DataFrame.
zip_restaurants_table = pd.read_sql(zip_restaurants_table_statement, engine)

In [None]:
####################### Close database connections #############################

# Close the connection that was opened together with the engine before
# disposing of the pool: engine.dispose() does not close connections that are
# still checked out, so it would otherwise stay open.
connection.close()
engine.dispose()

################################################################################

In [None]:
# Preview the zip-level average restaurant visits returned by the query.
zip_restaurants_table.head()

In [None]:
# Inner-join the two zip-level tables on zip_code; the validation raises if
# zip_code is not unique on either side.
merged = zip_restaurants_table.merge(
    zip_rent,
    how='inner',
    on='zip_code',
    validate='one_to_one',
)

In [None]:
# Preview the merged zip-level table (average visits and mean rent).
merged.head()

In [None]:
# Push the merged zip-level rent / restaurant-visits dataset to R
%Rpush merged

In [None]:
%%R
library(ggplot2)
library(dplyr)
# Function to remove outliers
# Replace values lying more than one IQR below the first quartile or more than
# one IQR above the third quartile with NA; other values are returned as is.
#   x      numeric vector
#   na.rm  passed to quantile() and IQR(); TRUE by default
#   ...    further arguments, forwarded to quantile() only
remove_outliers <- function(x, na.rm = TRUE, ...) {
    quartiles <- quantile(x, probs = c(.25, .75), na.rm = na.rm, ...)
    # Fence width: 1 x IQR (note: not the conventional 1.5 x IQR)
    margin <- 1 * IQR(x, na.rm = na.rm)
    lower_fence <- quartiles[1] - margin
    upper_fence <- quartiles[2] + margin
    cleaned <- x
    cleaned[x < lower_fence] <- NA
    cleaned[x > upper_fence] <- NA
    cleaned
}
# Remove outliers from both plotted variables, replacing them with NA in place.
# The function is passed directly: wrapping it in funs() is deprecated in dplyr
# and a single unnamed function keeps the original column names either way.
merged <- merged %>% 
    mutate_at(vars(avg_rest_visits, rate), 
              remove_outliers)

In [None]:
%%R
# Plot mean rent rate against average restaurant visits (one point set per
# zip-level table): a smoothed trend line plus the mean rate within 30 bins
# of average visits.
ggplot(merged, aes(avg_rest_visits, rate)) +
    geom_smooth() +
    stat_summary_bin(geom = 'point', fun.y = 'mean', bins = 30, size = 1) +
    scale_color_brewer(palette = 'Set1') +
    scale_fill_brewer(palette = 'Set1')

In [None]:
# Scratch cell: look up the documentation of the compiled pattern's findall().
help(numbers.findall)

In [None]:
# Scratch cell: check what the footage pattern matches in the first listing.
sq_ft_pattern.findall(subdata.loc[0, 'Description'])

In [None]:
# Scratch cell: show the raw description of the first listing.
subdata.loc[0, 'Description']

In [None]:
# Scratch cell: look up the documentation of str.strip.
help(str.strip)