## Connect to Drive

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab VM. Every data path used by the later
# cells of this notebook is located under this mount point.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import requests
import re
import numpy as np
import json

from zipfile import ZipFile
# from keplergl import KeplerGl
from datetime import date, datetime, timedelta
from bs4 import BeautifulSoup

# Allow up to 100 columns and 100 rows to be shown before pandas truncates
# a displayed DataFrame.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

### Collect Monthly POI from SafeGraph

In [None]:
# The monthly patterns export is split across four gzip-compressed CSV parts
# that are read relative to this working directory.
os.chdir('/content/drive/My Drive/safegraph_data/safegraph_monthly_data/')

monthly_part_names = (
    'patterns-part1.csv.gz', 'patterns-part2.csv.gz',
    'patterns-part3.csv.gz', 'patterns-part4.csv.gz',
)

# Read every part and stack them into one frame (original row indices kept).
all_regions_df = pd.concat(
    [pd.read_csv(part_name, compression='gzip') for part_name in monthly_part_names]
)

# Keep California rows only.
# 508,974 (figure noted by the original author; presumably the CA row count)
PRIOR_MONTH_DF = all_regions_df[all_regions_df['region'] == 'CA']

# Release the unfiltered data so it can be garbage-collected.
del monthly_part_names, all_regions_df

## Collect LA County Communities (from Los Angeles Almanac)

In [None]:
url = 'http://www.laalmanac.com/communications/cm02_communities.php'
# Bound the wait so a stalled server cannot hang the notebook, and fail loudly
# on an HTTP error instead of silently parsing an error page into empty dicts.
req = requests.get(url, timeout=30)
req.raise_for_status()
soup = BeautifulSoup(req.text, 'html.parser')
td_data = soup.find_all('td')


# Extract community and zip code data.
# The <td> cells are consumed as consecutive (community, zip codes) pairs:
# even positions hold the community label, the following cell its ZIP text.
# Both dicts are keyed by the 0-based pair number.
communities = {}
zipcodes = {}
# Stop at len - 1 so an unpaired trailing cell is skipped rather than raising
# IndexError on the missing partner cell.
for idx, pos in enumerate(range(0, len(td_data) - 1, 2)):
    communities[idx] = td_data[pos].text.strip()
    zipcodes[idx] = td_data[pos + 1].text.strip()

In [None]:
la_communities_n_zip = []
cities = []
cities_zips = []
# cities_zips_df = []

# Pair every community label with its ZIP text as 'label---zips'.
for i in zipcodes:
    la_communities_n_zip.append(communities[i] + '---' + zipcodes[i])

# Ordered (pattern, replacement) rewrite rules applied to each community label.
# Order matters: each rule sees the output of the previous one.
city_name_rules = (
    # Drop a leading "Los Angeles" prefix, a "(Los Angeles)" tag, the text
    # "PO Boxes", and everything from the first "/" onwards.
    (r"(^Los.Angeles.|\(Los Angeles\)|PO Boxes|\/.*)", ""),
    # Collapse sub-area labels onto the bare city name.
    (r"(^Pasadena.*)", "Pasadena"),
    (r"(^Alhambra.*)", "Alhambra"),
    (r"(^Downtown.*)", "Downtown"),
    (r"(.*Long Beach.*)", "Long Beach"),
    (r"(Santa Clarita )", ""),
    # Officially 'West Rancho Dominguez'. The optional "West " is consumed so
    # a label that already has the prefix is not turned into "West West ...".
    (r"((?:West )?Rancho Dominguez.*)", "West Rancho Dominguez"),
    # The airport is filed under 'Los Angeles'.
    (r"(Los Angeles International Airport.*)", "Los Angeles"),
)

for entry in la_communities_n_zip:
    parts = entry.split('---')

    item = parts[0].strip()
    for pattern, replacement in city_name_rules:
        item = re.sub(pattern, replacement, item)

    # Remove any remaining parentheses, then a single trailing space.
    item = re.sub(r"(\(|\))", "", item.strip())
    item = re.sub(r" $", "", item)

    cities.append(item)
    cities_zips.append(parts[1])


In [None]:
# Ordered (pattern, replacement) rewrite rules applied to each community label.
# Order matters: each rule sees the output of the previous one.
community_name_rules = (
    # Drop a leading "Los Angeles" prefix, a "(Los Angeles)" tag, the text
    # "PO Boxes", and everything from the first "/" onwards.
    (r"(^Los.Angeles.|\(Los Angeles\)|PO Boxes|\/.*)", ""),
    # Collapse sub-area labels onto the bare city name.
    (r"(^Pasadena.*)", "Pasadena"),
    (r"(^Alhambra.*)", "Alhambra"),
    (r"(^Downtown.*)", "Downtown"),
    (r"(.*Long Beach.*)", "Long Beach"),
    (r"(Santa Clarita )", ""),
    # Officially 'West Rancho Dominguez'. The optional "West " is consumed so
    # a label that already has the prefix is not turned into "West West ...".
    (r"((?:West )?Rancho Dominguez.*)", "West Rancho Dominguez"),
    # The airport is filed under 'Los Angeles'.
    (r"(Los Angeles International Airport.*)", "Los Angeles"),
)

# Normalise every scraped community label and keep each distinct result once.
normalized_names = set()
for val in communities.values():
    item = val.strip()
    for pattern, replacement in community_name_rules:
        item = re.sub(pattern, replacement, item)

    # Remove any remaining parentheses, then a single trailing space.
    item = re.sub(r"(\(|\))", "", item.strip())
    item = re.sub(r" $", "", item)

    normalized_names.add(item)

# Sort before building the frame: plain set iteration order for strings can
# differ between kernel restarts, which made the row order irreproducible.
# Passing `columns=` also keeps an empty scrape from failing on column naming.
LA_communities_df = pd.DataFrame(sorted(normalized_names), columns=['city'])
del normalized_names

## Collect SafeGraph Points of Interest Data (Time independent)

In [None]:
os.chdir('/content/drive/My Drive/safegraph_data/SafeGraph_POI_Data/')

filename = 'Archive.zip'
# Gzip-compressed CSV members of the archive that together hold the core POI table.
poi_part_names = (
    'core_poi-part1.csv.gz',
    'core_poi-part2.csv.gz',
    'core_poi-part3.csv.gz',
    'core_poi-part4.csv.gz',
    'core_poi-part5.csv.gz',
)
poi = []

# The handle is named `archive`, not `zip`: binding `zip` here would replace
# the builtin zip() for every cell executed afterwards in this kernel.
with ZipFile(filename, 'r') as archive:
  for part_name in poi_part_names:
    with archive.open(part_name) as myfile:
      poi.append(pd.read_csv(myfile, compression='gzip'))

poi_df = pd.concat(poi)
del poi, poi_part_names

# Keep California rows only.
POI_DF = poi_df[poi_df['region']=='CA']

# Inner join on 'city': only POIs whose city appears in LA_communities_df remain.
la_poi = pd.merge(POI_DF, LA_communities_df, on='city', how='inner')

# la_poi.csv is written one level up, in the safegraph_data folder.
os.chdir('/content/drive/My Drive/safegraph_data/')
la_poi.to_csv('la_poi.csv')

# Return to the monthly data folder; later cells write relative to it.
os.chdir('/content/drive/My Drive/safegraph_data/safegraph_monthly_data/')

del poi_df, POI_DF


In [None]:
# la_poi.head()

Unnamed: 0,placekey,safegraph_place_id,parent_placekey,parent_safegraph_place_id,location_name,safegraph_brand_ids,brands,top_category,sub_category,naics_code,latitude,longitude,street_address,city,region,postal_code,iso_country_code,phone_number,open_hours,category_tags,opened_on,closed_on,tracking_opened_since,tracking_closed_since
0,zzw-227@5z4-ztv-2rk,sg:01ba472a875d402e95738e6a93cd6e2f,,,Uplifters Kitchen,,,Restaurants and Other Eating Places,Snack and Nonalcoholic Beverage Bars,722515.0,34.019287,-118.455474,2819 Ocean Park Blvd,Santa Monica,CA,90405,US,13106650000.0,"{ ""Mon"": [[""7:00"", ""17:00""]], ""Tue"": [[""7:00"",...","Coffee Shop,Brunch",,,,2019-07
1,22g-224@5z4-zx3-sdv,sg:252a5b0f518a4660a70e9eb99d524dee,,,Friendly Moving Experts,,,Specialized Freight Trucking,Used Household and Office Goods Moving,484210.0,34.018058,-118.500052,201 Wilshire Blvd,Santa Monica,CA,90401,US,12137850000.0,"{ ""Mon"": [[""7:30"", ""19:30""]], ""Tue"": [[""7:30"",...",,,,,2019-07
2,223-228@5z4-zx2-6zf,sg:2c26e540a5a6476bacd62cf6beb03d2b,,,Loews Hotels,SG_BRAND_56bafb9772a09aeb,Loews Hotels,Traveler Accommodation,Hotels (except Casino Hotels) and Motels,721110.0,34.008854,-118.49276,1700 Ocean Ave,Santa Monica,CA,90401,US,,"{ ""Mon"": [[""0:00"", ""24:00""]], ""Tue"": [[""0:00"",...",,,,,
3,22z-222@5z4-zwz-c3q,sg:47798809eeec425daec38e4502b5fdd2,,,Ray Doktor Psy D,,,Offices of Other Health Practitioners,Offices of Mental Health Practitioners (except...,621330.0,34.021919,-118.496609,1128 7th St Apt 305,Santa Monica,CA,90403,US,14243230000.0,"{ ""Mon"": [[""9:00"", ""19:00""]], ""Tue"": [[""9:00"",...",,,,,2019-07
4,zzy-222@5z4-zwz-wc5,sg:c3d00806c2ad420f93d8134f0cc53f69,zzw-222@5z4-zwz-yn5,sg:038098e7aec04f6488a691626ea23211,Santa Monica Surf School,,,"Sporting Goods, Hobby, and Musical Instrument ...",Sporting Goods Stores,451110.0,34.003081,-118.487822,104 Hollister Ave,Santa Monica,CA,90405,US,13105260000.0,,,,,,2019-07


## Refine prior month data to LA only

In [None]:
# Inner join on 'city': only monthly rows whose city appears in
# LA_communities_df are kept.
PRIOR_MONTH_DF = PRIOR_MONTH_DF.merge(LA_communities_df, how='inner', on='city')
# PRIOR_MONTH_DF.head()

## Average Function

In [None]:
def get_daily_average_for_a_month(data_input):
  """
  Compute, for every row, the average number of visits per day.

  Input: dataframe with a 'visits_by_day' column whose cells are strings
    holding a bracketed, comma-separated list of integers, e.g. "[3,0,12]".
  Output: list of floats, one per row of `data_input` and in the same order.
    Each value is sum(daily visits) / number of days listed in the cell.
    A cell that lists no numbers (e.g. "[]") yields 0.0.
  Raises: ValueError if a cell contains a token that is not an integer.

  The input dataframe is not modified.
  """
  average_list = []

  for raw_cell in data_input['visits_by_day']:
    # Strip the surrounding brackets with plain string methods; this avoids
    # the invalid "\[" / "\]" escapes of a non-raw regex string.
    inner = raw_cell.strip().strip('[]').strip()
    if not inner:
      # No days listed: report zero instead of failing on int('') or
      # dividing by zero.
      average_list.append(0.0)
      continue

    # int() tolerates whitespace around each number.
    daily_visits = [int(token) for token in inner.split(',')]
    average_list.append(sum(daily_visits) / len(daily_visits))

  return average_list

## Obtain average data and finalize file

In [None]:
# Store each row's average daily visit count in a new 'average_pop' column.
PRIOR_MONTH_DF['average_pop'] = get_daily_average_for_a_month(PRIOR_MONTH_DF)

# PRIOR_MONTH_DF.sort_values('average_pop', ascending=False).head(3)

In [None]:
# Two-column frame (POI name, average daily visits), highest averages first.
population_df = (
    PRIOR_MONTH_DF
    .loc[:, ['location_name', 'average_pop']]
    .sort_values(by='average_pop', ascending=False)
)

In [None]:
# Round every average up to the next whole number (values stay floats here).
population_df['average_pop'] = population_df['average_pop'].apply(np.ceil)

In [None]:
# Map location name -> rounded average.
# NOTE(review): rows that share a location_name collapse into one key and the
# last row seen wins; with the descending sort applied to population_df that
# is the smallest average for that name. Confirm this is intended.
population_dict_1 = {name: average for name, average in population_df.values}

In [None]:
# Same mapping with every value converted to a plain int, ready for JSON export.
population_dict_final = {
    name: int(average) for name, average in population_dict_1.items()
}

In [None]:
# Write the name -> population mapping to POI_population.json in the current
# working directory (set by the most recent os.chdir call).
# json.dump is used directly: assigning the serialized text to a variable
# called `json` would replace the imported module, so re-running this cell or
# any later json.* call would fail. The `with` block closes the file even if
# serialization raises.
with open('POI_population.json', "w") as population_file:
  json.dump(population_dict_final, population_file)

## If data is already available for the most recent month, pull it directly

In [None]:
# os.chdir('/content/drive/My Drive/safegraph_data/safegraph_monthly_data/')

# single_poi_export = {}
# POI = "Temple Park Convalescent Hospital"

# f = open('POI_population.json')
# data = json.load(f)

# for key, val in data.items():
#   if re.search(f'^{POI}', key):
#     single_poi_export[key] = val

In [None]:
# os.chdir('/content/drive/My Drive/safegraph_data/SINGLE_POI_SAMPLE')

# with open('POI_population_sample.json', 'w') as json_file:
#   json.dump(single_poi_export, json_file)


## Final files produced here

* POI_population.json
* la_poi.csv

In [None]:
# population_df['average_pop'].sum()