In [None]:
import glob
import gzip
import tqdm.notebook as tq
import json
import pandas as pd
import os
import re
import numpy as np
print("reading CSV file...")

# reading and combining all of the gzip-compressed CSV files in the working directory
## (note: SafeGraph provided the dataset split across multiple .gz files)
ZIPFILES='*.gz'
filelist = glob.glob(ZIPFILES)
if not filelist:
    ## stop here with a clear message; an empty frame would only fail later
    ## with an opaque AttributeError when its columns are accessed
    raise FileNotFoundError("no files matching " + ZIPFILES + " found in " + os.getcwd())

## only the columns of interest are parsed, so unused columns never occupy memory
COLUMNS_OF_INTEREST = ["placekey","location_name","date_range_start","visitor_home_aggregation"]

frames = []
for gzfile in tq.tqdm(filelist):
    ## usecols yields columns in file order; re-selecting pins the column order
    frames.append(pd.read_csv(gzfile, usecols=COLUMNS_OF_INTEREST)[COLUMNS_OF_INTEREST])

## a single concat avoids re-copying the accumulated rows once per file;
## the list is reversed so the last file read comes first in the result
df_select = pd.concat(frames[::-1])
print("csv file reading completed")

## converting column contents to the form the later steps expect:
##   - "date_range_start": string form truncated to its first 7 characters
##     (presumably "YYYY-MM" - TODO confirm against the raw timestamps)
##   - "visitor_home_aggregation": JSON text parsed into a dictionary
##   - "location_name": stringified, punctuation removed, lower-cased
df_select["date_range_start"]=[str(i)[:7] for i in tq.tqdm(df_select.date_range_start)]
df_select["visitor_home_aggregation"]=[json.loads(i) for i in tq.tqdm(df_select.visitor_home_aggregation)]

## the pattern is compiled once and the three name clean-up steps run in a
## single pass over the column instead of three separate passes
NON_WORD_CHARS = re.compile(r'[^\w\s]')
df_select["location_name"]=[NON_WORD_CHARS.sub('', str(i)).lower() for i in tq.tqdm(df_select.location_name)]

# removing POIs that do not belong in this full-service restaurant selection
## the chains listed below are fast-food chains that were classified as
## full-service, so every row carrying one of these (normalised) names is dropped
list_of_unwanted=["moes southwest grill","culvers","bojangles","panera bread","marcos pizza","raising canes"]

print("Removing wrongly classified POIs")
is_unwanted = df_select['location_name'].isin(list_of_unwanted)
df_select = df_select.loc[~is_unwanted]
print("Wrongly classified POIs removed!")


## order the rows by date_range_start and renumber them 0..n-1,
## discarding the old index instead of keeping it as a column
df_select = df_select.sort_values(by=["date_range_start"]).reset_index(drop=True)
print("The total number of restaurant chain names selected is:",np.unique(df_select["location_name"].to_numpy()).size)
print("The number of restaurant locations selected is:",np.unique(df_select["placekey"].to_numpy()).size)


# scaling up the visitation data from census-tract level to county level
## This function sums census-tract visitor counts into county totals
def conv_to_county(dictionary):
    """Sum per-tract visitor counts into per-county totals.

    The county code is taken to be the first five characters of each key
    (for an 11-digit census-tract GEOID this is presumably the state+county
    FIPS prefix - TODO confirm). Keys shorter than five characters are used
    whole.

    Parameters
    ----------
    dictionary : dict
        Maps string area codes to visitor counts; each count may be any
        value that ``int()`` accepts.

    Returns
    -------
    dict
        Maps every distinct five-character key prefix to the integer sum of
        the counts of all keys sharing that prefix, in first-seen order.
    """
    county_totals = {}
    for tract, visitors in dictionary.items():
        county = tract[:5]
        # The dict itself records which counties were seen, so no separate
        # membership list (with its linear scan per key) is needed.
        county_totals[county] = county_totals.get(county, 0) + int(visitors)
    return county_totals

## This function applies conv_to_county to every row's tract-level visitor dictionary
def home_county(list_of_tract):
    """Convert each tract-level visitor dictionary to a county-level one.

    Parameters
    ----------
    list_of_tract : sequence of dict
        One dictionary per store/period row, as accepted by
        ``conv_to_county``. May be a list or a pandas Series.

    Returns
    -------
    list of dict
        The ``conv_to_county`` result for every element, in input order.
    """
    # Iterate over the values themselves rather than range(len(...)) plus
    # [i]: on a pandas Series, [i] is a label lookup and only matches the
    # i-th element when the index happens to be 0..n-1.
    return [conv_to_county(tract_counts) for tract_counts in tq.tqdm(list_of_tract)]
## adding a "home_county" column holding, for every row, the county-level
## version of that row's "visitor_home_aggregation" dictionary
print("converting from census-tract to county...")
tract_level_counts = df_select["visitor_home_aggregation"]
df_select["home_county"] = home_county(tract_level_counts)
print("converted from census-tract to county!")


# transforming the dataframe into long format, so that each row holds the visitors to one store from one county
def detailed_long_df(old_df):
    """Expand per-store county dictionaries into one row per store and county.

    Parameters
    ----------
    old_df : pandas.DataFrame
        Must provide the columns "location_name", "placekey",
        "date_range_start" and "home_county", where "home_county" holds a
        dictionary mapping county codes to visitor counts.

    Returns
    -------
    pandas.DataFrame
        Columns 'location', 'placekey', 'date_range_start', 'county' and
        'count', with one row per (input row, dictionary key). Keys that
        cannot be parsed as an integer (e.g. visitors from Canada) are
        emitted with county "NULL" and count 0 so the caller can filter
        them out.
    """
    location_list,placekey_list,date_range_start_list,county_list,count_list=[],[],[],[],[]
    # Read from the frame that was passed in, walking its columns in lockstep;
    # this does not depend on the frame's index labels.
    rows = zip(old_df["location_name"], old_df["placekey"],
               old_df["date_range_start"], old_df["home_county"])
    # zip has no length, so the progress bar is told the total explicitly
    for location_name, placekey, date_range_start, current_home_county in tq.tqdm(rows, total=len(old_df)):
        for key, visitors in current_home_county.items():
            try:
                int(key)  # only checks that the county code is numeric
            except ValueError:
                # Set both fields explicitly so the row never inherits the
                # county of the previously processed key.
                county, count = "NULL", 0
            else:
                county, count = str(key), int(visitors)
            location_list.append(location_name)
            placekey_list.append(placekey)
            date_range_start_list.append(date_range_start)
            county_list.append(county)
            count_list.append(count)
    df_detail = pd.DataFrame(
        {
            'location': location_list,
            'placekey': placekey_list,
            'date_range_start': date_range_start_list,
            'county': county_list,
            'count': count_list,
        },
        columns=['location','placekey','date_range_start','county','count'],
    )
    return df_detail

## building the long-format (store, county) table from df_select
print("transforming the dataframe...")
df_detail=detailed_long_df(df_select)

## Cleaning the data to prepare for aggregation
## rows whose county is the "NULL" placeholder (e.g. Canada visitors) are removed;
## .copy() makes the result an independent frame, so adding a column below
## does not write into a slice of the unfiltered frame
df_detail=df_detail[df_detail["county"]!="NULL"].copy()
## a scalar is broadcast to every row, so no per-row list is needed
df_detail["type"]="full_service_restaurant"
#df_detail.to_csv('full_service_neat.csv') # saving for safe-measure in case Jupyter collapses
print("dataframe transformed!")

# aggregating county-level visitor counts to full-service restaurants
## "month_count" is the sum of "count" for each (date_range_start, county, type) group
print("aggregating county level visitation...")
aggregated=(df_detail.groupby(['date_range_start','county','type']).agg(month_count=('count', 'sum')))
aggregated = aggregated.reset_index()
print("data aggregated!")
print("saving the CSV file...")

# saving csv file
aggregated.to_csv("full_service_restaurant_by_county.csv")
print("CSV file saved!")

reading CSV file...


  0%|          | 0/16 [00:00<?, ?it/s]

csv file reading completed


  0%|          | 0/16126887 [00:00<?, ?it/s]

  0%|          | 0/16126887 [00:00<?, ?it/s]

  0%|          | 0/16126887 [00:00<?, ?it/s]

  0%|          | 0/16126887 [00:00<?, ?it/s]

  0%|          | 0/16126887 [00:00<?, ?it/s]

Removing wrongly classified POIs
Wrongly classified POIs removed!
The total number of restaurant chain names selected is: 364289
The number of restaurant locations selected is: 537242
converting from census-tract to county...


  0%|          | 0/15976351 [00:00<?, ?it/s]

converted from census-tract to county!
transforming the dataframe...


  0%|          | 0/15976351 [00:00<?, ?it/s]

  0%|          | 0/117068135 [00:00<?, ?it/s]

dataframe transformed!
aggregating county level visitation...
