In [1]:
# loading packages
print("Reading Files...")
import glob
import gzip
import tqdm.notebook as tq
import json
import numpy as np
import pandas as pd
import re

# reading and combining all of the files
## (note: safegraph provided the dataset in multiple .gz and .csv folders)
ZIPFILES='*.gz'
filelist = glob.glob(ZIPFILES)
# Read every archive first and concatenate once. Calling pd.concat inside the
# loop copies all previously loaded rows again for each additional file.
frames = [pd.read_csv(gzfile) for gzfile in tq.tqdm(filelist)]
# Later files go first, which is the row order the incremental
# concat([data, df]) produced. When the glob matches nothing, df stays an
# empty DataFrame, as it did before.
df = pd.concat(frames[::-1]) if frames else pd.DataFrame()
print("File read!")
        
# Keep only the fields used downstream and order the rows by start date.
columns_of_interest = [
    "placekey", "parent_placekey", "location_name", "street_address", "city",
    "region", "brands", "date_range_start", "poi_cbg",
    "visitor_home_aggregation", "distance_from_home",
]
df_select = df[columns_of_interest].sort_values(by="date_range_start")

## Give each column its working type: "visitor_home_aggregation" is parsed
## from JSON text into dictionaries, "placekey" is forced to str, and
## "date_range_start" is cut to its first seven characters
## (presumably "YYYY-MM" -- confirm against the raw data).
raw_aggregations = list(df_select["visitor_home_aggregation"])
df_select["visitor_home_aggregation"] = list(map(json.loads, tq.tqdm(raw_aggregations)))
# reset_index() without drop=True keeps the previous index in an "index" column.
df_select = df_select.reset_index()
df_select["placekey"] = list(map(str, tq.tqdm(df_select["placekey"])))
df_select["date_range_start"] = [str(stamp)[:7] for stamp in tq.tqdm(df_select.date_range_start)]


# Collect the distinct (placekey, location_name) pairs so that POIs wrongly
# classified as convenience stores can be recognised from their names.
print("identifying wrongly classified grocery stores...")
stores = df_select[["placekey", "location_name"]].drop_duplicates()
## The names are normalised in three passes (one progress bar each):
##   1. strip every character that is neither a word character nor whitespace
##   2. lower-case the result
##   3. split it on whitespace into a list of words
name_passes = (
    lambda name: re.sub(r'[^\w\s]', '', name),
    str.lower,
    str.split,
)
for transform in name_passes:
    stores["location_name"] = list(map(transform, tq.tqdm(stores["location_name"])))
stores = stores.reset_index()

# removing stores that have been wrongly misclassified
# identifying placekeys of misclassified convenience-stores

# terminologies that are clearly not associated with convenience stores.
# Built once, as a set: previously this list was re-created for every store
# and scanned linearly for every word of every name.
unwanted_terms = frozenset([
    "bar", "bars", "deli", "delis", "sandwich", "sandwiches", "sandwichs",
    "bakery", "donut", "donuts", "bakerys", "bakeries", "cafe", "cafes",
    "coffee", "coffees", "pizza", "pizzas", "bagel", "bagels",
])

black_list = []          # flagged placekeys, in first-seen order
already_listed = set()   # same keys as black_list, for O(1) duplicate checks
store_rows = zip(stores["location_name"], stores["placekey"])
for name_words, raw_placekey in tq.tqdm(store_rows, total=len(stores)):
    store_key = str(raw_placekey)
    # A placekey can occur under several names; record it only once.
    if store_key in already_listed:
        continue
    # name_words is the list of words of the cleaned location name; one
    # unwanted word is enough to flag the store.
    if not unwanted_terms.isdisjoint(name_words):
        already_listed.add(store_key)
        black_list.append(store_key)
print("length of list of incorrectly mislabelled convenience stores is: ", len(black_list))

## Drop every visit record whose placekey was flagged in black_list.
print("Removing incorrectly classified grocery stores...")
keep_rows = ~df_select['placekey'].isin(black_list)
# Renumber the surviving rows without adding a helper column, and discard the
# "index" column left behind by the earlier reset_index().
df_select = df_select[keep_rows].reset_index(drop=True).drop(columns=['index'])
chain_names = np.unique(np.array(df_select["location_name"]))
print("The total number of convenient store chain names selected is:", len(chain_names))
store_keys = np.unique(np.array(df_select["placekey"]))
print("The number of convenient store locations selected is:", len(store_keys))



# scaling-up the visitation data from census tract to county level
## This function converts the census tract to county's
print("Scaling up visitations from census-tract level to county-level")
def conv_to_county(dictionary):
    """Sum a per-tract visitor tally into per-county totals.

    The county code of an entry is the first five characters of its key
    (presumably the state + county FIPS prefix of the tract id -- confirm
    against the data dictionary). Keys that share a prefix have their values
    added together.

    Args:
        dictionary: mapping of area code (sliceable, normally str) to a
            visitor count. Each count is converted with ``int()`` before it
            is added.

    Returns:
        A new dict mapping the five-character prefix (as str) to the int
        total, with keys in order of first appearance. The input is not
        modified.
    """
    new_dict = {}
    for tract, visitors in dictionary.items():
        county = str(tract[:5])
        # dict.get gives a constant-time "seen before?" check; no separate
        # list of already-seen prefixes has to be scanned.
        new_dict[county] = new_dict.get(county, 0) + int(visitors)
    return new_dict

## This function compiles the visitations based on county's visitors
def home_county(list_of_tract):
    """Convert every per-tract visitor dictionary into a per-county one.

    Args:
        list_of_tract: sized iterable (list or pandas Series) whose items
            are the dictionaries accepted by ``conv_to_county``.

    Returns:
        A list holding the ``conv_to_county`` result for each item, in
        iteration order.
    """
    # Iterate the values themselves. Indexing with range(len(...)) performs
    # label lookups on a pandas Series, which fails or picks the wrong rows
    # whenever the Series index is not exactly 0..n-1.
    return [conv_to_county(current) for current in tq.tqdm(list_of_tract)]

df_select["home_county"]=home_county(df_select["visitor_home_aggregation"])

# # transforming the dataframe such we can now see visitors to each store from each county
def detailed_long_df(old_df):
    """Expand store rows into one row per (store record, home county).

    Args:
        old_df: DataFrame with the columns "location_name", "placekey",
            "date_range_start" and "home_county", where each "home_county"
            cell is a dict mapping a county code to a visitor count.

    Returns:
        A DataFrame with columns 'location', 'placekey', 'date_range_start',
        'county' and 'count', one row per key of every "home_county" dict.
        Keys that cannot be parsed as an integer (the original comment
        attributes these to visitors from Canada) are emitted with county
        "NULL" and count 0 so the caller can filter them out.
    """
    rows = []
    # Read from the frame that was passed in (not a module-level global) and
    # walk the columns positionally, so any row index works.
    records = zip(
        old_df["location_name"],
        old_df["placekey"],
        old_df["date_range_start"],
        old_df["home_county"],
    )
    for location_name, placekey, date_range_start, county_counts in tq.tqdm(records, total=len(old_df)):
        for county_key, visitors in county_counts.items():
            try:
                int(county_key)
            except ValueError:
                # Non-numeric code: tag the row explicitly. Before, `county`
                # kept the value of the previous iteration (or was unbound on
                # the very first key), so these rows were never marked "NULL".
                county, count = "NULL", 0
            else:
                county, count = str(county_key), int(visitors)
            rows.append((location_name, placekey, date_range_start, county, count))
    return pd.DataFrame(rows, columns=['location', 'placekey', 'date_range_start', 'county', 'count'])

## executing the function above
df_detail=detailed_long_df(df_select)

## Cleaning the data to prepare for aggregation
# Drop rows whose county contains "NULL" (removing Canada visitors).
# na=True drops a missing county as well, as the former `== False` comparison
# did. .copy() detaches the result from the unfiltered frame so the column
# assignment below is not a write to a slice.
df_detail = df_detail[~df_detail["county"].str.contains("NULL", na=True)].copy()
# A scalar is broadcast to every row; building a list with one string per row
# in a Python loop is unnecessary.
df_detail["type"] = "convenient store"

# aggregating county_level visitor stats to convenient stores
print("Aggregating data...")
aggregated = df_detail.groupby(['date_range_start', 'county', 'type']).agg(month_count=('count', 'sum'))

# saving csv file
# reset_index() turns the three group keys back into ordinary columns.
aggregated = aggregated.reset_index()
print("Data Aggregated, saving file...")
aggregated.to_csv("convenient_store_aggregated_by_county.csv", index=False)
print("File Saved!")

Reading Files...


  0%|          | 0/2 [00:00<?, ?it/s]

File read!


  0%|          | 0/2092038 [00:00<?, ?it/s]

  0%|          | 0/2092038 [00:00<?, ?it/s]

  0%|          | 0/2092038 [00:00<?, ?it/s]

identifying wrongly classified grocery stores...


  0%|          | 0/64535 [00:00<?, ?it/s]

  0%|          | 0/64535 [00:00<?, ?it/s]

  0%|          | 0/64535 [00:00<?, ?it/s]

  0%|          | 0/64535 [00:00<?, ?it/s]

length of list of incorrectly mislabelled grocery stores is:  5879
Removing incorrectly classified grocery stores...
The total number of convenient store chain names selected is: 24989
The number of convenient store locations selected is: 58121
Scaling up visitations from census-tract level to county-level


  0%|          | 0/1920776 [00:00<?, ?it/s]

  0%|          | 0/1920776 [00:00<?, ?it/s]

  0%|          | 0/15751976 [00:00<?, ?it/s]

Aggregating data...
Data Aggregated, saving file...
File Saved!
