In [1]:
# loading packages
import glob
import gzip
import tqdm.notebook as tq
import json
import numpy as np
import pandas as pd

print("reading files")
# reading and combining all of the files 
## (note: safegraph provided the dataset in multiple .gz and .csv folders)
ZIPFILES='*.gz'
filelist = glob.glob(ZIPFILES)
df_select = pd.DataFrame()
for gzfile in tq.tqdm(filelist):
    if len(df_select)==0:
        df_select=pd.read_csv(gzfile)
        df_select=df_select[["placekey","location_name","street_address","date_range_start","visitor_home_aggregation"]]
        df_select = df_select[df_select['visitor_home_aggregation'].notna()]
    else:
        data = pd.read_csv(gzfile)
        data=data[["placekey","location_name","street_address","date_range_start","visitor_home_aggregation"]]
        data = data[data['visitor_home_aggregation'].notna()]
        df_select=pd.concat([data,df_select])
## reading raising canes which was manually extracted (it was wrongly classified by SafeGraph as a full-service restaurant)
data=pd.read_csv("raising_canes.csv")
data=data[["placekey","location_name","street_address","date_range_start","visitor_home_aggregation"]]
## adding raising canes to our dataset
df_select=pd.concat([data,df_select])
print("files read, converting columns to their appropriate format")


# Cleaning our data 
df_select=df_select.drop_duplicates()
df_select=df_select.reset_index()

## Converting our data columns into its appropriate data-types
### converting dates into monthly format
df_select["date_range_start"]=[str((str(i))[:7]) for i in tq.tqdm(df_select.date_range_start)]
### the visitor home aggregation is supposed to be a dictionary but is somehow read as a string - converting them into dictionary's
df_select["visitor_home_aggregation"]=[json.loads(i) for (i) in tq.tqdm(df_select.visitor_home_aggregation)]
### converting placekeys into strings (some are read as integers)
df_select["placekey"]=[str(i) for i in list(df_select.placekey)]


print("identifying the correctly classified fast-food restaurants...")


# inspecting all the stores and including stores that are relevant to our defintion of fast-food restaurants. 
import re
stores=df_select[["placekey","location_name"]]
stores=stores.drop_duplicates()
stores["location_name"]=[str(i) for i in tq.tqdm(stores["location_name"])]
## removing punctuation and special characters from the location names of our stores so it can be easier to filter them accordingly
stores["location_name"]=[(re.sub(r'[^\w\s]', '', i)) for i in tq.tqdm(stores["location_name"])] 
## lower spacing all of the location names so it can be easy to filter them by location name
stores["location_name"]=[i.lower() for i in tq.tqdm(stores["location_name"])]
stores["placekey"]=[str(i) for i in stores["placekey"]]
stores = stores.reset_index()
stores=stores.drop(["index"],axis=1)

## The list of our predefined fast-food restaurants. 
fast_food_list=["mcdonald","mcdonalds","innout burger","starbuck","starbucks","chickfila","taco bell","wendys",
                "burger king","dunkin","subway","dominos pizza","chipotle mexican grill","sonic",
               "panera bread","pizza hut","kfc","kentucky fried chicken","popeyes louisiana kitchen",
               "popeyes","arbys","little caesars","dairy queen","panda express","jack in the box","papa johns",
                "whataburger","wingstop","zaxbys","hardees","culvers","jimmy johns","raising canes","five guys",
               "jersey mikes","carls jr","bojangles","el pollo loco","del taco","rallys drivein restaurants",
               "checkers drivein restaurants","firehouse subs","krispy kreme", "krispy kreme doughnuts","papa murphys",
               "steak n shake","qdoba mexican grill","qdoba","marcos pizza","churchs chicken","tim hortons",
                "tropical smoothie café","tropical smoothie cafe","freddys frozen custard","mcalisters deli","baskin robbins","white castle",
               "moes southwest grill"]
## Here we extract the list of placekeys (or store locations) in which the brand name of the placekeys are that of our pre-defined fast-food restaurants. 
placekey_list,name_list=[],[]
for i in tq.tqdm(range(len(stores))):
    name=str(stores.location_name[i])
    current_placekey=str(stores.placekey[i])
    if name in fast_food_list:
        name_list.append(name)
        placekey_list.append(current_placekey)
    else:
        continue


print("Extracting correctly classified fast-food restaurants")

## Extracting correctly classified fast-food restaurants from our data.
placekey_list = [*set(placekey_list)]
placekey_list=[str(i) for i in placekey_list]
df_select=df_select.loc[df_select['placekey'].isin(placekey_list)] 
df_select = df_select.reset_index()

print("The data after extracting correctly classified fast-food restaurants looks like this: \n" ,df_select)
print("The total number of fast-food restaurant chain names selected is:",len(list(np.unique(np.array(df_select["location_name"])))))
print("The number of fast-food restaurant locations selected is:",len(list(np.unique(np.array(df_select["placekey"])))))

# scaling-up the visitation data from census tract to county level

## This function converts the census tract to county's
def conv_to_county(dictionary):
    new_dict={}
    in_the_system=[]
    for i in dictionary.keys():
        if str((str(i))[:5]) in in_the_system:
            old_value=int(new_dict[str((str(i))[:5])])
            to_be_added=int(dictionary[str(i)])
            new_value=int(old_value+to_be_added)
            new_dict[str((str(i))[:5])]=new_value
        else:
            new_dict[str((str(i))[:5])]=int(dictionary[i])
            in_the_system.append((str(str(i))[:5]))
    return(new_dict)

## This function compiles the visitations based on county's visitors
def home_county(list_of_tract):
    county_list=[]
    for i in tq.tqdm(range(len(list_of_tract))):
        current=list_of_tract[i]
        new=conv_to_county(current)
        county_list.append(new)
    return(county_list)

print("converting our visitation dataset from census-tract level to county-level...")
df_select["home_county"]=home_county(df_select["visitor_home_aggregation"])
print("converted from census-tract to county!")


# transforming the dataframe such we can now see visitors to each store from each county 
def detailed_long_df(old_df):
    location_list,placekey_list,date_range_start_list,county_list,count_list=[],[],[],[],[]
    for i in tq.tqdm(range(len(df_select))):
        location_name,placekey,date_range_start=(df_select["location_name"][i]),(df_select["placekey"][i]),(df_select["date_range_start"][i])
        current_home_county=(df_select["home_county"][i])
        for j in current_home_county.keys():
            ## removing visitors from Canada
            try:
                x=int(j) 
            except ValueError:
                j="NULL"
            if j=="NULL":
                count=0
            else:
                county=str(j)
                count=int(current_home_county[j])
            location_list.append(location_name)
            placekey_list.append(placekey)
            date_range_start_list.append(date_range_start)
            county_list.append(county)
            count_list.append(count)
    df_detail = pd.DataFrame(list(zip(location_list,placekey_list,date_range_start_list,county_list,count_list)), columns =['location','placekey','date_range_start','county','count'])
    return(df_detail)

## executing the function above
print("transforming the dataframe...")
df_detail=detailed_long_df(df_select)

## Cleaning the data to prepare for aggregation 
df_detail=df_detail[df_detail["county"].str.contains("NULL")==False] # removing Canada visitors
df_detail["type"]=["fast_food_restaurant" for i in tq.tqdm(range(len(df_detail)))]
print("dataframe transformed!")

# Aggregating the data to get the total number of visits from each county to all fast-food restaurants for each month. 
print("aggregating county level visitation...")
aggregated=(df_detail.groupby(['date_range_start','county','type']).agg(month_count=('count', 'sum')))
aggregated = aggregated.reset_index()
print("data aggregated!")
print("saving the CSV file...")

# saving csv file
aggregated.to_csv("fast_food_by_county.csv")
print("CSV file saved!")

reading files


  0%|          | 0/11 [00:00<?, ?it/s]

files read, converting columns to their appropriate format


  0%|          | 0/9279520 [00:00<?, ?it/s]

  0%|          | 0/9279520 [00:00<?, ?it/s]

identifying the correctly classified fast-food restaurants...


  0%|          | 0/289628 [00:00<?, ?it/s]

  0%|          | 0/289628 [00:00<?, ?it/s]

  0%|          | 0/289628 [00:00<?, ?it/s]

  0%|          | 0/289628 [00:00<?, ?it/s]

Extracting correctly classified fast-food restaurants
The data after extracting correctly classified fast-food restaurants looks like this: 
          level_0   index             placekey   location_name  \
0              0       0  223-222@8f2-d56-835  Raising Cane's   
1              1       1  223-224@646-hpx-hyv  Raising Cane's   
2              2       2  zzy-223@8dx-77q-sqz  Raising Cane's   
3              3       3  zzw-223@63v-b6v-sdv  Raising Cane's   
4              4       4  zzw-222@8t8-fn6-pvz  Raising Cane's   
...          ...     ...                  ...             ...   
5959032  9279513  989670  zzy-226@65z-fmn-q4v          Subway   
5959033  9279515  989672  zzy-227@8sz-dv5-vs5     Burger King   
5959034  9279516  989673  zzy-228@5pv-sw3-7t9          Subway   
5959035  9279518  989675  zzy-228@8t2-8k5-7h5          Subway   
5959036  9279519  989676  zzy-22b@627-zcy-w8v          Subway   

                    street_address date_range_start  \
0                  239

  0%|          | 0/5959037 [00:00<?, ?it/s]

converted from census-tract to county!
transforming the dataframe...


  0%|          | 0/5959037 [00:00<?, ?it/s]

  0%|          | 0/67683792 [00:00<?, ?it/s]

dataframe transformed!
aggregating county level visitation...
data aggregated!
saving the CSV file...
CSV file saved!
