### Yelp Data Summary
The data collection stores one csv file for each zip code and it is placed in the "Output" folder.  The naming convention of these files is "YelpData_<SearchZipCode>.csv".  The code does the following:
    1 Gets a list of the files within the "Output" folder
    2 Loops through the files that start with "YelpData" and does the following:
      2.1 Read the file into Pandas DataFrame
      2.2 Summarize data; get total records
      2.3 Summarize data; get total records the different rating
      2.4 Summarize data; get total records for the different prices
    3 All of the summarized data are stored in DataFrame
    4 Summarized DataFrame is stored on disk; name "SummarizedYelpData.csv"
    5 Create DataFrame with all of the zip code data and save to disk; named "AllZipcodeYelpData.csv"
    

In [2]:
#-- Import Libraries
import pandas as pd
import os
import math

In [12]:
#-- Get List of Files on Disk
searchFolder = "Output_2"
outputPath = os.path.join(".", searchFolder)

files = os.listdir(outputPath)


#-- Load into Single DataFrame
yelpData_df = None
hasFirstYelpData = False
emptyZipcodes = []
counter = 0

for file in files:
    
    if (file.startswith("YelpData_") == True):
        
        counter += 1
        
        
        #- Get Zipcode
        zipcodeValue = ''
        
        if (len(file) == 17):
            zipcodeValue = file[9:13]
        else:
            zipcodeValue = file[9:14]
        
        
        #- Get DataFrame: Gulten Free
        filePath = os.path.join(".", searchFolder, file)
        
        zipCodeYelpData_df = pd.read_csv(filePath)
        
        zipCodeYelpData_df['IsGlutenFree'] = 1
        
        
        #- Get DataFrame: All Restaurants
        filePathAll = os.path.join(".", searchFolder, f"YelpDataAll_{zipcodeValue}.csv")
        
        zipcodeYelpDataAll_df = pd.read_csv(filePathAll)
        
        
        zipcodeYelpDataAll_df['IsGlutenFree'] = 0
        
        
        #- Merge both Dataframes
        zipCodeYelpData_df = pd.concat([zipCodeYelpData_df, zipcodeYelpDataAll_df])
        
        
        #- Check for Businesses
        # Large number of zipcodes do not have any businesses that satisfy the filter, store the zipcode
        # and add to summary dataframe with zeroes
        if (zipCodeYelpData_df.shape[0] == 0):
            emptyZipcodes.append(zipcodeValue)
            
        else:
            
            #- Create One DataFrame
            if (hasFirstYelpData == False):
                # First DataFrame, just set variable
                hasFirstYelpData = True
                yelpData_df = zipCodeYelpData_df

            else:
                # Merge DataFrames together; have same schema
                yelpData_df = pd.concat([zipCodeYelpData_df, yelpData_df])
        

#-- Summary of Data Gathering Information
print(f"total zip code files {counter}")    
print(f"total zip codes with no businesses {len(emptyZipcodes)}")  


#-- Preview DataFrame
yelpData_df.head()

total zip code files 12
total zip codes with no businesses 8


Unnamed: 0.1,Unnamed: 0,ID,Name,Zipcode,Latitude,Longitude,Price,Rating,IsGlutenFree
0,0,1txVrGkHxmx4dpeF60--wg,Silver Gulch Brewing & Bottling,99502,61.173853,-149.982815,$$,3.0,1
1,1,itP59kI4e9smEs6LXJZsrQ,Humpy's Great Alaskan Alehouse,99502,61.172754,-149.983787,$$,3.0,1
2,2,gBIvK58yPn-L_9yrpf66zA,Sushi On The Fly,99502,61.173658,-149.981177,$$,4.0,1
3,3,fhKgQ9TCVy28egaP32TTVg,AdoboGrill N Jeepney,99502,61.142296,-149.950333,$$,3.5,1
4,4,kjUtoPOnTY6iHMmXJOrnDw,Pork and Pickle,99502,61.173859,-149.982322,$$,2.0,1


In [21]:
def calculateSummaryForZipcode(zipCode, yelpDataForZip_df, results):
    ''' Updates the dictionary with the summary information for the zipcode provided
    
    Accepts : zipCode (str) zipcode for the dataframe provided
              yelpDataForZip_df (DataFrame) only contains data for the zipcode
              results (dictionary) contains the results for the new columns
                'Zipcode' (str) name of the zipcode
                'GF_Total' (num) total number of businesses
                'GF_Price_1' (num) total with price of '$'
                'GF_Price_2' (num) total with price of '$$'
                'GF_Price_3' (num) total with price of '$$$'
                'GF_Price_4' (num) total with price of '$$$$'
                'GF_Price_0' (num) total with price of 'NA'
                'GF_Rating_10' (num) total with range of 1.0
                'GF_Rating_15' (num) total with range of 1.5
                'GF_Rating_20' (num) total with range of 2.0
                'GF_Rating_25' (num) total with range of 2.5
                'GF_Rating_30' (num) total with range of 3.0
                'GF_Rating_35' (num) total with range of 3.5
                'GF_Rating_40' (num) total with range of 4.0
                'GF_Rating_45' (num) total with range of 4.5
                'GF_Rating_50' (num) total with range of 5.0
                'GF_Price_Mean' (num) average for price
                'GF_Price_Std' (num) standard deviation for price
                'GF_Rating_Mean' (num) average for rating
                'GF_Rating_Std' (num) standard deviation for rating
                'ALL_Total' (num) total number of businesses
                'ALL_Price_1' (num) total with price of '$'
                'ALL_Price_2' (num) total with price of '$$'
                'ALL_Price_3' (num) total with price of '$$$'
                'ALL_Price_4' (num) total with price of '$$$$'
                'ALL_Price_0' (num) total with price of 'NA'
                'ALL_Rating_10' (num) total with range of 1.0
                'ALL_Rating_15' (num) total with range of 1.5
                'ALL_Rating_20' (num) total with range of 2.0
                'ALL_Rating_25' (num) total with range of 2.5
                'ALL_Rating_30' (num) total with range of 3.0
                'ALL_Rating_35' (num) total with range of 3.5
                'ALL_Rating_40' (num) total with range of 4.0
                'ALL_Rating_45' (num) total with range of 4.5
                'ALL_Rating_50' (num) total with range of 5.0
                'ALL_Price_Mean' (num) average for price
                'ALL_Price_Std' (num) standard deviation for price
                'ALL_Rating_Mean' (num) average for rating
                'ALL_Rating_Std' (num) standard deviation for rating
                               
    Returns : results (dictionary) same object that is passed in except with another row of data          
    '''
    
    #- General Info
    results['Zipcode'].append(zipCode)
    
    
    #- Group by IsGlutenFree
    hasGlutenFree = False
    
    grouped = yelpDataForZip_df.groupby('IsGlutenFree')
    
    for groupName, group_df in grouped:
        
        #- Determine Group
        searchPrefix = "ALL_"
        
        if (groupName == 0):
            searchPrefix = "GF_"
            hasGlutenFree = True
        
        
        #- Total
        results[f'{searchPrefix}Total'].append(yelpDataForZip_df.shape[0])
    
    
        #- Price
        results[f'{searchPrefix}Price_1'].append(yelpDataForZip_df[yelpDataForZip_df['Price'] == '$'].shape[0])
        results[f'{searchPrefix}Price_2'].append(yelpDataForZip_df[yelpDataForZip_df['Price'] == '$$'].shape[0])
        results[f'{searchPrefix}Price_3'].append(yelpDataForZip_df[yelpDataForZip_df['Price'] == '$$$'].shape[0])
        results[f'{searchPrefix}Price_4'].append(yelpDataForZip_df[yelpDataForZip_df['Price'] == '$$$$'].shape[0])
        results[f'{searchPrefix}Price_0'].append(yelpDataForZip_df[yelpDataForZip_df['Price'] == 'NA'].shape[0])
    
    
        #- Rating
        results[f'{searchPrefix}Rating_10'].append(yelpDataForZip_df[yelpDataForZip_df['Rating'] == 1.0].shape[0])
        results[f'{searchPrefix}Rating_15'].append(yelpDataForZip_df[yelpDataForZip_df['Rating'] == 1.5].shape[0])
        results[f'{searchPrefix}Rating_20'].append(yelpDataForZip_df[yelpDataForZip_df['Rating'] == 2.0].shape[0])
        results[f'{searchPrefix}Rating_25'].append(yelpDataForZip_df[yelpDataForZip_df['Rating'] == 2.5].shape[0])
        results[f'{searchPrefix}Rating_30'].append(yelpDataForZip_df[yelpDataForZip_df['Rating'] == 3.0].shape[0])
        results[f'{searchPrefix}Rating_35'].append(yelpDataForZip_df[yelpDataForZip_df['Rating'] == 3.5].shape[0])
        results[f'{searchPrefix}Rating_40'].append(yelpDataForZip_df[yelpDataForZip_df['Rating'] == 4.0].shape[0])
        results[f'{searchPrefix}Rating_45'].append(yelpDataForZip_df[yelpDataForZip_df['Rating'] == 4.5].shape[0])
        results[f'{searchPrefix}Rating_50'].append(yelpDataForZip_df[yelpDataForZip_df['Rating'] == 5.0].shape[0])
    
    
        #- Averages
        results[f'{searchPrefix}Price_Mean'].append(yelpDataForZip_df['PriceNum'].mean())
        results[f'{searchPrefix}Rating_Mean'].append(yelpDataForZip_df['Rating'].mean())


        #- Standard Deviation
        results[f'{searchPrefix}Price_Std'].append(yelpDataForZip_df['PriceNum'].std())
        results[f'{searchPrefix}Rating_Std'].append(yelpDataForZip_df['Rating'].std())
    
    
    #- No Gluten Free
    if (hasGlutenFree == False):
        results['GF_Total'].append(0)
        results['GF_Price_1'].append(0)
        results['GF_Price_2'].append(0)
        results['GF_Price_3'].append(0)
        results['GF_Price_4'].append(0)
        results['GF_Price_0'].append(0)
        results['GF_Rating_10'].append(0)
        results['GF_Rating_15'].append(0)
        results['GF_Rating_20'].append(0)
        results['GF_Rating_25'].append(0)
        results['GF_Rating_30'].append(0)
        results['GF_Rating_35'].append(0)
        results['GF_Rating_40'].append(0)
        results['GF_Rating_45'].append(0)
        results['GF_Rating_50'].append(0)
        results['GF_Price_Mean'].append(0)
        results['GF_Rating_Mean'].append(0)
        results['GF_Price_Std'].append(0)
        results['GF_Rating_Std'].append(0)

    
    return results
    

In [4]:
def calculatePrice(row):
    ''' Converts the Yelp text of dollar signs into numeric value
    
    Accepts : row individual row from DataFrame; has "Price" column 
    
    Returns : (num) numeric value that is converted from text value
    '''
    
    value = 0.0
    
    if (row['Price'] == '$'):
        value = 1
        
    elif (row['Price'] == '$$'):
        value = 2
        
    elif (row['Price'] == '$$$'):
        value = 3
        
    elif (row['Price'] == '$$$$'):
        value = 4
        
    else:
        value = math.nan
        
    
    return value
    

Unnamed: 0.1,Unnamed: 0,ID,Name,Zipcode,Latitude,Longitude,Price,Rating,IsGlutenFree
0,0,1txVrGkHxmx4dpeF60--wg,Silver Gulch Brewing & Bottling,99502,61.173853,-149.982815,$$,3.0,1
1,1,itP59kI4e9smEs6LXJZsrQ,Humpy's Great Alaskan Alehouse,99502,61.172754,-149.983787,$$,3.0,1
2,2,gBIvK58yPn-L_9yrpf66zA,Sushi On The Fly,99502,61.173658,-149.981177,$$,4.0,1
3,3,fhKgQ9TCVy28egaP32TTVg,AdoboGrill N Jeepney,99502,61.142296,-149.950333,$$,3.5,1
4,4,kjUtoPOnTY6iHMmXJOrnDw,Pork and Pickle,99502,61.173859,-149.982322,$$,2.0,1


In [25]:
#-- Summarize Data Based on Zipcode

#- Prepare DataFrame: Numeric Price
yelpData_df['PriceNum'] = yelpData_df.apply(lambda row: calculatePrice(row), axis=1)


#- Create Results Container
results = {
    'Zipcode': [],
    'GF_Total' : [],
    'GF_Price_1': [],
    'GF_Price_2': [],
    'GF_Price_3': [],
    'GF_Price_4': [],
    'GF_Price_0': [],
    'GF_Rating_10': [],
    'GF_Rating_15': [],
    'GF_Rating_20': [],
    'GF_Rating_25': [],
    'GF_Rating_30': [],
    'GF_Rating_35': [],
    'GF_Rating_40': [],
    'GF_Rating_45': [],
    'GF_Rating_50': [],
    'GF_Price_Mean': [],
    'GF_Rating_Mean': [],
    'GF_Price_Std': [],
    'GF_Rating_Std': [],
    'ALL_Total' : [],
    'ALL_Price_1': [],
    'ALL_Price_2': [],
    'ALL_Price_3': [],
    'ALL_Price_4': [],
    'ALL_Price_0': [],
    'ALL_Rating_10': [],
    'ALL_Rating_15': [],
    'ALL_Rating_20': [],
    'ALL_Rating_25': [],
    'ALL_Rating_30': [],
    'ALL_Rating_35': [],
    'ALL_Rating_40': [],
    'ALL_Rating_45': [],
    'ALL_Rating_50': [],
    'ALL_Price_Mean': [],
    'ALL_Rating_Mean': [],
    'ALL_Price_Std': [],
    'ALL_Rating_Std': [],
    }

#- Group by Zipcode
zipcodeYelpData_GroupBy = yelpData_df.groupby('Zipcode')


#- Summarize for each Zipcode
for groupName, groupedYelpData_df in zipcodeYelpData_GroupBy:
    results = calculateSummaryForZipcode(groupName, groupedYelpData_df, results)
      
    
#- Add Zero Zipcodes
for zipCode in emptyZipcodes:
    results['Zipcode'].append(zipCode)
    results['GF_Total'].append(0)
    results['GF_Price_1'].append(0)
    results['GF_Price_2'].append(0)
    results['GF_Price_3'].append(0)
    results['GF_Price_4'].append(0)
    results['GF_Price_0'].append(0)
    results['GF_Rating_10'].append(0)
    results['GF_Rating_15'].append(0)
    results['GF_Rating_20'].append(0)
    results['GF_Rating_25'].append(0)
    results['GF_Rating_30'].append(0)
    results['GF_Rating_35'].append(0)
    results['GF_Rating_40'].append(0)
    results['GF_Rating_45'].append(0)
    results['GF_Rating_50'].append(0)
    results['GF_Price_Mean'].append(0)
    results['GF_Rating_Mean'].append(0)
    results['GF_Price_Std'].append(0)
    results['GF_Rating_Std'].append(0)
    results['ALL_Total'].append(0)
    results['ALL_Price_1'].append(0)
    results['ALL_Price_2'].append(0)
    results['ALL_Price_3'].append(0)
    results['ALL_Price_4'].append(0)
    results['ALL_Price_0'].append(0)
    results['ALL_Rating_10'].append(0)
    results['ALL_Rating_15'].append(0)
    results['ALL_Rating_20'].append(0)
    results['ALL_Rating_25'].append(0)
    results['ALL_Rating_30'].append(0)
    results['ALL_Rating_35'].append(0)
    results['ALL_Rating_40'].append(0)
    results['ALL_Rating_45'].append(0)
    results['ALL_Rating_50'].append(0)
    results['ALL_Price_Mean'].append(0)
    results['ALL_Rating_Mean'].append(0)
    results['ALL_Price_Std'].append(0)
    results['ALL_Rating_Std'].append(0)

    
#- Create DataFrame
summarizedYelpData_df = pd.DataFrame(results)


#- Preview DataFrame
pd.set_option('display.max_columns', 500)
summarizedYelpData_df.head()

Unnamed: 0,Zipcode,GF_Total,GF_Price_1,GF_Price_2,GF_Price_3,GF_Price_4,GF_Price_0,GF_Rating_10,GF_Rating_15,GF_Rating_20,GF_Rating_25,GF_Rating_30,GF_Rating_35,GF_Rating_40,GF_Rating_45,GF_Rating_50,GF_Price_Mean,GF_Rating_Mean,GF_Price_Std,GF_Rating_Std,ALL_Total,ALL_Price_1,ALL_Price_2,ALL_Price_3,ALL_Price_4,ALL_Price_0,ALL_Rating_10,ALL_Rating_15,ALL_Rating_20,ALL_Rating_25,ALL_Rating_30,ALL_Rating_35,ALL_Rating_40,ALL_Rating_45,ALL_Rating_50,ALL_Price_Mean,ALL_Rating_Mean,ALL_Price_Std,ALL_Rating_Std
0,20064,6,2,2,0,0,0,0,0,2,0,2,2,0,0,0,1.5,2.833333,0.57735,0.68313,6,2,2,0,0,0,0,0,2,0,2,2,0,0,0,1.5,2.833333,0.57735,0.68313
1,44875,32,8,6,0,0,0,2,0,0,0,0,4,8,12,6,1.428571,4.125,0.513553,0.941858,32,8,6,0,0,0,2,0,0,0,0,4,8,12,6,1.428571,4.125,0.513553,0.941858
2,97442,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,,4.0,,0.0,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,,4.0,,0.0
3,99502,20,2,16,0,0,0,0,0,6,4,4,2,2,0,2,1.888889,2.95,0.323381,0.958343,20,2,16,0,0,0,0,0,6,4,4,2,2,0,2,1.888889,2.95,0.323381,0.958343
4,76036,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0


In [7]:
#-- Save to Disk

#- Complete Yelp Data
completeYelpDataPath = os.path.join('.', 'Output_2', 'AllZipcodeYelpData.csv')

yelpData_df.to_csv(completeYelpDataPath)


#- Summarized Yelp Data
summarizeYelpDataPath = os.path.join('.', 'Output_2', 'SummarizedYelpData.csv')

summarizedYelpData_df.to_csv(summarizeYelpDataPath)


#- Missing Zipcodes
missingZipcodesPath = os.path.join('.', 'Output_2', 'MissingZipcodesYelpData.csv')

missingZipcodes = {
    'Zipcodes': emptyZipcodes
    }

missingZipcodes_df = pd.DataFrame(missingZipcodes)

missingZipcodes_df.to_csv(missingZipcodesPath)

print('Completed saving dataframes to disk')

Completed saving dataframes to disk
