### Yelp Data Summary
The data collection stores one csv file for each zip code and it is placed in the "Output" folder.  The naming convention of these files is "YelpData_<SearchZipCode>.csv".  The code does the following:
    1 Gets a list of the files within the "Output" folder
    2 Loops through the files that start with "YelpData" and does the following:
      2.1 Read the file into Pandas DataFrame
      2.2 Summarize data; get total records
      2.3 Summarize data; get total records the different rating
      2.4 Summarize data; get total records for the different prices
    3 All of the summarized data are stored in DataFrame
    4 Summarized DataFrame is stored on disk; name "SummarizedYelpData.csv"
    5 Create DataFrame with all of the zip code data and save to disk; named "AllZipcodeYelpData.csv"
    

In [1]:
#-- Import Libraries
import pandas as pd
import os

In [66]:
#-- Get List of Files on Disk
outputPath = os.path.join(".", "Output")

files = os.listdir(outputPath)


#-- Load into Single DataFrame
yelpData_df = None
hasFirstYelpData = False
emptyZipcodes = []
counter = 0

for file in files:
    
    if (file.startswith("YelpData_") == True):
        
        counter += 1
        
        
        #- Get DataFrame
        filePath = os.path.join(".", "Output", file)
        
        zipCodeYelpData_df = pd.read_csv(filePath)
        

        #- Check for Businesses
        # Large number of zipcodes do not have any businesses that satisfy the filter, store the zipcode
        # and add to summary dataframe with zeroes
        if (zipCodeYelpData_df.shape[0] == 0):
            if (len(file) == 17):
                emptyZipcodes.append(file[9:13])
            else:
                emptyZipcodes.append(file[9:14])
        
        else:
            
            #- Create One DataFrame
            if (hasFirstYelpData == False):
                # First DataFrame, just set variable
                hasFirstYelpData = True
                yelpData_df = zipCodeYelpData_df

            else:
                # Merge DataFrames together; have same schema
                yelpData_df = pd.concat([zipCodeYelpData_df, yelpData_df])
        

#-- Summary of Data Gathering Information
print(f"total zip code files {counter}")    
print(f"total zip codes with no businesses {len(emptyZipcodes)}")  


#-- Preview DataFrame
yelpData_df.head()

total zip code files 227
total with no businesses 190


Unnamed: 0.1,Unnamed: 0,ID,Name,Zipcode,Latitude,Longitude,Price,Rating
0,0,5Q1r2PLfZKAsSh9Sc0hL4g,Los Tinos Mexican Restaurant,98118,47.53674,-122.26992,$$,4.5
1,1,S1KsT_zM29mlVAgk1WaNzg,Chew-e-Core,98118,47.542515,-122.268997,,5.0
2,2,vcUN3Fg7bBgKkVdeTMUv1g,Café Avole,98118,47.542606,-122.272621,$,4.5
3,3,IsWM7BrvfZgZfZnC22_64w,Island Soul Rum Bar & Soul Shack,98118,47.55812,-122.28544,$$,4.0
4,4,fN0cjzRI7_yQtjKHZGEuLw,Redwing Cafe,98118,47.51956,-122.26139,$,4.5


In [81]:
def calculateSummaryForZipcode(zipCode, yelpDataForZip_df, results):
    ''' Updates the dictionary with the summary information for the zipcode provided
    
    Accepts : zipCode (str) zipcode for the dataframe provided
              yelpDataForZip_df (DataFrame) only contains data for the zipcode
              results (dictionary) contains the results for the new columns
                'Zipcode' (str) name of the zipcode
                'Total' (num) total number of businesses
                'Price_1' (num) total with price of '$'
                'Price_2' (num) total with price of '$$'
                'Price_3' (num) total with price of '$$$'
                'Price_4' (num) total with price of '$$$$'
                'Price_0' (num) total with price of 'NA'
                'Rating_10' (num) total with range of 1.0
                'Rating_15' (num) total with range of 1.5
                'Rating_20' (num) total with range of 2.0
                'Rating_25' (num) total with range of 2.5
                'Rating_30' (num) total with range of 3.0
                'Rating_35' (num) total with range of 3.5
                'Rating_40' (num) total with range of 4.0
                'Rating_45' (num) total with range of 4.5
                'Rating_50' (num) total with range of 5.0
                
    Returns : results (dictionary) same object that is passed in except with another row of data          
    '''
    
    #- General Info
    results['Zipcode'].append(zipCode)
    results['Total'].append(yelpDataForZip_df.shape[0])
    
    
    #- Price
    results['Price_1'].append(yelpDataForZip_df[yelpDataForZip_df['Price'] == '$'].shape[0])
    results['Price_2'].append(yelpDataForZip_df[yelpDataForZip_df['Price'] == '$$'].shape[0])
    results['Price_3'].append(yelpDataForZip_df[yelpDataForZip_df['Price'] == '$$$'].shape[0])
    results['Price_4'].append(yelpDataForZip_df[yelpDataForZip_df['Price'] == '$$$$'].shape[0])
    results['Price_0'].append(yelpDataForZip_df[yelpDataForZip_df['Price'] == 'NA'].shape[0])
    
    
    #- Rating
    results['Rating_10'].append(yelpDataForZip_df[yelpDataForZip_df['Rating'] == 1.0].shape[0])
    results['Rating_15'].append(yelpDataForZip_df[yelpDataForZip_df['Rating'] == 1.5].shape[0])
    results['Rating_20'].append(yelpDataForZip_df[yelpDataForZip_df['Rating'] == 2.0].shape[0])
    results['Rating_25'].append(yelpDataForZip_df[yelpDataForZip_df['Rating'] == 2.5].shape[0])
    results['Rating_30'].append(yelpDataForZip_df[yelpDataForZip_df['Rating'] == 3.0].shape[0])
    results['Rating_35'].append(yelpDataForZip_df[yelpDataForZip_df['Rating'] == 3.5].shape[0])
    results['Rating_40'].append(yelpDataForZip_df[yelpDataForZip_df['Rating'] == 4.0].shape[0])
    results['Rating_45'].append(yelpDataForZip_df[yelpDataForZip_df['Rating'] == 4.5].shape[0])
    results['Rating_50'].append(yelpDataForZip_df[yelpDataForZip_df['Rating'] == 5.0].shape[0])
    
    
    return results
    

In [85]:
#-- Summarize Data Based on Zipcode

#- Create Results Container
results = {
    'Zipcode': [],
    'Total' : [],
    'Price_1': [],
    'Price_2': [],
    'Price_3': [],
    'Price_4': [],
    'Price_0': [],
    'Rating_10': [],
    'Rating_15': [],
    'Rating_20': [],
    'Rating_25': [],
    'Rating_30': [],
    'Rating_35': [],
    'Rating_40': [],
    'Rating_45': [],
    'Rating_50': [],
    }

#- Group by Zipcode
zipcodeYelpData_GroupBy = yelpData_df.groupby('Zipcode')


#- Summarize for each Zipcode
for groupName, groupedYelpData_df in zipcodeYelpData_GroupBy:
    
    results = calculateSummaryForZipcode(groupName, groupedYelpData_df, results)
    

#- Add Zero Zipcodes
for zipCode in emptyZipcodes:
    results['Zipcode'].append(zipCode)
    results['Total'].append(0)
    results['Price_1'].append(0)
    results['Price_2'].append(0)
    results['Price_3'].append(0)
    results['Price_4'].append(0)
    results['Price_0'].append(0)
    results['Rating_10'].append(0)
    results['Rating_15'].append(0)
    results['Rating_20'].append(0)
    results['Rating_25'].append(0)
    results['Rating_30'].append(0)
    results['Rating_35'].append(0)
    results['Rating_40'].append(0)
    results['Rating_45'].append(0)
    results['Rating_50'].append(0)
    

#- Create DataFrame
summarizedYelpData_df = pd.DataFrame(results)


#- Preview DataFrame
summarizedYelpData_df.head()

Unnamed: 0,Zipcode,Total,Price_1,Price_2,Price_3,Price_4,Price_0,Rating_10,Rating_15,Rating_20,Rating_25,Rating_30,Rating_35,Rating_40,Rating_45,Rating_50
0,10605,9,2,5,1,0,0,0,0,0,2,1,2,2,2,0
1,11206,54,15,29,3,0,0,0,0,1,2,7,10,16,14,4
2,11364,7,0,7,0,0,0,0,0,1,1,2,2,1,0,0
3,11789,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
4,15701,6,1,5,0,0,0,0,0,0,0,2,2,1,1,0


In [86]:
#-- Save to Disk

#- Complete Yelp Data
completeYelpDataPath = os.path.join('.', 'Output', 'AllZipcodeYelpData.csv')

yelpData_df.to_csv(completeYelpDataPath)


#- Summarized Yelp Data
summarizeYelpDataPath = os.path.join('.', 'Output', 'SummarizedYelpData.csv')

summarizedYelpData_df.to_csv(summarizeYelpDataPath)


#- Missing Zipcodes
missingZipcodesPath = os.path.join('.', 'Output', 'MissingZipcodesYelpData.csv')

missingZipcodes = {
    'Zipcodes': emptyZipcodes
    }

missingZipcodes_df = pd.DataFrame(missingZipcodes)

missingZipcodes_df.to_csv(missingZipcodesPath)

print('Completed saving dataframes to disk')

Completed saving dataframes to disk
