In [1]:
import numpy as np
import pandas as pd
from collections import Counter 
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import scipy
import json
import math


# Function to load the JSON file and keep only businesses related to Restaurants and Food
def loadBusinessData(path):
    """Load the business records tagged as restaurants or food.

    The file at *path* is read as JSON Lines (one JSON object per line,
    UTF-8).  A record is kept when ``'Restaurants'`` or ``'Food'`` is found
    in its ``categories`` value.

    Args:
        path: Location of the JSON Lines file.

    Returns:
        list: The parsed records (dicts) that matched, in file order.
    """
    business = []

    with open(path, encoding='utf-8') as fin:
        for line in fin:
            # Tolerate blank lines (e.g. an extra newline at end of file)
            # instead of letting json.loads fail on them.
            if not line.strip():
                continue

            line_contents = json.loads(line)
            # .get() so a record without the key is skipped like a null one.
            categories = line_contents.get('categories')

            try:
                # NOTE(review): if categories is a comma-separated string this
                # is a substring test (so 'Food' also matches 'Fast Food');
                # if it is a list it is an exact element test.
                wanted = 'Restaurants' in categories or 'Food' in categories
            except TypeError:
                # categories is null (or some other non-container): no tags.
                continue

            if wanted:
                business.append(line_contents)

    return business



def columnCreationForRestaurants(business):
    """Turn a list of record dicts into a DataFrame plus column-major lists.

    The key order of the *second* record (``business[1]``) defines the
    column order.  Any record that carries a key missing from that
    reference has its keys printed.

    Args:
        business: List of dicts, one per business.  Needs at least two
            entries, and every record must contain every reference key.

    Returns:
        tuple: ``(frame, data)`` where ``data[i]`` lists the values of the
        i-th reference key across all records, and ``frame`` holds the
        first ten of those columns plus a ``'category'`` column taken from
        ``data[12]``.
    """
    column_names = business[1].keys()

    # Report records that have a key the reference record does not.
    for record in business:
        record_keys = record.keys()
        if any(key not in column_names for key in record_keys):
            print(record_keys)

    # One inner list per reference key, in reference order.
    data = [[record[column] for record in business] for column in column_names]

    # Only the first ten keys become DataFrame columns directly.
    leading_names = list(column_names)[0:10]
    frame = pd.DataFrame(dict(zip(leading_names, data)))

    # NOTE(review): position 12 is presumably the 'categories' key of the
    # input schema -- confirm against the dataset.
    frame['category'] = data[12]

    return (frame, data)
    

def get_all_column_names(data):
    """Return the set of distinct key names found across all entries of *data*.

    Each entry is passed to ``get_category_column_names``; the names it
    yields are merged into a single set.

    Args:
        data: Iterable of entries accepted by ``get_category_column_names``.

    Returns:
        set: Every name produced for any entry.
    """
    return {
        name
        for line_contents in data
        for name in get_category_column_names(line_contents)
    }



def get_category_column_names(line_contents):
    """Return the keys of *line_contents* as a list of strings.

    Args:
        line_contents: A mapping, or None when the record has no mapping.

    Returns:
        list: One formatted name per key, in the mapping's own order.  An
        empty list for None, so the return type is the same in every case.
    """
    if line_contents is None:
        return []
    return ['{0}'.format(key) for key in line_contents.keys()]
    

def addColumns(nested_columns, business, data):
    """Add one column per attribute name, filled from each record's attributes.

    Args:
        nested_columns: Iterable of attribute names.  Each becomes a column
            of *business*; a cell is NaN where the record has no stored
            value for that attribute.
        business: DataFrame to extend.  It is modified in place.  Rows are
            matched to attribute mappings by position.
        data: Column-major list of lists; ``data[11]`` is read as the
            per-record attribute mapping (a dict, or None when absent).

    Returns:
        The same *business* DataFrame, with the attribute columns added.

    Raises:
        KeyError: If a record has an attribute name that is not listed in
            *nested_columns*.
        IndexError: If a value has to be stored for a position beyond the
            last row of *business*.
    """
    row_count = len(business)

    # Collect values in plain lists and assign each column once.  Writing
    # through ``business[k][idx] = v`` is chained assignment: it targets a
    # temporary Series and does nothing under pandas copy-on-write.
    filled = {col: [np.nan] * row_count for col in nested_columns}

    for idx, attr in enumerate(data[11]):
        if attr is None:
            continue
        for k, v in attr.items():
            # A string containing '{' is a serialised nested mapping; skip
            # it.  Non-string values (e.g. booleans) are stored unchanged
            # rather than failing on the membership test.
            if isinstance(v, str) and '{' in v:
                continue
            filled[k][idx] = v

    for col, values in filled.items():
        business[col] = values

    return business
        
# function to filter restaurant business data only for California CA
def businessDataCleaning(business, state='CA', perc=5.0,
                         output_path='Clean_data_files/business.csv'):
    """Keep one state's rows, drop sparse columns, save and summarise.

    Args:
        business: DataFrame with a ``state`` column.
        state: Value of ``state`` to keep.
        perc: A column is kept only if it has at least
            ``int(perc / 100 * n_rows + 1)`` non-null values among the
            kept rows.
        output_path: CSV file the cleaned frame is written to.  Its parent
            directory is created when missing.

    Returns:
        The cleaned DataFrame, with a fresh 0..n-1 index.
    """
    import os  # local import: the module's import block does not provide os

    business = business[business['state'] == state]

    min_count = int((perc / 100) * business.shape[0] + 1)
    business = business.dropna(axis=1, thresh=min_count)
    business = business.reset_index(drop=True)

    # to_csv does not create directories, so make sure the target exists.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # The index is written too (it becomes the first, unnamed CSV column).
    business.to_csv(output_path)

    print("\n===================Printing the columns and their datatype of business dataset==============\n")
    # info() prints its report itself and returns None, so it is not
    # wrapped in print() (that would emit an extra "None" line).
    business.info()

    print("\n===================Printing the structure of business dataset rows X col====================\n")
    print(business.shape)

    print("\n================Printing the first 5 rows of the business data set=============\n")
    print(business.head())

    print('\n\n\n')

    return business


#function to get the category frequencies
def get_frequencies_and_add_to_dataset():
    """Add 0/1 indicator columns for the most frequent category tags.

    Reads ``Clean_data_files/business.csv``, splits every ``category``
    value on ``", "`` and counts the tags.  Of the 63 most common tags the
    first two are skipped (presumably the 'Restaurants' and 'Food' tags
    the data was filtered on -- confirm), and each remaining tag becomes a
    column holding 1.0 where the row carries that tag and 0.0 otherwise.
    The result is written to ``Clean_data_files/business_categorized.csv``
    and a short summary is printed.

    Returns:
        None.
    """
    business_new = pd.read_csv('Clean_data_files/business.csv')

    print(business_new.head())

    # Split each row once.  A missing category comes back from the CSV as
    # NaN (a float), which is treated as "no tags".
    tags_per_row = [
        value.split(", ") if isinstance(value, str) else []
        for value in business_new['category']
    ]

    # Tags are counted per occurrence, in row order, so ties in
    # most_common() keep first-seen order.
    tag_counts = Counter(tag for tags in tags_per_row for tag in tags)
    top_tags = [tag for tag, _ in tag_counts.most_common(63)[2:]]

    # Sets make the per-row membership test cheap; one assignment per tag
    # replaces a cell-by-cell fill.  Floats keep the 1.0/0.0 CSV format.
    tag_sets = [set(tags) for tags in tags_per_row]
    for tag in top_tags:
        business_new[tag] = [1.0 if tag in tags else 0.0 for tags in tag_sets]

    # 'Unnamed: 0' is the index column of the input CSV; tolerate input
    # that was written without one.
    business_new = business_new.drop(columns='Unnamed: 0', errors='ignore')
    business_new.to_csv('Clean_data_files/business_categorized.csv')
    print("\n===============Printing the structure of the categorized business dataset rows X col================\n")
    print(business_new.shape)

    print("\n=================Printing the first 5 rows of the new categorized business data=============\n")
    print(business_new.head())
    print('\n\n\n')
    

    
def main():
    """Run the whole pipeline: load, tabulate, expand attributes, clean, categorize."""
    # Yelp dataset JSON file, read from a local path.
    source_path = '../Raw_json_files/yelp_academic_dataset_business.json'

    records = loadBusinessData(source_path)
    frame, columns = columnCreationForRestaurants(records)

    # Position 11 of the column lists holds the per-record attribute mappings.
    attribute_names = get_all_column_names(columns[11])
    frame = addColumns(attribute_names, frame, columns)

    # The cleaned frame is written to disk by this call; the next step
    # reads it back from there, so the return value is not needed here.
    businessDataCleaning(frame)

    get_frequencies_and_add_to_dataset()
    
# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()
    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1596 entries, 0 to 1595
Data columns (total 39 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   business_id                 1596 non-null   object 
 1   name                        1596 non-null   object 
 2   address                     1596 non-null   object 
 3   city                        1596 non-null   object 
 4   state                       1596 non-null   object 
 5   postal_code                 1596 non-null   object 
 6   latitude                    1596 non-null   float64
 7   longitude                   1596 non-null   float64
 8   stars                       1596 non-null   float64
 9   review_count                1596 non-null   int64  
 10  category                    1596 non-null   object 
 11  BusinessAcceptsBitcoin      286 non-null    object 
 12  RestaurantsDelivery         1235 non-null   object 
 13  BusinessAcceptsCreditCards  150