In [1]:
import numpy as np
import pandas as pd

In [2]:
import json
import re
from tqdm import tqdm_notebook as tqdm

In [3]:
import sys
# Put '../tools/' at index 1 of the module search path so modules in that
# directory can be imported; index 0 is left untouched.
# NOTE(review): presumably this is where `yelp_fusion` lives -- confirm.
sys.path.insert(1, '../tools/')
import os

In [4]:
from yelp_fusion import *

In [5]:
# os.listdir returns entries in arbitrary order; sort them so that everything
# derived from this listing (the per-kind file lists and the row order of the
# concatenated tables) is reproducible across runs and machines.
file_names = sorted(os.listdir('../data/raw_businesses_data/'))

In [6]:
def is_json_file(in_str):
    """ classify a file name by its suffix

    Parameters: in_str (str) - the file name to classify

    Returns: (str/None) - 'businesses_info' if the name ends with
                          'businesses_info.json', 'reviews' if it ends with
                          'reviews.json', otherwise None
    """
    # Literal suffix comparison: the previous regex left '.' unescaped, so it
    # also accepted names such as 'reviewsXjson'.
    if in_str.endswith('businesses_info.json'):
        return 'businesses_info'
    if in_str.endswith('reviews.json'):
        return 'reviews'
    return None

In [7]:
# Partition the directory listing by the kind reported by is_json_file;
# names that are neither kind end up in neither list.
file_businesses_info = list(filter(lambda name: is_json_file(name) == 'businesses_info', file_names))
file_reviews = list(filter(lambda name: is_json_file(name) == 'reviews', file_names))

# All Businesses

In [8]:
def load_json_to_df_businesses(file_name, data_dir='../data/raw_businesses_data/'):
    """ load the 'businesses' records of one raw JSON file into a DataFrame

    Parameters: file_name (str) - name of the JSON file inside data_dir
                data_dir (str) - directory holding the raw files, defaults to
                                 the raw businesses data folder

    Returns: (DataFrame) - built from the value under the 'businesses' key

    Raises: KeyError if the file has no 'businesses' key
    """
    path = os.path.join(data_dir, file_name)
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, which is not UTF-8 everywhere.
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    return pd.DataFrame(data['businesses'])

In [9]:
# One DataFrame per businesses-info file, loaded behind a tqdm progress bar.
all_businesses_info = list(map(load_json_to_df_businesses, tqdm(file_businesses_info)))

HBox(children=(IntProgress(value=0), HTML(value='')))




In [183]:
# Stack the per-file frames into one table with a fresh 0..n-1 index,
# keeping columns in first-seen order (sort=False).
all_businesses_df = pd.concat(all_businesses_info, sort=False, ignore_index=True)

In [184]:
def balance_list(in_list, n):
    """ balance a list to length n by padding it with None

    Parameters: in_list (tuple/list/array like) - input list
                n (int) - the length to balance

    Returns: (list) - a new list; padded with None up to length n when the
                      input is shorter, otherwise a plain copy of the input
                      (longer inputs are not truncated)
    """
    my_list = list(in_list)
    missing = n - len(my_list)
    if missing > 0:
        # Pad the list copy, not the raw argument: `tuple + list` raises
        # TypeError and `ndarray + list` is not concatenation.
        return my_list + [None] * missing
    return my_list

In [185]:
def multi_len_list_to_df(in_arr, col_name):
    """ balance a multi-length list of lists into a DataFrame

    Parameters: in_arr (list/array like) - a list of lists with different lengths
                col_name (str) - the column name prefix; columns are named
                                 col_name1 .. col_nameN, where N is the length
                                 of the longest inner list

    Returns: df (DataFrame) - one row per inner list, shorter rows padded
                              with None; empty (no columns) for empty input
    """
    # Materialise each row once so the input can be any iterable of iterables.
    rows = [list(i) for i in in_arr]
    # Measure the rows directly. np.unique(in_arr) flattens the input when all
    # rows have equal length (so len() measured strings, not rows) and raises
    # on ragged lists in recent NumPy versions.
    max_len = max((len(row) for row in rows), default=0)
    col = ['{}{}'.format(col_name, i) for i in range(1, max_len + 1)]
    df = pd.DataFrame([balance_list(row, max_len) for row in rows], columns=col)

    return df

## Transactions

In [186]:
# Spread each business's transactions list across transactions1..N columns.
transactions = multi_len_list_to_df(
    in_arr=all_businesses_df['transactions'],
    col_name='transactions',
)

## Coordinates

In [187]:
# Underlying array of the 'coordinates' column.
raw_coordinates = all_businesses_df.loc[:, 'coordinates'].values

In [188]:
# Pull the two coordinate fields out of each 'coordinates' entry into their
# own columns, in the order given by col_cor.
col_cor = ['latitude','longitude']
coordinates = pd.DataFrame(
    [tuple(point[key] for key in col_cor) for point in all_businesses_df['coordinates'].values],
    columns=col_cor,
)

## Categories

In [189]:
# The 'categories' column; each entry is iterated as a sequence of
# alias/title records by the cells that follow.
raw_categories = all_businesses_df.loc[:, 'categories']

In [190]:
# Grab the categories entry stored under index 10.
# NOTE(review): nothing later in the visible notebook reads this global -- the
# comprehensions that follow bind their own `in_list` -- so this looks like a
# leftover from exploration; confirm before removing.
in_list = raw_categories[10]

In [191]:
# For every business, collect the 'alias' and the 'title' of each of its
# category records into parallel lists of lists.
alias = [
    [category['alias'] for category in business_categories]
    for business_categories in raw_categories
]
title = [
    [category['title'] for category in business_categories]
    for business_categories in raw_categories
]

In [192]:
# Pad the ragged alias/title lists into rectangular frames (alias1..N, title1..N).
alias_df = multi_len_list_to_df(in_arr=alias, col_name='alias')
title_df = multi_len_list_to_df(in_arr=title, col_name='title')

In [193]:
# Alias columns first, then title columns, side by side.
categories = pd.concat([alias_df, title_df], axis='columns')

## Location

In [194]:
# The 'location' column; each entry is indexed by field name in the cells below.
raw_location = all_businesses_df.loc[:, 'location']

In [195]:
# Join each location's 'display_address' parts into one comma-separated string.
display_address = list(
    map(lambda entry: ', '.join(entry['display_address']), raw_location)
)

In [196]:
# One column per location field listed in col_location, followed by the
# pre-joined display address as the last column.
col_location = ['address1','address2','address3','city','zip_code','country','state']
location = pd.DataFrame(
    {field: [entry[field] for entry in raw_location] for field in col_location}
).assign(display_address=display_address)

In [197]:
def fix_empty_str(in_str):
    """Return None when in_str equals the empty string, otherwise in_str unchanged."""
    if in_str == '':
        return None
    return in_str

# Rewrite every column of `location` with empty strings replaced by None.
for i in location.columns.tolist():
    location[i] = [fix_empty_str(cell) for cell in location[i]]

## Combine

In [198]:
# Nested columns that are expanded into flat frames above and dropped from
# the combined table.
problem_col = [
    'categories',
    'coordinates',
    'transactions',
    'location',
]

In [200]:
# Attach the flattened frames to the original table column-wise, then drop
# the nested source columns. This rebinds `all_businesses_df`, so the cell is
# not idempotent: re-running it needs the table rebuilt from the raw frames.
combine_df_name = [all_businesses_df, coordinates, categories, transactions, location]
all_businesses_df = pd.concat(combine_df_name, axis='columns').drop(problem_col, axis='columns')

In [206]:
# Replace empty strings with None in every column of the combined table.
for i in all_businesses_df.columns.tolist():
    all_businesses_df[i] = [fix_empty_str(cell) for cell in all_businesses_df[i]]

In [208]:
# Export is disabled; uncomment the next line to write the combined businesses table to CSV.
# all_businesses_df.to_csv('../data/all_businesses_info.csv', index=False)

# All Reviews

In [211]:
def load_json_to_df_reviews(file_name, data_dir='../data/raw_businesses_data/'):
    """ load the 'reviews' entry of one raw JSON file via dict_to_df

    Parameters: file_name (str) - name of the JSON file inside data_dir
                data_dir (str) - directory holding the raw files, defaults to
                                 the raw businesses data folder

    Returns: whatever dict_to_df returns for the value under the 'reviews' key
             (the caller concatenates these results with pd.concat)

    Raises: KeyError if the file has no 'reviews' key
    """
    path = os.path.join(data_dir, file_name)
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, which is not UTF-8 everywhere.
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # NOTE(review): dict_to_df is not defined in this notebook; presumably it
    # comes from the `yelp_fusion` star import -- confirm.
    return dict_to_df(data['reviews'])

In [212]:
# One loaded result per reviews file, behind a tqdm progress bar.
all_reviews = list(map(load_json_to_df_reviews, tqdm(file_reviews)))

HBox(children=(IntProgress(value=0), HTML(value='')))




In [213]:
# Stack the per-file review frames into one table with a fresh 0..n-1 index,
# keeping columns in first-seen order (sort=False).
all_reviews_df = pd.concat(all_reviews, sort=False, ignore_index=True)

In [214]:
# Expand the 'user' entries into their own frame and prefix the generic
# field names so they cannot be confused with review-level columns.
rename_col = {
    'id': 'user_id',
    'profile_url': 'user_profile_url',
    'image_url': 'user_image_url',
    'name': 'user_name',
}
users_df = pd.DataFrame(all_reviews_df['user'].tolist()).rename(columns=rename_col)

In [215]:
# Attach the expanded user columns and drop the nested 'user' column. This
# rebinds `all_reviews_df`, so the cell is not idempotent on re-run.
all_reviews_df = pd.concat([all_reviews_df, users_df], axis='columns').drop('user', axis='columns')

In [217]:
# Export is disabled; uncomment the next line to write the combined reviews table to CSV.
# all_reviews_df.to_csv('../data/all_reviews.csv', index=False)