# Importing Any Necessary Packages 

In [1]:
import json 
from pprint import pprint

import pandas as pd
import matplotlib.pyplot as plt
import graphviz

import numpy as np
from pandas.io.json import json_normalize

import math

# Extraction of Data 

In [2]:
def extractDataInfo(filename):
    """Load a JSON Lines file and collect the top-level keys it uses.

    Every non-blank line of ``filename`` is parsed with ``json.loads``; each
    parsed value is expected to be a JSON object (a dict).

    Parameters
    ----------
    filename : str or path-like
        Path to a file holding one JSON document per line.

    Returns
    -------
    list
        A two-element list ``[data, uniqueAttributes]`` where ``data`` is the
        list of parsed records in file order and ``uniqueAttributes`` is the
        set of top-level keys seen across all records.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    data = []
    uniqueAttributes = set()
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which is not UTF-8 on every system.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a stray empty line at end of file).
            if not line.strip():
                continue
            dataLine = json.loads(line)
            data.append(dataLine)
            # Update the set directly rather than buffering every key of
            # every record in an intermediate list.
            uniqueAttributes.update(dataLine.keys())
    return [data, uniqueAttributes]

In [3]:
businessInfo = extractDataInfo('../dataset/business.json')  # businessInfo is a list: [parsed records, set of top-level keys]

In [4]:
# Working DataFrame built from the parsed records (one row per record).
businessDf = pd.DataFrame(businessInfo[0])

In [5]:
# Second DataFrame built from the same records, holding the original dataset.
# NOTE(review): nested dict/list values (e.g. 'attributes', 'hours') are
# presumably shared by reference with businessDf -- confirm before mutating
# them in place.
originalDf = pd.DataFrame(businessInfo[0])

# Basic Summary of Raw Business Data

In [6]:
# Show the first parsed record to inspect the raw structure, including the
# nested 'attributes' and 'hours' objects.
print(businessInfo[0][0])

{'business_id': 'YDf95gJZaq05wvo7hTQbbQ', 'name': 'Richmond Town Square', 'neighborhood': '', 'address': '691 Richmond Rd', 'city': 'Richmond Heights', 'state': 'OH', 'postal_code': '44143', 'latitude': 41.5417162, 'longitude': -81.4931165, 'stars': 2.0, 'review_count': 17, 'is_open': 1, 'attributes': {'RestaurantsPriceRange2': 2, 'BusinessParking': {'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}, 'BikeParking': True, 'WheelchairAccessible': True}, 'categories': ['Shopping', 'Shopping Centers'], 'hours': {'Monday': '10:00-21:00', 'Tuesday': '10:00-21:00', 'Friday': '10:00-21:00', 'Wednesday': '10:00-21:00', 'Thursday': '10:00-21:00', 'Sunday': '11:00-18:00', 'Saturday': '10:00-21:00'}}


In [7]:
# (number of rows, number of columns) of the business table.
businessDf.shape

(156639, 15)

In [8]:
# Column dtypes; nested JSON fields and strings are stored as generic 'object'.
businessDf.dtypes

address          object
attributes       object
business_id      object
categories       object
city             object
hours            object
is_open           int64
latitude        float64
longitude       float64
name             object
neighborhood     object
postal_code      object
review_count      int64
stars           float64
state            object
dtype: object

A brief description of what each variable contains (e.g. "The Tuition column contains information on the annual cost of tuition in $USD"):
* "business_id": string, 22 character unique string business id
* "name": string, the business's name
* "neighborhood": string, the neighborhood's name
* "address": string, the full address of the business
* "city": string, the city
* "state": string, 2 character state code, if applicable
* "postal_code": string, the postal code
* "latitude": float, latitude
* "longitude": float, longitude
* "stars": float, star rating, rounded to half-stars
* "review_count": integer, number of reviews
* "is_open": integer, 0 or 1 for closed or open, respectively
* "attributes": object, mapping business attributes to values. Note: some attribute values may themselves be objects
* "categories": an array of strings of business categories
* "hours": object, mapping each day of the week to its opening hours; hours use a 24-hour clock
* More description at https://www.yelp.com/dataset/documentation/json

In [9]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
businessDf.describe()

Unnamed: 0,is_open,latitude,longitude,review_count,stars
count,156639.0,156638.0,156638.0,156639.0,156639.0
mean,0.844375,38.585033,-92.856485,30.238159,3.647154
std,0.362501,5.399871,26.557741,96.486631,0.97764
min,0.0,-36.086009,-142.46665,3.0,1.0
25%,1.0,33.627161,-112.138207,4.0,3.0
50%,1.0,36.142381,-89.523198,9.0,3.5
75%,1.0,43.596845,-79.66876,23.0,4.5
max,1.0,89.999314,115.086769,6979.0,5.0


# Expanding DataFrame: Unnesting Jsons