# Importing Any Necessary Packages 

In [1]:
import json 
from pprint import pprint

import pandas as pd
import matplotlib.pyplot as plt
import graphviz

import numpy as np
from pandas.io.json import json_normalize

import math

In [2]:
pd.set_option('display.max_columns', None)

# Extraction of Data 

In [3]:
def extractDataInfo(filename):
    data = []
    attributes = []
    returnValues = []
    with open(filename) as f:
        for line in f:
            dataLine = json.loads(line)
            data.append(dataLine)
            for key in dataLine.keys():
                attributes.append(key)
        uniqueAttributes = set(attributes)
        returnValues.append(data)
        returnValues.append(uniqueAttributes)
        return returnValues

In [4]:
businessInfo = extractDataInfo('../dataset/business.json') #businessInfo is a list

In [5]:
businessDf = pd.DataFrame.from_dict(businessInfo[0])

In [6]:
originalDf = pd.DataFrame.from_dict(businessInfo[0]) #the original dataset 

# Basic Summary of Raw Business Data

In [7]:
print(businessInfo[0][0])

{'business_id': 'YDf95gJZaq05wvo7hTQbbQ', 'name': 'Richmond Town Square', 'neighborhood': '', 'address': '691 Richmond Rd', 'city': 'Richmond Heights', 'state': 'OH', 'postal_code': '44143', 'latitude': 41.5417162, 'longitude': -81.4931165, 'stars': 2.0, 'review_count': 17, 'is_open': 1, 'attributes': {'RestaurantsPriceRange2': 2, 'BusinessParking': {'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}, 'BikeParking': True, 'WheelchairAccessible': True}, 'categories': ['Shopping', 'Shopping Centers'], 'hours': {'Monday': '10:00-21:00', 'Tuesday': '10:00-21:00', 'Friday': '10:00-21:00', 'Wednesday': '10:00-21:00', 'Thursday': '10:00-21:00', 'Sunday': '11:00-18:00', 'Saturday': '10:00-21:00'}}


In [8]:
businessDf.shape

(156639, 15)

In [9]:
businessDf.dtypes

address          object
attributes       object
business_id      object
categories       object
city             object
hours            object
is_open           int64
latitude        float64
longitude       float64
name             object
neighborhood     object
postal_code      object
review_count      int64
stars           float64
state            object
dtype: object

A brief description of what the variable contains (i.e. "The Tuition column contains information on the annual cost of tuition in $USD"): 
* "business_id":string, 22 character unique string business id
* "name":string, the business's name
* "neighborhood": string, the neighborhood's name
* "address": string, the full address of the business
* "city": string, the city
* "State": string, 2 character state code, if applicable
* "postal code": string, the postal code
* "latitude": float, latitude
* "longitude": float, longitude
* "stars": float, star rating, rounded to half-stars
* "review_count": interger, number of reviews
* "is_open": integer, 0 or 1 for closed or open, respectively
* "attributes": object, business attributes to values. note: some attribute values might be objects
* "categories": an array of strings of business categories
* "hours":an object of key day to value hours, hours are using a 24hr clock
* More description at https://www.yelp.com/dataset/documentation/json

In [10]:
businessDf.describe()

Unnamed: 0,is_open,latitude,longitude,review_count,stars
count,156639.0,156638.0,156638.0,156639.0,156639.0
mean,0.844375,38.585033,-92.856485,30.238159,3.647154
std,0.362501,5.399871,26.557741,96.486631,0.97764
min,0.0,-36.086009,-142.46665,3.0,1.0
25%,1.0,33.627161,-112.138207,4.0,3.0
50%,1.0,36.142381,-89.523198,9.0,3.5
75%,1.0,43.596845,-79.66876,23.0,4.5
max,1.0,89.999314,115.086769,6979.0,5.0


# Expanding DataFrame: Unnesting 
A lot of our data is nested within each dataframe, particularly, the 'attributes', 'hours', and 'categories' columns

## Part I: Unnesting JSON 
   We decided to unnest the jsons of the attributes and hours info because those could be significant factors  

In [11]:
def unnestJson(dataframe):
    data = list(dataframe)
    return json_normalize(data)

In [32]:
openHoursDf = unnestJson(businessDf['hours']) #The dataframe with unnested open hours info 
attributesDf = unnestJson(businessDf['attributes']) #The dataframe with unnested attributes info 

In [19]:
#merge two dataframes sidebyside by columns 
businessDf = pd.concat([businessDf, openHoursDf, attributesDf], axis=1) 

In [44]:
len(businessDf)

156639

## Part II: Unnesting Categories  
Since the # unique Categories is significantly lower than allCategories, it could be significant to group the categories

In [7]:
def findAllCategories():
    allCategories =[]
    for categoryList in businessDf['categories']:
        if(type(categoryList) != str):  
            for category in categoryList:
                allCategories.append(category)
        else:
            allCategories.append(categoryList)
    return allCategories

In [8]:
def findUniqueCategories(allCategoriesList):
    return set(allCategoriesList)

In [65]:
def categoriesStats(allCategoriesList):
    print('Length of all categories is:')
    print(len(allCategoriesList))
    print('Length of all unique categories is:')
    print(len(findUniqueCategories(allCategoriesList)))
    allCategoriesList.remove('Restaurants')
    allCategoriesSeries = pd.Series(allCategoriesList)
    print(allCategoriesSeries.value_counts())
    
    return allCategoriesSeries.value_counts()

In [66]:
originalAllCategories = findAllCategories()
len(originalAllCategories)

330283

In [67]:
statsInfo = categoriesStats(originalAllCategories)

Length of all categories is:
330283
Length of all unique categories is:
827
Restaurants                  51612
Shopping                     24261
Beauty & Spas                13139
Food                         11538
Home Services                11072
Health & Medical             10850
Automotive                    8457
Local Services                6667
Active Life                   6389
Event Planning & Services     4866
Hair Salons                   4817
Hotels & Travel               4723
Auto Repair                   4379
Doctors                       3859
Nightlife                     3830
Nail Salons                   3750
Real Estate                   3631
Fitness & Instruction         3420
Arts & Entertainment          3257
Professional Services         3235
Pets                          3039
Bars                          2753
Dentists                      2674
Hair Removal                  2530
Hotels                        2271
Skin Care                     2134
Pet Services  

In [114]:
def getTopCategories(topNum, frequencyTable):
    statsInfoDict = frequencyTable.to_dict()
    statsInfoList = sorted(statsInfoDict.items(), key=lambda x: x[1], reverse=True)
    
    topCategories =[]
    for i in range(topNum):
        topCategories.append(statsInfoList[i][0])
    
    return topCategories

In [37]:
def reGroup(df, categoryName):
    existList = []
    for categoryArr in df:
        exist = False
        if(type(categoryArr) != str):  
            for category in categoryArr:
                if (category == categoryName):
                    exist = True
        existList.append(exist)
    
    return existList 

In [38]:
def replaceCategory(df, columnName, replacement):
    categoriesDf = df[columnName]
    restaurantsExistList = reGroup(categoriesDf, replacement)
    restaurantsExistSeries = pd.Series(restaurantsExistList)
    categoriesDf.loc[restaurantsExistSeries] = replacement
    print('# of ' + replacement + " made:")
    print(len(categoriesDf.loc[categoriesDf == replacement]))

In [116]:
topCategoriesList = getTopCategories(20, statsInfo)
topCategoriesList

['Restaurants',
 'Shopping',
 'Beauty & Spas',
 'Food',
 'Home Services',
 'Health & Medical',
 'Automotive',
 'Local Services',
 'Active Life',
 'Event Planning & Services',
 'Hair Salons',
 'Hotels & Travel',
 'Auto Repair',
 'Doctors',
 'Nightlife',
 'Nail Salons',
 'Real Estate',
 'Fitness & Instruction',
 'Arts & Entertainment',
 'Professional Services']

In [63]:
def convertedCategories(df, columnName, dataType):
    isStringList =[]
    dataFrame = df[columnName]
    for observation in dataFrame:
        isString = False
        if(type(observation) == dataType):  
            isString = True
        isStringList.append(isString)
    
    isStringSeries = pd.Series(isStringList)
    print("# converted...")
    converted =len(dataFrame.loc[isStringSeries])
    print(converted)
    print("# unconverted..")
    print(len(df) - converted)

In [117]:
for topCategory in topCategoriesList:
    replaceCategory(businessDf, 'categories', topCategory)
    convertedCategories(businessDf, 'categories', str)
    print("--------------- conversion ended -----------------")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


# of Restaurants made:
51613
# converted...
120385
# unconverted..
36254
# of Shopping made:
24261
# converted...
120385
# unconverted..
36254
# of Beauty & Spas made:
13139
# converted...
120385
# unconverted..
36254
# of Food made:
11538
# converted...
120385
# unconverted..
36254
# of Home Services made:
11053
# converted...
120385
# unconverted..
36254
# of Health & Medical made:
8781
# converted...
120385
# unconverted..
36254
# of Automotive made:
8040
# converted...
128425
# unconverted..
28214
# of Local Services made:
4648
# converted...
133073
# unconverted..
23566
# of Active Life made:
5396
# converted...
138469
# unconverted..
18170
# of Event Planning & Services made:
4002
# converted...
142471
# unconverted..
14168
# of Hair Salons made:
0
# converted...
142471
# unconverted..
14168
# of Hotels & Travel made:
1700
# converted...
144171
# unconverted..
12468
# of Auto Repair made:
0
# converted...
144171
# unconverted..
12468
# of Doctors made:
0
# converted...
144171
# u