In [1]:
import pandas as pd


#test = open("data/sun city/business.json")
#tested = pd.read_json(test, lines = True)

In [2]:
def load_business(database):
    """
    Convert a business.json file to a dataframe

    database: path to a JSON-lines file (one JSON object per line)

    Returns a dataframe with one row per line of the file.
    Raises whatever open() raises when the path cannot be opened
    (e.g. FileNotFoundError).
    """
    # The context manager closes the file as soon as pandas has read it;
    # previously the handle was left open until garbage collection.
    # JSON is read as UTF-8 instead of the platform default encoding.
    with open(database, encoding="utf-8") as db_open:
        business = pd.read_json(db_open, lines=True)

    return business

In [3]:
def remove_low_stars_business(business, stars):
    """
    Drop every business whose 'stars' value is below the given threshold.

    The dataframe passed in is modified in place and the very same object
    is handed back to the caller.
    """
    is_disliked = (business['stars'] < stars).to_numpy()
    business.drop(index=business.index[is_disliked], inplace=True)
    return business

In [4]:
def remove_few_ratings_business(business, number):
    """
    Drop every business whose 'review_count' is below the given number.

    The dataframe passed in is modified in place and the very same object
    is handed back to the caller.
    """
    is_under_reviewed = (business['review_count'] < number).to_numpy()
    business.drop(index=business.index[is_under_reviewed], inplace=True)
    return business

In [5]:
def remove_irrelevant_columns_business(business):
    """
    Return a new dataframe restricted to the columns used downstream.

    The result always has exactly these columns, in this order; a column
    that is absent from the input appears filled with missing values.
    """
    wanted_columns = ['attributes', 'business_id', 'categories', 'stars', 'review_count']
    return business.reindex(columns=wanted_columns)

In [6]:
def prep_business(business, min_reviews=5, min_stars=2):
    """
    Make a business.json file into a dataframe with only the desired data

    business: path to the business.json file
    min_reviews: businesses whose review_count is below this value are removed
                 (default 5, the value that used to be hard-coded)
    min_stars: businesses whose stars value is below this value are removed
               (default 2, the value that used to be hard-coded)

    Returns a dataframe with the columns kept by
    remove_irrelevant_columns_business.
    """
    loaded = load_business(business)
    # Filter on review volume first, then on rating, as before.
    enough_reviews = remove_few_ratings_business(loaded, min_reviews)
    well_rated = remove_low_stars_business(enough_reviews, min_stars)

    return remove_irrelevant_columns_business(well_rated)
    

# Example run: build the filtered business dataframe from the "sun city"
# business.json file and display its first rows.
exemple_business = prep_business("data/sun city/business.json")    
exemple_business.head()

Unnamed: 0,attributes,business_id,categories,stars,review_count
0,"{'RestaurantsTakeOut': 'True', 'RestaurantsRes...",ol2r325YnfuHcq7yeO3vdg,"Sandwiches, Fast Food, Restaurants",2.0,13
3,"{'BusinessAcceptsCreditCards': 'True', 'Busine...",rIhu4bA2CmSicOSJmTpGug,"Restaurants, Food, Bakeries",4.0,143
4,{'BusinessAcceptsCreditCards': 'True'},2y2d1wJrZAhHI2J7o2ndtg,"Venues & Event Spaces, Local Services, Event P...",4.5,6
6,"{'WiFi': 'u'no'', 'RestaurantsTakeOut': 'True'...",8EIZAb0knT2u_z2gIxllXw,"Restaurants, Mexican, Fast Food",2.5,43
7,"{'BusinessAcceptsCreditCards': 'True', 'Busine...",P8AV927sZLwWzbTtEY-apA,"Contractors, Home Services, Home & Garden, Sho...",5.0,5


In [7]:
def load_review(database):
    """
    Convert a review.json file to a dataframe

    database: path to a JSON-lines file (one JSON object per line)

    Returns a dataframe with one row per line of the file.
    Raises whatever open() raises when the path cannot be opened
    (e.g. FileNotFoundError).
    """
    # The context manager closes the file as soon as pandas has read it;
    # previously the handle was left open until garbage collection.
    # JSON is read as UTF-8 instead of the platform default encoding.
    with open(database, encoding="utf-8") as db_open:
        review = pd.read_json(db_open, lines=True)

    return review

#load_review("data/sun city/review.json")

In [8]:
def remove_irrelevant_columns_review(review):
    """
    Return a new dataframe restricted to the review columns used downstream.

    The result always has exactly these columns, in this order; a column
    that is absent from the input appears filled with missing values.
    """
    wanted_columns = ['business_id', 'review_id', 'stars', 'user_id']
    return review.reindex(columns=wanted_columns)

# Example run: load the "sun city" review.json file, keep only the review
# columns selected above, and display the first rows.
show_me = remove_irrelevant_columns_review(load_review("data/sun city/review.json"))
show_me.head()

Unnamed: 0,business_id,review_id,stars,user_id
0,PNzir9TtJAD7U41GwR98-w,IufXUZcwGabRNyRvnFJNzg,5,DAIpUGIsY71noX0wNuc27w
1,PNzir9TtJAD7U41GwR98-w,tVENWPb-IK36kWtzdSawuQ,1,oDexYEBOZEPBGJ5_e48lZw
2,TL-D_z9E1eKJQqSioemFfw,F-juQLpB-Z5iUypfgNMvrg,5,yTZC-agO3gknzsL1A7LOIQ
3,rIhu4bA2CmSicOSJmTpGug,wndb4QQ5vWF40z84iWPFvQ,3,Z1wf19FVzvR57O0T-CdBYw
4,S9BSFX03TBqAHFF1M4c08g,UXok8HepAp9Uh1NEDvboDg,5,HX4IwDr-uvlXVE21lXbrjg


In [9]:
def user_review_matrix(user):
    """
    Create a dataframe with user_id as the index and the number of user reviews as the feature

    user: path to the user.json file (one JSON object per line)

    Returns a dataframe indexed by user_id with the single column
    'review_count'.
    """
    # Read the file inside a context manager so the handle is closed
    # right away instead of leaking until garbage collection.
    with open(user, encoding="utf-8") as db_open:
        user_db = pd.read_json(db_open, lines=True)

    # Index by user_id, then keep only the review count column.
    user_review_nr = user_db.set_index('user_id').reindex(columns=['review_count'])

    return user_review_nr

# Example run: per-user review counts from the "sun city" user.json file,
# showing the first rows.
show_user_review_count = user_review_matrix("data/sun city/user.json")
show_user_review_count.head()

Unnamed: 0_level_0,review_count
user_id,Unnamed: 1_level_1
MM4RJAeH6yuaN8oZDSt0RA,361
UG4EKu13JRwzRix6ESINdg,1083
HOKngwcMBbb8Sa-jl4ZzJw,127
6iNK8f0Y10Uy-bZjkGYU5Q,63
8SSaCgmvsztEOg2CqeXxnw,509


In [10]:
def business_review_matrix(business):
    """
    Create a dataframe with business_id as the index and the number of reviews as the feature

    business: path to the business.json file (one JSON object per line)

    Returns a dataframe indexed by business_id with the single column
    'review_count'.
    """
    # Read the file inside a context manager so the handle is closed
    # right away instead of leaking until garbage collection.
    with open(business, encoding="utf-8") as db_open:
        business_db = pd.read_json(db_open, lines=True)

    # Index by business_id, then keep only the review count column.
    business_review_nr = business_db.set_index('business_id').reindex(columns=['review_count'])

    return business_review_nr

# Example run: per-business review counts from the "sun city" business.json
# file, showing the first rows.
show_business_review_count = business_review_matrix("data/sun city/business.json")
show_business_review_count.head()

Unnamed: 0_level_0,review_count
business_id,Unnamed: 1_level_1
ol2r325YnfuHcq7yeO3vdg,13
GmIUJmlwf3fJxFm79EZNgw,3
O4TYqnhpXpyrgJBkucaeaw,25
rIhu4bA2CmSicOSJmTpGug,143
2y2d1wJrZAhHI2J7o2ndtg,6


In [11]:
def business_attributes_matrix(business):
    """
    Create a dataframe with business_id as the index and the attributes of the business as the feature

    business: path to the business.json file (one JSON object per line)

    Returns a dataframe indexed by business_id with the single column
    'attributes'.
    """
    # Read the file inside a context manager so the handle is closed
    # right away instead of leaking until garbage collection.
    with open(business, encoding="utf-8") as db_open:
        business_db = pd.read_json(db_open, lines=True)

    # Index by business_id, then keep only the attributes column.
    business_attributes = business_db.set_index('business_id').reindex(columns=['attributes'])

    return business_attributes

# Example run: per-business attributes from the "sun city" business.json
# file, showing the first rows.
show_business_attributes = business_attributes_matrix("data/sun city/business.json")
show_business_attributes.head()    

Unnamed: 0_level_0,attributes
business_id,Unnamed: 1_level_1
ol2r325YnfuHcq7yeO3vdg,"{'RestaurantsTakeOut': 'True', 'RestaurantsRes..."
GmIUJmlwf3fJxFm79EZNgw,"{'BikeParking': 'True', 'RestaurantsPriceRange..."
O4TYqnhpXpyrgJBkucaeaw,"{'WiFi': ''no'', 'RestaurantsReservations': 'F..."
rIhu4bA2CmSicOSJmTpGug,"{'BusinessAcceptsCreditCards': 'True', 'Busine..."
2y2d1wJrZAhHI2J7o2ndtg,{'BusinessAcceptsCreditCards': 'True'}


In [12]:
def business_categories_matrix(business):
    """
    Create a dataframe with business_id as the index and the categories of the business as the feature

    business: path to the business.json file (one JSON object per line)

    Returns a dataframe indexed by business_id with the single column
    'categories'.
    """
    # Read the file inside a context manager so the handle is closed
    # right away instead of leaking until garbage collection.
    with open(business, encoding="utf-8") as db_open:
        business_db = pd.read_json(db_open, lines=True)

    # Index by business_id, then keep only the categories column.
    business_categories = business_db.set_index('business_id').reindex(columns=['categories'])

    return business_categories

# Example run: per-business categories from the "sun city" business.json
# file, showing the first rows.
show_business_categories = business_categories_matrix("data/sun city/business.json")
show_business_categories.head()    

Unnamed: 0_level_0,categories
business_id,Unnamed: 1_level_1
ol2r325YnfuHcq7yeO3vdg,"Sandwiches, Fast Food, Restaurants"
GmIUJmlwf3fJxFm79EZNgw,"Shopping, Pawn Shops"
O4TYqnhpXpyrgJBkucaeaw,"Chicken Wings, Fast Food, Chicken Shop, Restau..."
rIhu4bA2CmSicOSJmTpGug,"Restaurants, Food, Bakeries"
2y2d1wJrZAhHI2J7o2ndtg,"Venues & Event Spaces, Local Services, Event P..."


In [13]:
def prep_review(review, user, business, x, y):
    """
    Make new columns containing the number of reviews associated with the user_id and the business_id for each review.
    Then remove the reviews that do not at least match given values for these columns
    Finally, add the attributes and categories data from business.json to each review
    Remove the temporary count columns to clean up

    review: path to the review.json file
    user: path to the user.json file
    business: path to the business.json file
    x: reviews written by a user whose review_count is below x are removed
    y: reviews of a business whose review_count is below y are removed

    Returns a dataframe with the columns business_id, review_id, stars,
    user_id, attributes and categories.
    Raises KeyError when a review refers to a user_id or business_id that
    is missing from the user / business data.
    """
    reviews_out = remove_irrelevant_columns_review(load_review(review))
    user_counts = user_review_matrix(user)['review_count']
    business_counts = business_review_matrix(business)['review_count']
    attributes = business_attributes_matrix(business)['attributes']
    categories = business_categories_matrix(business)['categories']

    # Look up all counts in one vectorized label lookup instead of one
    # .loc call per review. Selecting by a list of labels still raises
    # KeyError for an unknown ID, as the per-row lookup did. to_numpy()
    # drops the ID index so the values are assigned positionally.
    reviews_out['user_review_count'] = user_counts.loc[reviews_out['user_id']].to_numpy()
    reviews_out['business_review_count'] = business_counts.loc[reviews_out['business_id']].to_numpy()

    # remove the reviews based on unacceptable users or businesses
    too_few_user = reviews_out['user_review_count'] < x
    too_few_business = reviews_out['business_review_count'] < y
    # .copy() gives an independent frame so the column assignments below
    # do not write into a slice of the unfiltered data.
    reviews_out = reviews_out.loc[~(too_few_user | too_few_business)].copy()

    # attach the business attributes and categories to the surviving reviews
    kept_business_ids = reviews_out['business_id']
    reviews_out['attributes'] = attributes.loc[kept_business_ids].to_numpy()
    reviews_out['categories'] = categories.loc[kept_business_ids].to_numpy()

    # remove the temporary count columns
    reviews_out = reviews_out.reindex(columns=['business_id', 'review_id', 'stars', 'user_id', 'attributes', 'categories'])

    return reviews_out
        
# Example run: prepare the "sun city" reviews, passing 10 as both the user
# and the business review-count threshold, and display the first rows.
show_test = prep_review("data/sun city/review.json", "data/sun city/user.json", "data/sun city/business.json", 10, 10)
show_test.head()                

Unnamed: 0,business_id,review_id,stars,user_id,attributes,categories
0,PNzir9TtJAD7U41GwR98-w,IufXUZcwGabRNyRvnFJNzg,5,DAIpUGIsY71noX0wNuc27w,"{'BikeParking': 'False', 'BusinessAcceptsCredi...","Restaurants, Thai"
3,rIhu4bA2CmSicOSJmTpGug,wndb4QQ5vWF40z84iWPFvQ,3,Z1wf19FVzvR57O0T-CdBYw,"{'BusinessAcceptsCreditCards': 'True', 'Busine...","Restaurants, Food, Bakeries"
4,S9BSFX03TBqAHFF1M4c08g,UXok8HepAp9Uh1NEDvboDg,5,HX4IwDr-uvlXVE21lXbrjg,{'BusinessAcceptsCreditCards': 'True'},"Automotive, Auto Repair"
6,PNzir9TtJAD7U41GwR98-w,XFFzBW1p8P7ug-OeVKELZg,5,liWnObQxCckRGAyFbaI7oQ,"{'BikeParking': 'False', 'BusinessAcceptsCredi...","Restaurants, Thai"
7,PNzir9TtJAD7U41GwR98-w,9U6O3SWKEPH52jpHyy1NJg,5,gKX5BAYV81HOtBqBvhuQvg,"{'BikeParking': 'False', 'BusinessAcceptsCredi...","Restaurants, Thai"
