In [None]:
### IMPORTS ###

In [None]:
# Libraries
import pandas as pd 
import numpy as np 
import load_datasets
import re
import ast
import geopandas as gpd
from shapely.geometry import Point

In [None]:
# County Data - read the US county shapefile and keep only California's counties
counties = gpd.read_file("resources/cb_2018_us_county_500k.shp")
is_california = counties["STATEFP"] == "06"  # state FIPS code 06 = California
counties_ca = (
    counties.loc[is_california]
    .sort_values("NAME")
    .reset_index(drop=True)
)
# After the alphabetical sort the fresh 0..N-1 index doubles as a stable county id
counties_ca["COUNTY_NUM"] = counties_ca.index

In [None]:
# Import data into notebook
# `load_datasets` is a project-local module whose implementation is not shown here.
# The three returned tables are used below with len() and .iloc[0, :], so they are
# presumably pandas DataFrames - TODO confirm against load_datasets.
cafes, users, reviews = load_datasets.load_table_data()
# Builds three lookup structures from the reviews table. Judging by the names these are
# a ratings lookup plus user->cafes and cafe->users mappings - verify in load_datasets.
ratings, user2cafes, cafes2users = load_datasets.create_user_review_dicts(reviews)

In [2]:
### SECTION 1 - DATA ANALYTICS ###
# This section looks at various statistics about the data 

In [106]:
# Basics:

# Row counts for the user, cafe and review tables
for label, table in (
    ("Number of Users: ", users),
    ("Number of Cafes: ", cafes),
    ("Number of Reviews: ", reviews),
):
    print(label, len(table))

# First record of each table, to show which fields are available
for heading, table in (
    ("\nUSER EXAMPLE DATA: \n", users),
    ("CAFE EXAMPLE DATA: \n", cafes),
    ("REVIEW EXAMPLE DATA: \n", reviews),
):
    print(heading, table.iloc[0, :], "\n")

Number of Users:  196454
Number of Cafes:  15576
Number of Reviews:  1769673

USER EXAMPLE DATA: 
 user_id        100000041656879737279
num_reviews                        7
Name: 0, dtype: object 

CAFE EXAMPLE DATA: 
 gmap_id                       0x80dc976f028eb61d:0x1a5ed32889a67122
name                                                       Circle K
latitude                                                  33.689862
longitude                                               -117.376151
category          ['Convenience store', 'ATM', 'Coffee shop', 'C...
avg_rating                                                      3.5
num_of_reviews                                                   24
price                                                             $
hours             [['Thursday', 'Open 24 hours'], ['Friday', 'Op...
Name: 0, dtype: object 

REVIEW EXAMPLE DATA: 
 gmap_id                  0x80dc976f028eb61d:0x1a5ed32889a67122
user_id                                  10898487406889310

In [107]:
# Functions:

# Average rating
def avg_rating(df):
    """Return the arithmetic mean of the values in ``df['rating']``.

    ``df`` is anything indexable by ``'rating'`` whose value can be summed
    and measured with ``len``. An empty column raises ``ZeroDivisionError``.
    """
    ratings = df['rating']
    total = sum(ratings)
    return total / len(ratings)

# Average price
def avg_price(df):
    """Return the mean price level of the rows in ``df``.

    The ``price`` column stores the level as a run of currency symbols
    ("$", "$$", ...), so the level of a row is the length of that string.
    Summing the raw strings, as the previous version did, raises TypeError.
    Values that are already numeric are used as they are.

    Missing prices (None / NaN) are skipped. When no row has a price the
    result is NaN.
    """
    levels = [
        len(price) if isinstance(price, str) else price
        for price in df['price'].dropna()
    ]
    if not levels:
        return float('nan')
    return sum(levels) / len(levels)

# Converts time text to values
def parse_time(t):
    """Convert a 12-hour clock string to fractional hours since midnight.

    Accepts "h", "hh" or "hh:mm" followed by AM/PM, case-insensitively, with
    optional whitespace around the text and before the period marker
    (e.g. "6AM", "6:30 pm"). "12AM" is midnight (0.0) and "12PM" is noon (12.0).

    Returns:
        float in the range [0, 24).

    Raises:
        ValueError: if the text is not exactly one time in that format, or the
            hour is outside 1-12, or the minutes are outside 0-59.
    """
    t = t.strip().upper()

    # fullmatch so that trailing junk ("10AMX") is rejected rather than ignored
    m = re.fullmatch(r"(\d{1,2})(?::(\d{2}))?\s*(AM|PM)", t)
    if not m:
        raise ValueError(f"Invalid time format: {t}")

    hour = int(m.group(1))
    minute = int(m.group(2) or 0)
    period = m.group(3)

    # A 12-hour clock has no hour 0 or 13+; without this check "13PM" became 25.0
    if not 1 <= hour <= 12 or minute > 59:
        raise ValueError(f"Invalid time format: {t}")

    # Convert to 24-hour: 12AM -> 0, 1PM..11PM -> 13..23, 12PM stays 12
    if period == "AM":
        if hour == 12:
            hour = 0
    elif hour != 12:
        hour += 12

    return hour + minute / 60.0

# County Mapping Based on Latitude & Longitude
def get_county(lat, lon):
    """Return the COUNTY_NUM of the first county in ``counties_ca`` that
    contains the given coordinate, or None when no county contains it."""
    location = Point(lon, lat)  # note the order: x = longitude, y = latitude
    inside = counties_ca.contains(location)
    hits = counties_ca[inside]

    if len(hits) == 0:
        return None
    first_hit = hits.iloc[0]
    return int(first_hit["COUNTY_NUM"])


In [None]:
### SECTION 2 - DATA PROCESSING ###
# This section processes the data to prepare it for the model

In [108]:
# Bin Encoding for Hours Throughout Week

# Separators that may sit between the opening and closing time. The first entry is the
# literal the notebook originally split on (an en dash whose UTF-8 bytes were decoded as
# cp1252); the next two rebuild that mis-decoding for en/em dashes programmatically so
# the match does not depend on this file's own encoding. Real dashes are accepted too.
_HOURS_RANGE_SEPARATORS = (
    "â€“",
    "\u2013".encode("utf-8").decode("cp1252"),
    "\u2014".encode("utf-8").decode("cp1252"),
    "\u2013",
    "\u2014",
    "-",
)
_HOURS_RANGE_SPLIT = re.compile("|".join(re.escape(s) for s in _HOURS_RANGE_SEPARATORS))


def hours_to_onehot(datum):
    """Encode weekly opening hours as a 168-wide 0/1 vector (24 hours x 7 days).

    Index ``day * 24 + hour`` is 1 when the place is open at any point during
    that hour, with Monday = 0 ... Sunday = 6.

    Args:
        datum: mapping/row whose ``'hours'`` value is the string repr of a list
            of ``[day_name, text]`` pairs, where text is "Closed",
            "Open 24 hours" or "<open><dash><close>" such as "6AM-10PM".
            An already-parsed list/tuple is accepted as well.

    Returns:
        list of 168 ints. All zeros when ``'hours'`` is missing (NaN / None),
        mirroring how price_to_onehot treats a missing price.

    Raises:
        KeyError: for an unknown day name.
        ValueError: for a range that does not split into exactly two times, or
            a time that parse_time rejects.
    """
    feature_hours = [0]*24*7        # Feature vector is 168 wide (24 hours by 7 days)

    raw_hours = datum['hours']
    if isinstance(raw_hours, str):
        hours = ast.literal_eval(raw_hours)
    elif isinstance(raw_hours, (list, tuple)):
        hours = raw_hours
    else:
        # Missing value (NaN / None): nothing known, so no hour is marked open
        return feature_hours

    weekday_map = {
        "Monday" : 0,
        "Tuesday" : 1,
        "Wednesday" : 2,
        "Thursday" : 3,
        "Friday" : 4,
        "Saturday" : 5,
        "Sunday" : 6
    }

    for entry in hours:
        day = weekday_map[entry[0]] # Gets day of week number
        status = entry[1]

        # Closed, skip entry
        if status == "Closed":
            continue

        # Open 24 hours, all ones
        if status == "Open 24 hours":
            for d in range(day*24, day*24+24):
                feature_hours[d] = 1
            continue

        # Converts entry to range of hours
        parts = _HOURS_RANGE_SPLIT.split(status)
        if len(parts) != 2:
            raise ValueError(f"Unrecognized hours range: {status!r}")
        open_str, close_str = parts
        start_hr = int(np.floor(parse_time(open_str)))
        end_hr = int(np.ceil(parse_time(close_str)))

        if end_hr > start_hr:
            same_day, next_day = range(start_hr, end_hr), range(0)
        else:
            # Closing at or after midnight ("6AM-12AM", "6PM-2AM"): a plain
            # range(start, end) would be empty and drop the whole interval.
            # Fill to the end of this day and carry the rest into the next one.
            same_day, next_day = range(start_hr, 24), range(0, end_hr)

        for hr in same_day:
            feature_hours[day * 24 + hr] = 1

        following_day = (day + 1) % 7   # Sunday night spills into Monday
        for hr in next_day:
            feature_hours[following_day * 24 + hr] = 1

    return feature_hours


# One Hot Encoding for Price
def price_to_onehot(datum):
    """One-hot encode the price level ("$", "$$", "$$$") into a 3-wide vector.

    The level is the number of characters in ``datum['price']``. A missing
    price (NaN, None, empty string, or any other non-string) yields all zeros.
    The previous ``is not np.nan`` identity test only recognised the np.nan
    singleton, so other NaN objects and None crashed on ``len()``.

    Levels above 3 (e.g. "$$$$") are folded into the top bucket so that the
    vector width stays fixed instead of raising IndexError.
    """
    feature_price = [0]*3
    price = datum['price']
    if isinstance(price, str):
        level = len(price.strip())
        if level > 0:
            # Clamp to the last bucket; level 0 is skipped so it cannot wrap to index -1
            feature_price[min(level, len(feature_price)) - 1] = 1
    return feature_price

# One Hot Encoding for County
def county_to_onehot(datum):
    """One-hot encode the California county containing the datum's coordinates.

    Looks the county up with get_county(datum['latitude'], datum['longitude'])
    and sets that position in a 58-wide vector.

    Returns:
        list of 58 ints with a single 1, or all zeros when get_county returns
        None (the point lies in no county polygon). Previously that case
        raised TypeError from indexing the list with None.
    """
    # To check functionality, chose entry 863 which is SFO - should map to San Mateo County
    feature_county = [0]*58  # 58 counties in California

    index = get_county(datum['latitude'], datum['longitude'])
    if index is not None:
        feature_county[index] = 1

    return feature_county