In [1]:
import pandas as pd
import numpy as np
import sqlalchemy as db
import csv

from pathlib import Path
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect, Integer, Table, Column, MetaData

In [2]:
# Path of the SQLite database file and the matching SQLAlchemy URL
restaurants_db = "restaurants.sqlite"
connector = f"sqlite:///{restaurants_db}"

# Create engine
engine = create_engine(connector)

# Reflect the existing schema into a MetaData object.
# The engine is handed to reflect() instead of being bound to MetaData:
# MetaData(bind=...) was removed in SQLAlchemy 2.0, while this form works on 1.x and 2.x.
meta_data = db.MetaData()
meta_data.reflect(bind=engine)

# Get the tables from the metadata object
attributes = meta_data.tables["RestaurantAttributes"]
restaurants = meta_data.tables["Restaurants"]


In [3]:
def reduce_dims(search_string, df):
    '''
    Collapse a group of related flag columns into one integer id column.

    Every column of ``df`` whose name matches ``search_string`` (applied as a regular
    expression by ``DataFrame.filter``) gets a 1-based id in column order. A new
    ``<search_string>TypeId`` column holds, per row, the id of the matching column whose
    value equals True; rows where none is set get 0. If several matching columns are set
    on the same row, the last one in column order wins, so the reduction is lossy for
    such rows.

    The id/description lookup table is written to the ``<search_string>Type`` table
    through the module-level ``engine``, replacing any existing table of that name.
    The description is the column name with ``search_string`` and underscores removed.

    Parameters:
        search_string: pattern selecting the columns; also the prefix of the new names.
        df: dataframe holding the flag columns. It is not modified.

    Returns:
        A new dataframe without the matched columns and with the id column appended.
        When nothing matches, an unchanged copy is returned and nothing is written.
    '''
    # Columns to collapse, in their original order
    matched_cols = list(df.filter(regex=search_string).columns)
    if not matched_cols:
        return df.copy()

    # Ids start at 1 so that 0 stays free as the "no flag set" placeholder
    type_ids = list(range(1, len(matched_cols) + 1))

    # Strip the prefix first and the underscores second; both steps must chain,
    # otherwise the prefix ends up back in the description
    descriptions = [
        name.replace(search_string, "").replace("_", "") for name in matched_cols
    ]
    lookup_df = pd.DataFrame({"id": type_ids, "description": descriptions})

    # Derive new column / table names
    new_attr_id = search_string + "TypeId"
    new_attr_type = search_string + "Type"

    # Start from the placeholder and overwrite with the id of each set flag
    row_ids = np.zeros(len(df), dtype="int")
    for type_id, name in zip(type_ids, matched_cols):
        is_set = (df[name] == True).to_numpy(dtype=bool, na_value=False)  # noqa: E712
        row_ids[is_set] = type_id

    # Drop reduced columns and append the collapsed id column
    reduced_df = df.drop(columns=matched_cols)
    reduced_df[new_attr_id] = row_ids

    # Write the id/description lookup table to the database
    lookup_df.to_sql(new_attr_type, engine, if_exists='replace')

    return reduced_df


In [4]:
def encode_cols(col_name, df):
    '''
    Replace the values of one column with integer ids and store the id lookup table.

    The distinct non-null values of ``df[col_name]`` are numbered from 1 in order of
    first appearance. The id/description lookup table is written to the
    ``<col_name>Type`` table through the module-level ``engine``, replacing any existing
    table of that name. Missing values are encoded as 0 and do not get a lookup row.

    Parameters:
        col_name: name of the column to encode.
        df: dataframe containing that column. It is not modified.

    Returns:
        A copy of ``df`` in which ``col_name`` holds the integer ids.
    '''
    # Distinct values in order of first appearance; nulls are excluded so they neither
    # consume an id nor show up as a meaningless row in the lookup table
    categories = df[col_name].dropna().unique()

    # Ids start at 1 so that 0 stays free as the placeholder for missing values
    type_ids = list(range(1, len(categories) + 1))
    lookup_df = pd.DataFrame({"id": type_ids, "description": categories})

    new_attr_type = col_name + "Type"

    # Write lookup table to database
    lookup_df.to_sql(new_attr_type, engine, if_exists='replace')

    # Translate all values in one pass. Mapping from the original values avoids the
    # collision that in-place, value-by-value replacement can cause when an id that
    # was already written equals a raw value handled later.
    id_by_value = dict(zip(categories, type_ids))
    encoded_df = df.copy()
    encoded_df[col_name] = df[col_name].map(id_by_value).fillna(0).astype("int")

    return encoded_df

In [5]:
# Join every attribute row with the rating columns of its restaurant
sql_join = r"select RestaurantAttributes.*,stars,review_count,is_mexican_restaurant from RestaurantAttributes inner join Restaurants on RestaurantAttributes.business_id = Restaurants.business_id"

# Load the result and discard the join key and the change_date column
restaurants_df = pd.read_sql(sql_join, engine).drop(columns=["business_id", "change_date"])
restaurants_df.head()

Unnamed: 0,ByAppointmentOnly,BusinessAcceptsCreditCards,BusinessParking_garage,BusinessParking_street,BusinessParking_validated,BusinessParking_lot,BusinessParking_valet,RestaurantsPriceRange2,GoodForKids,WheelchairAccessible,...,DietaryRestrictions_dairy-free,DietaryRestrictions_gluten-free,DietaryRestrictions_vegan,DietaryRestrictions_kosher,DietaryRestrictions_halal,DietaryRestrictions_soy-free,DietaryRestrictions_vegetarian,stars,review_count,is_mexican_restaurant
0,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,4.5,3,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3.0,12,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3.5,6,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,3.0,8,0
4,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,3.5,25,0


In [6]:
## Dimensionality Reduction
# Collapse into categorical attributes:
    # BusinessParkingType:
        # BusinessParking_garage                 2
        # BusinessParking_street                 2
        # BusinessParking_validated              2
        # BusinessParking_lot                    2
        # BusinessParking_valet                  2
    # MusicType:
        # Music_dj                               2
        # Music_karaoke                          2        
    # Restaurants:
        # RestaurantsReservations                2
        # RestaurantsTakeOut                     2
        # RestaurantsDelivery                    2
        # RestaurantsGoodForGroups               2
    # GoodForMealType:
        # GoodForMeal_dessert                    2
        # GoodForMeal_latenight                  2
        # GoodForMeal_lunch                      2
        # GoodForMeal_dinner                     2
        # GoodForMeal_breakfast                  2
        # GoodForMeal_brunch                     2
    # DietaryRestrictionsType:
        # DietaryRestrictions_dairy-free         2
        # DietaryRestrictions_gluten-free        2
        # DietaryRestrictions_vegan              2
        # DietaryRestrictions_kosher             2
        # DietaryRestrictions_halal              2
        # DietaryRestrictions_soy-free           2
        # DietaryRestrictions_vegetarian         2
    # BestNightsType:
        # BestNights_monday                      2
        # BestNights_friday                      2
        # BestNights_wednesday                   2
        # BestNights_thursday                    2
        # BestNights_sunday                      2
        # BestNights_saturday                    2

search_strings = ["BusinessParking","Music","Restaurants","GoodForMeal","DietaryRestrictions","BestNights"]
# NOTE(review): the "Restaurants" pattern also matches RestaurantsPriceRange2, so that
# column is collapsed into RestaurantsTypeId and dropped together with the four flags
# listed above -- confirm this is intended.
# The loop variable is deliberately not called `str`: that would shadow the built-in
# for every cell executed afterwards in the same kernel.
for search_string in search_strings:
    restaurants_df = reduce_dims(search_string, restaurants_df)

# Encode the remaining string columns
encode_list = list(restaurants_df.select_dtypes(include=['object']).columns)
print(encode_list)

for col in encode_list:
    restaurants_df = encode_cols(col, restaurants_df)

['Alcohol', 'NoiseLevel', 'WiFi', 'Smoking', 'BYOBCorkage']


In [7]:
# Number of distinct values per column, printed without row truncation
unique_counts = restaurants_df.nunique()
with pd.option_context('display.max_rows', None):
    print(unique_counts)

# Column dtypes and non-null counts, followed by a preview of the first rows
restaurants_df.info()
restaurants_df.head()

ByAppointmentOnly               2
BusinessAcceptsCreditCards      2
GoodForKids                     2
WheelchairAccessible            2
BikeParking                     2
Alcohol                         4
HasTV                           2
NoiseLevel                      4
Caters                          2
WiFi                            3
HappyHour                       2
GoodForDancing                  2
OutdoorSeating                  2
CoatCheck                       2
Smoking                         3
DriveThru                       2
BYOBCorkage                     3
stars                           9
review_count                  375
is_mexican_restaurant           2
BusinessParkingTypeId           6
MusicTypeId                     3
RestaurantsTypeId               6
GoodForMealTypeId               7
DietaryRestrictionsTypeId       3
BestNightsTypeId                3
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13848 entries, 0 to 13847
Data columns (total 26 colu

Unnamed: 0,ByAppointmentOnly,BusinessAcceptsCreditCards,GoodForKids,WheelchairAccessible,BikeParking,Alcohol,HasTV,NoiseLevel,Caters,WiFi,...,BYOBCorkage,stars,review_count,is_mexican_restaurant,BusinessParkingTypeId,MusicTypeId,RestaurantsTypeId,GoodForMealTypeId,DietaryRestrictionsTypeId,BestNightsTypeId
0,0,0,0,1,0,1,0,1,0,1,...,1,4.5,3,0,4,0,0,0,0,0
1,0,0,0,0,0,1,0,1,0,1,...,1,3.0,12,0,2,0,0,0,0,0
2,0,0,0,0,0,1,0,1,0,1,...,1,3.5,6,0,0,0,3,0,0,0
3,0,0,1,0,0,1,0,1,0,1,...,1,3.0,8,0,0,0,0,0,0,0
4,0,0,0,1,0,1,0,1,0,1,...,1,3.5,25,0,4,0,0,0,0,0
