In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import sqlalchemy as db
import csv

from pathlib import Path
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect, Integer, Table, Column, MetaData
from sklearn.model_selection import train_test_split

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [7]:
# Create SQLite db
restaurants_db = "restaurants.sqlite"
connector = (f"sqlite:///{restaurants_db}")

# Create engine
engine = create_engine(connector)

# Declare a Base using `automap_base()`
Base = automap_base()

# Use the Base class to reflect the database tables
Base.prepare(engine, reflect=True)

# Print all of the classes (tables) mapped to the Base
Base.classes.keys()

# Create the inspector and connect it to the engine
inspector = inspect(engine)


In [8]:
def str_to_bool(df, col):
#     df[col].notnull()
    return col.map({'True': True, 'yes': True, 1: True, 0: False, 'False': False, 'Na':False, 'no':False,})

# Load business attributes into df
attributes_path = Path('../data_zipped/yelp_business_attributes.csv.zip', low_memory=False)
attributes_df = pd.read_csv(attributes_path)

# Rearrange column names (not columns) due to defect in source data set
col_list = list(attributes_df)
col_list.insert(2, col_list.pop())
attributes_df.columns = col_list

string_cols = ["Alcohol","NoiseLevel","RestaurantsAttire","WiFi","Smoking","BYOBCorkage"]

# Convert 'Na' to 0 and then convert col to int
# Column: RestaurantsPriceRange2, Values: ['Na' '1' '3' '2' '4']
attributes_df["RestaurantsPriceRange2"] = attributes_df["RestaurantsPriceRange2"].replace(to_replace='Na', value=0)

# Convert remaining "Na" to "no", affects these columns:
attributes_df[string_cols] = attributes_df[string_cols].replace(to_replace='Na', value='no')

# For column in the dataframe:
for col in col_list[1:]:
    # Skip category columns for string conversion
    if col not in string_cols:
        # Skip columns already typed boolean 
        if col not in list(attributes_df.select_dtypes(include=['bool']).columns):
            # Convert to boolean
            if(attributes_df.loc[:,col].nunique()) <= 3:
                attributes_df[col] = str_to_bool(attributes_df,attributes_df[col])

    # Eliminate columns with only one distinct value
    if(attributes_df.loc[:,col].nunique()) == 1:
        attributes_df.pop(col)

In [9]:
def reduce_dims(search_string, df):
    # Create a dataframe as a lookup table 
    type_col_df = pd.DataFrame((df.filter(regex=search_string).columns))

    # Create a description column for mapping
    type_col_df["description"] = type_col_df[0].str.replace(search_string,"")
    type_col_df["description"] = type_col_df[0].str.replace('_',"")

    # Create an identity column
    type_col_df['id'] = type_col_df.index

    # Reorder columns so key is in the first position
    type_col_df.insert(0, 'id', type_col_df.pop('id'))

    # Increment by 1 so remaining "false" values can be set to 0 as a placeholder
    type_col_df["id"] = type_col_df["id"] + 1

    # Rename column 0
    type_col_df.rename(columns = {0:'column_name'}, inplace = True)

    # Derive new column names
    new_attr_id = search_string + "TypeId"
    new_attr_type = search_string + "Type"
    
    # for each unique column matching the search string type
    for index, row in type_col_df.iterrows():
        # set a new column with the value of the id from the type columns dataframe
        df.loc[df[row['column_name']] == True, new_attr_id] = row["id"]
        df[new_attr_id] = df[new_attr_id].fillna(0).astype("int")

        # set a new column with the value of the description from the type columns dataframe
        df.loc[df[row['column_name']] == True, new_attr_type] = row["description"]
        df[new_attr_type] = df[new_attr_type].fillna("na")

    # Drop reduced columns
    df = df.drop(type_col_df["column_name"], axis=1)
    
    type_col_df = type_col_df.pop("column_name")

    # Write lookup table to database
    type_col_df.to_sql(new_attr_type, engine, if_exists='replace')
    
    return df


In [10]:
search_strings = ["BusinessParking","Music","Restaurants","GoodForMeal","DietaryRestrictions","BestNights"]
for str in search_strings:
    attributes_df = reduce_dims(str, attributes_df)


In [13]:
# attributes_df
# attributes_df.BestNightsTypeId.unique()
attributes_df.BestNightsType.unique()

array(['na', 'BestNightssaturday', 'BestNightsfriday',
       'BestNightstuesday', 'BestNightsthursday'], dtype=object)

In [None]:
def excl_substr_filter(string, substr):
    return [str for str in string if not 
             any(sub in str for sub in substr)]

# Encode the remaining string columns
encode_list = list(attributes_df.select_dtypes(include=['object']).columns)
substr = ["Type", "Id"]
encode_list = excl_substr_filter(encode_list, substr)
# encode_list.remove("change_date")

print(encode_list)

for col in encode_list:
    attributes_df = encode_cols(col, attributes_df)

['business_id', 'Alcohol', 'NoiseLevel', 'WiFi', 'Smoking', 'BYOBCorkage', 'AgesAllowed']


In [2]:
# # Create SQLite db
# restaurants_db = "restaurants.sqlite"
# connector = (f"sqlite:///{restaurants_db}")

# # Create engine
# engine = create_engine(connector)

# # Create metadata access to the object
# meta_data = db.MetaData(bind=engine)
# db.MetaData.reflect(meta_data)

# # Get the tables from the metadata object
# attributes = meta_data.tables["RestaurantAttributes"]
# restaurants = meta_data.tables["Restaurants"]


In [3]:
# # Select data into dataframe
# sql_join = r"select RestaurantAttributes.*,stars,review_count,is_mexican_restaurant from RestaurantAttributes inner join Restaurants on RestaurantAttributes.business_id = Restaurants.business_id"

# restaurants_df = pd.read_sql(sql_join, engine)
# restaurants_df.pop("business_id")
# restaurants_df.pop("change_date")
# restaurants_df.head()

Unnamed: 0,ByAppointmentOnly,BusinessAcceptsCreditCards,GoodForKids,WheelchairAccessible,BikeParking,Alcohol,HasTV,NoiseLevel,Caters,WiFi,...,RestaurantsType,GoodForMealTypeId,GoodForMealType,DietaryRestrictionsTypeId,DietaryRestrictionsType,BestNightsTypeId,BestNightsType,stars,review_count,is_mexican_restaurant
0,0,0,0,1,0,no,0,no,0,no,...,RestaurantsDeliveryId,12,GoodForMealbrunchId,14,DietaryRestrictionsvegetarianId,12,BestNightssaturdayId,4.5,3,0
1,0,0,0,0,0,no,0,no,0,no,...,RestaurantsDeliveryId,12,GoodForMealbrunchId,14,DietaryRestrictionsvegetarianId,12,BestNightssaturdayId,3.0,12,0
2,0,0,0,0,0,no,0,no,0,no,...,RestaurantsDeliveryId,12,GoodForMealbrunchId,14,DietaryRestrictionsvegetarianId,12,BestNightssaturdayId,3.5,6,0
3,0,0,1,0,0,no,0,no,0,no,...,RestaurantsDeliveryId,12,GoodForMealbrunchId,14,DietaryRestrictionsvegetarianId,12,BestNightssaturdayId,3.0,8,0
4,0,0,0,1,0,no,0,no,0,no,...,RestaurantsDeliveryId,12,GoodForMealbrunchId,14,DietaryRestrictionsvegetarianId,12,BestNightssaturdayId,3.5,25,0


In [6]:
# search_strings = ["BusinessParking","Music","Restaurants","GoodForMeal","DietaryRestrictions","BestNights"]
# for str in search_strings:
#     restaurants_df = reduce_dims(str, restaurants_df)
    
# # Encode the remaining string columns
# encode_list = list(restaurants_df.select_dtypes(include=['object']).columns)
# print(encode_list)

# for col in encode_list:
#     restaurants_df = encode_cols(col, restaurants_df)

[]


In [7]:
# with pd.option_context('display.max_rows', None,):
#     print(restaurants_df.nunique())
    
# restaurants_df.info()
# restaurants_df.head()

ByAppointmentOnly               1
BusinessAcceptsCreditCards      1
GoodForKids                     1
WheelchairAccessible            2
BikeParking                     1
Alcohol                         4
HasTV                           1
NoiseLevel                      4
Caters                          1
WiFi                            3
HappyHour                       1
GoodForDancing                  1
OutdoorSeating                  1
CoatCheck                       1
Smoking                         3
DriveThru                       1
BYOBCorkage                     3
stars                           9
review_count                  375
is_mexican_restaurant           2
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13848 entries, 0 to 13847
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ByAppointmentOnly           13848 non-null  int64  
 1   BusinessAcceptsCreditC

Unnamed: 0,ByAppointmentOnly,BusinessAcceptsCreditCards,GoodForKids,WheelchairAccessible,BikeParking,Alcohol,HasTV,NoiseLevel,Caters,WiFi,HappyHour,GoodForDancing,OutdoorSeating,CoatCheck,Smoking,DriveThru,BYOBCorkage,stars,review_count,is_mexican_restaurant
0,2,2,2,1,2,1,2,1,2,1,2,2,2,2,1,2,1,4.5,3,0
1,2,2,2,2,2,1,2,1,2,1,2,2,2,2,1,2,1,3.0,12,0
2,2,2,2,2,2,1,2,1,2,1,2,2,2,2,1,2,1,3.5,6,0
3,2,2,2,2,2,1,2,1,2,1,2,2,2,2,1,2,1,3.0,8,0
4,2,2,2,1,2,1,2,1,2,1,2,2,2,2,1,2,1,3.5,25,0


In [8]:
# restaurants_df.stars.value_counts()

3.5    3581
4.0    3194
3.0    2950
2.5    1485
4.5    1363
2.0     678
5.0     333
1.5     209
1.0      55
Name: stars, dtype: int64

In [9]:
# columns = restaurants_df.columns.values.tolist()
# columns.remove("stars")

# target = ["stars"]


In [23]:
def encode_cols (col_name, df):
    '''
    This function dynamically encodes dimensions by taking in a dataframe and a column name. A 
    lookup table mapping the unique values to ID's and description is created. The Id's and 
    descriptions are then applied to the original column in the dataframe.
    '''
    # Create a dataframe as a lookup table 
    type_col_df = pd.DataFrame(df[col_name].unique())

    # Create an identity column
    type_col_df['id'] = type_col_df.index

    # Reorder columns so key is in the first position
    type_col_df.insert(0, 'id', type_col_df.pop('id'))

    # Increment by 1 so remaining "false" values can be set to 0 as a placeholder
    type_col_df["id"] = type_col_df["id"] + 1

    # Rename column 0
    type_col_df.rename(columns = {0:'description'}, inplace = True)

    new_attr_type = col_name + "Type"
    
    # Write lookup table to database
    type_col_df.to_sql(new_attr_type, engine, if_exists='replace')

    # for each unique column matching the search string type
    for index, row in type_col_df.iterrows():
        # set the column to the value of the id from the type columns dataframe
        df.loc[df[col_name] == row["description"], col_name] = row["id"]
        df[col_name] = df[col_name].fillna(0)

    return df