In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder

In [2]:
# Load the restaurant rating dataset (the path is relative to the notebook's working directory)
df_rating = pd.read_csv("data/full_restaurant_rating_data.csv")

In [3]:
# check: list the distinct price_category values (NaN shows up for missing entries)
df_rating['price_category'].unique()

array(['mid range', 'fine dining', 'cheap eats', nan], dtype=object)

In [4]:
# Indicator column: 1 when a price category is recorded, 0 when it is missing
price_category_known = df_rating['price_category'].notna()
df_rating["has_price_category"] = np.where(price_category_known, 1, 0)

In [5]:
# check: rows flagged 0 should only ever carry a missing price_category
df_rating.loc[df_rating["has_price_category"] == 0, "price_category"].unique()

array([nan], dtype=object)

In [6]:
# check: rows flagged 1 should never carry a missing price_category
df_rating.loc[df_rating["has_price_category"] == 1, "price_category"].unique()

array(['mid range', 'fine dining', 'cheap eats'], dtype=object)

In [7]:
# Ordinal scale for the price tier: 0 is reserved for a missing category,
# the named tiers follow in ascending order.
category_mapper = dict(zip([np.nan, "cheap eats", "mid range", "fine dining"], range(4)))
price_rank = df_rating["price_category"].map(category_mapper)
df_rating["ordinal_price_category"] = price_rank

In [8]:
# check
df_rating["ordinal_price_category"].unique()

array([2, 3, 1, 0], dtype=int64)

Nearby Attraction

In [9]:
# Collapse individual landmarks into broader attraction groups.
# Each group label maps to the raw landmark names it absorbs.
attraction_groups = {
    "Museum": ['Asian Civilisations Museum', 'National Museum of Singapore', 'City Hall Building'],
    "Garden": ['National Orchid Garden', 'Singapore Botanic Gardens'],
    "Observation": ['Singapore Cable Car', 'Singapore Flyer'],
    "Wildlife": ['Night Safari', 'Singapore Zoo'],
}
# Flatten to a landmark -> group lookup so a single replace handles every group
landmark_to_group = {
    landmark: group
    for group, landmarks in attraction_groups.items()
    for landmark in landmarks
}
df_rating["nearby_attraction"] = df_rating['nearby_attraction'].replace(landmark_to_group)

In [10]:
# Normalise labels to snake_case-like slugs: spaces become underscores, then lower-case
df_rating['nearby_attraction'] = (
    df_rating['nearby_attraction']
    .str.replace(" ", "_")
    .str.lower()
)

In [11]:
# check: number of non-null urls per attraction group; dropna=False keeps the
# group of rows without a nearby attraction visible in the result
url_count_by_attraction = df_rating.groupby(['nearby_attraction'], dropna=False)['url'].count()
url_count_by_attraction

nearby_attraction
arab_street         257
boat_quay           165
chinatown          1290
east_coast_park     948
garden             1272
little_india       1293
marina_bay          455
merlion_park        370
museum              502
observation         517
orchard_road        741
wildlife            533
NaN                2818
Name: url, dtype: int64

In [12]:
# Median distance to the attraction, per attraction group. groupby drops the NaN
# key by default, so rows without a nearby attraction have no entry in the dict.
median_by_attraction = (
    df_rating.groupby('nearby_attraction')['distance_from_attraction']
    .median()
    .to_dict()
)

# Look up each row's group median and compare column-wise; this replaces a
# row-by-row apply(axis=1) and the unused min/max aggregates.
attraction_median = df_rating['nearby_attraction'].map(median_by_attraction)
is_above_median = df_rating['distance_from_attraction'] > attraction_median

# 1.0 / 0.0 where the attraction is known, NaN where it is missing.
# NOTE: a missing distance inside a known group compares as False, i.e. 0.0.
df_rating["above_median"] = is_above_median.astype(float).where(df_rating['nearby_attraction'].notna())

In [13]:
# check: distinct values of the above_median flag after the cast to float
df_rating["above_median"].unique()

array([ 1.,  0., nan])

In [14]:
from sklearn.preprocessing import OneHotEncoder

# nearby_attraction is a categorical variable, so it is one-hot encoded.
# Fit first, then transform the same single-column frame.
attraction_frame = df_rating[['nearby_attraction']]
nearby_attraction_enc = OneHotEncoder(categories='auto')
nearby_attraction_enc.fit(attraction_frame)
features_nearby_attraction_train = nearby_attraction_enc.transform(attraction_frame)

# TODO transform on test

In [15]:
# Column names produced by the fitted encoder for the nearby_attraction feature
nearby_attraction_ohe_columns = nearby_attraction_enc.get_feature_names_out(['nearby_attraction'])

# Reuse df_rating's index: pd.concat(axis=1) aligns on index labels, so a frame
# carrying a fresh RangeIndex would be misaligned (NaN-padded rows) whenever
# df_rating's index is not the default 0..n-1, e.g. after filtering rows.
nearby_attraction_ohe_df = pd.DataFrame(
    features_nearby_attraction_train.toarray(),
    columns=nearby_attraction_ohe_columns,
    index=df_rating.index,
)
bucket_ohe_df = pd.concat([df_rating.drop(['nearby_attraction'], axis=1), nearby_attraction_ohe_df], axis=1)

# Guard against silent misalignment: the join must not add or drop rows
assert len(bucket_ohe_df) == len(df_rating), "one-hot columns are misaligned with df_rating"

In [16]:
# Rows without a nearby attraction carry NaN in above_median; treat them as 0
bucket_ohe_df["above_median"] = bucket_ohe_df["above_median"].fillna(0)

# Interaction terms: attraction indicator x above-median flag.
# The indicator for a missing attraction is skipped.
attraction_indicator_cols = [
    name for name in nearby_attraction_ohe_columns if name != 'nearby_attraction_nan'
]
cross_cols = [name + "_above_median" for name in attraction_indicator_cols]
for col, cross in zip(attraction_indicator_cols, cross_cols):
    bucket_ohe_df[cross] = bucket_ohe_df[col] * bucket_ohe_df["above_median"]

Sanity Checks

In [17]:
# check: print the distinct values of each one-hot indicator column
for ohe_col in nearby_attraction_ohe_columns:
    print(bucket_ohe_df[ohe_col].unique())

[0. 1.]
[0. 1.]
[1. 0.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]


In [18]:
# check: print the distinct values of each interaction column
for cross_col in cross_cols:
    print(bucket_ohe_df[cross_col].unique())

[0. 1.]
[0. 1.]
[1. 0.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]
[0. 1.]


In [19]:
# check: list every column of the combined frame, including the one-hot and interaction columns
bucket_ohe_df.columns

Index(['name', 'address', 'region', 'latitude', 'longitude',
       'distance_from_attraction', 'price_category', 'price_range',
       'mon_operating_hours', 'tue_operating_hours', 'wed_operating_hours',
       'thu_operating_hours', 'fri_operating_hours', 'sat_operating_hours',
       'sun_operating_hours', 'is_travellors_choice', 'is_michellin', 'about',
       'list_of_meals', 'list_of_cuisines', 'list_of_features',
       'list_of_special_diets', 'num_of_reviews', 'num_of_rating_1',
       'num_of_rating_2', 'num_of_rating_3', 'num_of_rating_4',
       'num_of_rating_5', 'food_rating', 'service_rating', 'value_rating',
       'atmosphere_rating', 'overall_rating', 'url', 'has_price_category',
       'ordinal_price_category', 'above_median',
       'nearby_attraction_arab_street', 'nearby_attraction_boat_quay',
       'nearby_attraction_chinatown', 'nearby_attraction_east_coast_park',
       'nearby_attraction_garden', 'nearby_attraction_little_india',
       'nearby_attraction_mar