## Feature Engineering

In this notebook, we generate additional features that will help us come up with a more predictive model.

In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from datetime import datetime
import os

from sklearn.linear_model import LinearRegression

In [2]:
# Load the cleaned business listings and the cleaned reviews written by the
# earlier cleaning step.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load files produced by this project.
businesses_pickle = '../datasets/unprocessed_clean/businesses_data_cleaned.pkl'
reviews_pickle = '../datasets/unprocessed_clean/businesses_reviews_cleaned.pkl'
df_unprocessed, df_reviews = (
    pd.read_pickle(pickle_path) for pickle_path in (businesses_pickle, reviews_pickle)
)

In [3]:
# Preview the first rows of the business table
df_unprocessed.head()

Unnamed: 0,business_id,business_name,categories,city,full_address,display_phone,review_count,stars,price_tag,is_claimed,is_closed,image,url,latitude,longitude
0,5292a3ee-c57b-4c55-b882-1eebbafbb69c,Taverna Fevan,"[Mediterranean, European, Albanian]",Vlore County,"Palase, Dhermi Albania",+355 68 330 0908,134.0,4.0,2.5,True,False,"data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEK...",https://www.tripadvisor.com/Restaurant_Review-...,40.160217,19.628233
1,777dcd53-bef2-45f0-a4f3-aade07195536,Taverna Hibraj,"[Barbecue, European, Albanian]",Vlore County,"SH8, Llogara, Llogara National Park Albania",+355 69 616 1807,22.0,4.5,1.0,True,False,"data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEK...",https://www.tripadvisor.com/Restaurant_Review-...,40.20997,19.579586
2,7dadd1a8-f799-459b-bad1-dba165669780,Calme Palase,"[Bar, Cafe, European, Pub]",Vlore County,"Rruga E Plazhit Te Palases, Palase Albania",,3.0,2.5,1.0,False,False,"data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEK...",https://www.tripadvisor.com/Restaurant_Review-...,40.170204,19.583406
3,a6c050df-cba9-41b3-a07e-01581c7977cd,Ambel,"[Italian, Seafood]",Vlore County,"Rruga Perivolo, Dhermi Albania",+355 69 209 9890,1.0,5.0,1.0,False,False,"data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEK...",https://www.tripadvisor.com/Restaurant_Review-...,40.15139,19.63889
4,c722445d-a16f-49cf-a6a6-b4fd15c58f29,Brother's Grill Fastfood,"[Fast, Food, European, Grill, Albanian]",Vlore County,"Sh8, Himare 9425 Albania",+355 69 539 9818,56.0,5.0,1.0,True,False,"data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEK...",https://www.tripadvisor.com/Restaurant_Review-...,40.101753,19.74574


In [4]:
# Preview the first rows of the reviews table
df_reviews.head()

Unnamed: 0,user_id,business_id,review_id,review_date,review_title,review_text,rating,votes
0,UID_649D7F06246AE713EC03CC37D8E0F727-SRC_86857...,c65742e9-3028-4a4e-a17d-397810611bdc,868577404,2022-11-15,Delicious food in a lovely setting,We’ve been here a few times. This was the best...,5.0,0.0
1,UID_5C8A05641E93AF599D3A0E5065AD9C8C-SRC_85498...,c65742e9-3028-4a4e-a17d-397810611bdc,854983901,2022-08-18,Too bad,Dim lights and waiters in traditional costume ...,2.0,0.0
2,UID_28D535572F72A35D95F49E6A0A643807-SRC_80282...,c65742e9-3028-4a4e-a17d-397810611bdc,802820824,2021-08-09,What a find...,Stopped off here on the way from Skopje to Ohr...,5.0,0.0
3,UID_45EDCADA9084DAB445B72166A63C556B-SRC_69536...,c65742e9-3028-4a4e-a17d-397810611bdc,695360093,2019-08-04,Tasty!,We have meal there after visiting Monastery. T...,5.0,0.0
4,UID_37C2FB670F0B798CE66E1C24F7CDF308-SRC_69395...,c65742e9-3028-4a4e-a17d-397810611bdc,693953389,2019-07-29,Perfect end to the Monastery visit,After the trip to the Monastery we stopped in ...,5.0,0.0


In [5]:
# Show the column labels of the business table
df_unprocessed.columns

Index(['business_id', 'business_name', 'categories', 'city', 'full_address',
       'display_phone', 'review_count', 'stars', 'price_tag', 'is_claimed',
       'is_closed', 'image', 'url', 'latitude', 'longitude'],
      dtype='object')

Generating features using the reviews metadata

In [6]:
# Review-derived features, one value per business:
#   reviews_per_week  - number of reviews divided by the weeks elapsed since the first review
#   oldest_review     - weeks between the business's first review and the newest review in the dataset
#   std_of_stars      - standard deviation of the review ratings (NaN ratings ignored)
#   median_of_stars   - median of the review ratings (NaN ratings ignored)
#   votes_per_week    - total review votes divided by the same number of weeks
#   stars_linear_coef - slope of a linear fit of rating against review date (rating change per day)
# A business without any review gets 0.0 for every feature.
REVIEW_FEATURE_COLUMNS = [
    'reviews_per_week', 'oldest_review', 'std_of_stars',
    'median_of_stars', 'votes_per_week', 'stars_linear_coef',
]

regression = LinearRegression()

# NOTE: despite its name, this is the most recent review date in the whole dataset;
# it is the reference point from which the age of every business is measured.
oldest_review = df_reviews['review_date'].max()

# Split the reviews once instead of re-filtering the full table for every business.
reviews_by_business = {
    business_id: business_reviews
    for business_id, business_reviews in df_reviews.groupby('business_id')
}

review_feature_rows = []
for business_id in df_unprocessed['business_id']:
    # Start from the "no reviews" defaults so nothing leaks over from the previous business.
    features = dict.fromkeys(REVIEW_FEATURE_COLUMNS, 0.0)

    reviews = reviews_by_business.get(business_id)
    if reviews is not None:
        # Proxy for how long the business has been around
        weeks_since_first_review = (oldest_review - reviews['review_date'].min()).days / 7

        # A business whose first review falls on the reference date is 0 weeks old;
        # use one week (the same denominator as the no-review default) so the
        # per-week rates stay finite instead of dividing by zero.
        weeks_for_rates = weeks_since_first_review if weeks_since_first_review > 0 else 1.0

        # Slope of rating versus date; dates are converted to integer day ordinals
        day_ordinals = np.asarray([d.toordinal() for d in reviews['review_date']])
        regression.fit(day_ordinals.reshape(-1, 1), reviews['rating'].values.reshape(-1, 1))

        features['reviews_per_week'] = len(reviews) / weeks_for_rates
        features['oldest_review'] = weeks_since_first_review
        features['std_of_stars'] = np.nanstd(reviews['rating'])
        features['median_of_stars'] = np.nanmedian(reviews['rating'])
        # Series.sum() skips NaN votes, in line with the NaN-aware rating statistics above
        features['votes_per_week'] = reviews['votes'].sum() / weeks_for_rates
        features['stars_linear_coef'] = regression.coef_[0][0]

    review_feature_rows.append(features)

# Attach all features at once; rows were collected in the frame's own order,
# so the values are assigned positionally.
review_features = pd.DataFrame(
    review_feature_rows, index=df_unprocessed.index, columns=REVIEW_FEATURE_COLUMNS
).astype(float)
for column in REVIEW_FEATURE_COLUMNS:
    df_unprocessed[column] = review_features[column].to_numpy()

In [7]:
# Inspect the first business to check the newly added review features
df_unprocessed.head(1)

Unnamed: 0,business_id,business_name,categories,city,full_address,display_phone,review_count,stars,price_tag,is_claimed,...,image,url,latitude,longitude,reviews_per_week,oldest_review,std_of_stars,median_of_stars,votes_per_week,stars_linear_coef
0,5292a3ee-c57b-4c55-b882-1eebbafbb69c,Taverna Fevan,"[Mediterranean, European, Albanian]",Vlore County,"Palase, Dhermi Albania",+355 68 330 0908,134.0,4.0,2.5,True,...,"data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEK...",https://www.tripadvisor.com/Restaurant_Review-...,40.160217,19.628233,0.177365,84.571429,1.549193,5.0,0.153716,-0.000859


Now we will make a dictionary of restaurant categories that will help us calculate relative quantities. The keys are the restaurant categories and the values are the indices of the restaurants that belong in each category.

In [8]:
# Map each category to the index labels of the businesses that list it.
# setdefault creates the list on first sight of a category, so no exception
# handling is needed (a bare `except:` would also hide unrelated errors).
category_dict = {}
for i, business in df_unprocessed.iterrows():
    for category in business['categories']:
        category_dict.setdefault(category, []).append(i)

Here, we will calculate the restaurant density within a 1.6 km (1 mile) radius, along with other relative quantities. The z* quantities express a restaurant's value relative to the same quantity for the surrounding restaurants.

In [9]:
def distance(ilat, jlat, ilong, jlong):
    """Return the great-circle distance in meters between two points.

    Uses the haversine formula on a sphere of radius 6371 km.

    Note the argument order: both latitudes come first, then both longitudes.

    Parameters
    ----------
    ilat, jlat : float
        Latitudes of the first and second point, in degrees.
    ilong, jlong : float
        Longitudes of the first and second point, in degrees.

    Returns
    -------
    float
        Distance between the two points, in meters.
    """
    earth_radius_m = 6371.e3
    lat_i = math.radians(ilat)
    lat_j = math.radians(jlat)
    lat_diff = math.radians(jlat - ilat)
    long_diff = math.radians(jlong - ilong)

    # Haversine of the central angle between the two points
    haversine = (
        math.sin(lat_diff/2)**2
        + math.cos(lat_i)*math.cos(lat_j) * math.sin(long_diff/2)**2
    )
    central_angle = 2 * math.atan2(math.sqrt(haversine), math.sqrt(1-haversine))
    return earth_radius_m*central_angle

In [None]:
# Radius, in meters, inside which another business counts as a neighbour (1 mile, about 1.6 km).
NEIGHBOR_RADIUS_M = 1609.34


def _zscore(value, neighbor_values):
    """Return the standard score of `value` relative to `neighbor_values`.

    NaN entries are ignored. When the neighbours have no spread (or there are
    no usable neighbours) the score is 0.0 instead of a division by zero.
    """
    spread = np.nanstd(neighbor_values)
    if spread > 0:
        return (value - np.nanmean(neighbor_values)) / spread
    return 0.0


n_businesses = len(df_unprocessed)

# category_dict stores index labels; translate them to row positions so the
# per-business lookups below are plain array accesses.
position_of = {label: pos for pos, label in enumerate(df_unprocessed.index)}

# Pull the needed columns out once instead of iterating over DataFrame rows
# inside the pairwise loop.
latitude = df_unprocessed['latitude'].to_numpy(dtype=float)
longitude = df_unprocessed['longitude'].to_numpy(dtype=float)
price_tag = df_unprocessed['price_tag'].to_numpy(dtype=float)
review_count = df_unprocessed['review_count'].to_numpy(dtype=float)
reviews_per_week = df_unprocessed['reviews_per_week'].to_numpy(dtype=float)
stars = df_unprocessed['stars'].to_numpy(dtype=float)

business_density = np.full(n_businesses, np.nan)
business_similar_density = np.full(n_businesses, np.nan)
zprice_all = np.full(n_businesses, np.nan)
zreview_count_all = np.full(n_businesses, np.nan)
zreview_per_week_all = np.full(n_businesses, np.nan)
zstar_all = np.full(n_businesses, np.nan)

for pos, categories in enumerate(df_unprocessed['categories']):
    # Which businesses (including this one) lie within the radius.
    # Computed once and shared by the "similar" and "all" quantities.
    within_radius = np.array(
        [
            distance(latitude[pos], latitude[other], longitude[pos], longitude[other])
            <= NEIGHBOR_RADIUS_M
            for other in range(n_businesses)
        ],
        dtype=bool,
    )

    # Nearby businesses sharing a category. A business sharing several
    # categories is counted once per shared category on purpose: it adds
    # weight to very similar restaurants.
    business_similar_density[pos] = sum(
        int(within_radius[position_of[label]])
        for category in categories
        for label in category_dict[category]
    )
    business_density[pos] = within_radius.sum()

    # Price and stars are not divided by the standard deviation because it is
    # often equal to 0; they are scaled by the fixed constants 4 and 5 instead.
    zprice_all[pos] = (price_tag[pos] - np.nanmean(price_tag[within_radius])) / 4.
    zstar_all[pos] = (stars[pos] - np.nanmean(stars[within_radius])) / 5.

    zreview_count_all[pos] = _zscore(review_count[pos], review_count[within_radius])
    # Uses the neighbours' reviews_per_week values and this business's own
    # 'reviews_per_week' column.
    zreview_per_week_all[pos] = _zscore(reviews_per_week[pos], reviews_per_week[within_radius])

df_unprocessed['business_density'] = business_density
df_unprocessed['business_similar_density'] = business_similar_density
df_unprocessed['zprice_all'] = zprice_all
df_unprocessed['zreview_count_all'] = zreview_count_all
df_unprocessed['zreview_per_week_all'] = zreview_per_week_all
df_unprocessed['zstar_all'] = zstar_all

# A business is flagged as a chain when its name appears more than once.
# Missing names map to NaN and therefore compare as False.
occurrences_of_name = df_unprocessed['business_name'].map(
    df_unprocessed['business_name'].value_counts()
)
df_unprocessed['is_chain'] = occurrences_of_name.gt(1)

In [None]:
# Show the column labels again, now that the engineered features have been added
df_unprocessed.columns

In [None]:
# Bind the name df_processed to the feature-enriched frame.
# This is a plain assignment: both names refer to the same DataFrame object, not a copy.
df_processed = df_unprocessed

In [None]:
# Inspect the first row of the processed table
df_processed.head(1)

In [None]:
# List the name and chain flag of every business marked as a chain
chain_rows = df_processed['is_chain'] == True
df_processed.loc[chain_rows, ['business_name', 'is_chain']]

#### Save
Saving the datasets with all the features

In [None]:
# Make sure the output folder exists, then persist the processed table as a pickle
processed_dir = '../datasets/processed_clean'
processed_pickle = '../datasets/processed_clean/business_data_processed.pkl'
os.makedirs(processed_dir, exist_ok=True)
df_processed.to_pickle(processed_pickle)