## Feature Engineering

In this notebook, we generate additional features that will help us come up with a more predictive model.

In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from datetime import datetime
import os

from sklearn.linear_model import LinearRegression

In [2]:
# Read unprocessed data and reviews
df_unprocessed = pd.read_pickle('../datasets/unprocessed_clean/businesses_data_cleaned.pkl')
df_reviews = pd.read_pickle('../datasets/unprocessed_clean/businesses_reviews_cleaned.pkl')

In [3]:
df_unprocessed.head()

Unnamed: 0,business_id,business_name,categories,city,full_address,display_phone,review_count,stars,price_tag,is_claimed,is_closed,image,url,latitude,longitude
0,f3a6c1e4-fd78-4675-82a1-5dfd68c1e1ab,La Pizza Nostra,[Pizza],Tirana County,"Rruga Perlat Rexhepi, Tirana Albania",+355 69 309 9999,15.0,5.0,1.0,True,False,"data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEK...",https://www.tripadvisor.com/Restaurant_Review-...,41.318115,19.817093
1,360fcd61-8ed6-4e98-bd4d-9ab50fa8c290,Cioccolatitaliani Kalaja,[],Tirana County,"Rruga Murat Toptani, Tirana 1001 Albania",,12.0,3.5,4.0,True,False,"data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEK...",https://www.tripadvisor.com/Restaurant_Review-...,41.325882,19.8225
2,f9912145-1a1a-41fe-89c6-243e37825eb6,Baza Bar,[Fast Food],Tirana County,"Rruga Gjin Bue Shpata 10, Tirana 1001, Tirana ...",+355 69 725 4485,0.0,0.0,1.0,False,True,,https://www.tripadvisor.com/Restaurant_Review-...,41.320923,19.811533
3,55a1094a-84fe-42de-b3a2-e5aed8a543ac,Bar Restorant Piceri Colombo,"[Italian, Mediterranean, European, Greek]",Tirana County,"Rr. Reshit Petrela, Ish Stacioni Trenit, 100m ...",+355 69 693 7666,5.0,3.0,1.0,False,False,"data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEK...",https://www.tripadvisor.com/Restaurant_Review-...,41.335743,19.81529
4,021324f7-7875-430f-9026-18ebaece43e0,Casa della Pasta,"[Italian, European, Albanian]",Tirana County,"Rruga Halim Xhelo 11, Tirana 1023 Albania",+355 68 908 3863,3.0,4.0,1.0,True,False,"data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEK...",https://www.tripadvisor.com/Restaurant_Review-...,41.32788,19.806349


In [4]:
df_reviews.head()

Unnamed: 0,user_id,business_id,review_id,review_date,review_title,review_text,rating,votes
0,UID_64C07AC09E188BCE7A081939BFDAD66B-SRC_77836...,43cae344-9533-484c-bc4e-c99095cd2099,778365530,2020-12-03,No title,No text,5.0,0.0
1,UID_0FE0E6284F6C99666368DE11341334C1-SRC_50595...,43cae344-9533-484c-bc4e-c99095cd2099,505951496,2017-07-26,No title,No text,4.0,0.0
2,UID_42249550DB6692C78F9823BBE41F7D29-SRC_49966...,43cae344-9533-484c-bc4e-c99095cd2099,499660609,2017-07-08,No title,No text,3.0,0.0
3,UID_53D877480BD8A0719BC4FEC97040DD59-SRC_79131...,8eeec4a0-4825-40ce-8169-f0a20c2d3410,791318013,2021-06-05,Excellent food at affordable price,The best restaurant in Shengjin. Fresh fish d...,5.0,1.0
4,UID_88FF89326964E5FA9090E8BD9D292620-SRC_15419...,8eeec4a0-4825-40ce-8169-f0a20c2d3410,154191986,2013-03-10,Place for fresh seafood,If you really want to have FRESH seafood this ...,3.0,1.0


In [5]:
df_unprocessed.columns

Index(['business_id', 'business_name', 'categories', 'city', 'full_address',
       'display_phone', 'review_count', 'stars', 'price_tag', 'is_claimed',
       'is_closed', 'image', 'url', 'latitude', 'longitude'],
      dtype='object')

Generating features using the reviews metadata

In [6]:
df_unprocessed['reviews_per_week'] = np.nan
df_unprocessed['oldest_review'] = np.nan
df_unprocessed['std_of_stars'] = np.nan
df_unprocessed['median_of_stars'] = np.nan
df_unprocessed['votes_per_week'] = np.nan
df_unprocessed['stars_linear_coef'] = np.nan

regression = LinearRegression()

oldest_review = df_reviews['review_date'].max()
for i, business in df_unprocessed.iterrows():
    reviews = df_reviews[df_reviews['business_id'] == business['business_id']]
    number_reviews = len(reviews)
    delta_time = 0.0
    weeks_since_older_review = 1.0
    number_votes = 0.0
    data = np.array([])
    regression_coefs = 0.0
    std_stars = 0.0
    meadian_stars = 0.0
    if not reviews.empty:
        #print(min(reviews['review_date']))
        delta_time = (max(reviews['review_date']) - min(reviews['review_date'])).days/7
        
        # proxt for oldness
        weeks_since_older_review = (oldest_review - min(reviews['review_date'])).days/7
        
        # find number of votes
        for j, review in reviews.iterrows():
            number_votes += review['votes']
            
        dates = [d.toordinal() for d in reviews['review_date']] # convert dates to integer
        dates = np.asarray(dates)
        # find regression coefficients of dates and rating
        regression.fit(dates.reshape(-1, 1), reviews['rating'].values.reshape(-1, 1))
        regression_coefs = regression.coef_[0][0]
    
        # get std and median of stars
        std_stars = np.nanstd(reviews['rating'])
        median_stars = np.nanmedian(reviews['rating'])
        
    
    # Apply features to our dataframe
    df_unprocessed.loc[i, 'reviews_per_week'] = number_reviews/ weeks_since_older_review
    df_unprocessed.loc[i, 'oldest_review'] = 0 if reviews.empty else weeks_since_older_review
    df_unprocessed.loc[i, 'std_of_stars'] = std_stars
    df_unprocessed.loc[i, 'median_of_stars'] = median_stars
    df_unprocessed.loc[i, 'votes_per_week'] = number_votes / weeks_since_older_review
    df_unprocessed.loc[i, 'stars_linear_coef'] = regression_coefs

In [7]:
df_unprocessed.head(1)

Unnamed: 0,business_id,business_name,categories,city,full_address,display_phone,review_count,stars,price_tag,is_claimed,...,image,url,latitude,longitude,reviews_per_week,oldest_review,std_of_stars,median_of_stars,votes_per_week,stars_linear_coef
0,f3a6c1e4-fd78-4675-82a1-5dfd68c1e1ab,La Pizza Nostra,[Pizza],Tirana County,"Rruga Perlat Rexhepi, Tirana Albania",+355 69 309 9999,15.0,5.0,1.0,True,...,"data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEK...",https://www.tripadvisor.com/Restaurant_Review-...,41.318115,19.817093,0.180516,49.857143,0.0,5.0,0.020057,-0.0


Now we will make a dictionary of restaurant categories that will help us calculate relative quantities. The keys are the restaurant categories and the values are the indices of the restaurants that belong in each category.

In [8]:
category_dict = {}
for i, business in df_unprocessed.iterrows():
    for category in business['categories']:
        try:
            category_dict[category] += [i]
        except:
            category_dict[category] = [i]

Here, we will calculate restaurant density within 1.6 km radius and other relative quantities. The z* quantities are relative to the equivalent quantities of the surrounding restaurants.

In [9]:
def distance(ilat, jlat, ilong, jlong):
    R = 6371.e3 # earth radius
    phi1 = math.radians(ilat)
    phi2 = math.radians(jlat)
    delta_phi = math.radians(jlat-ilat)
    delta_lambda = math.radians(jlong-ilong)
    a = math.sin(delta_phi/2)**2 + math.cos(phi1)*math.cos(phi2) * math.sin(delta_lambda/2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    d = R*c # meters
    return d

In [10]:
df_unprocessed['business_density'] = np.nan
df_unprocessed['business_similar_density'] = np.nan
df_unprocessed['zprice_all'] = np.nan
df_unprocessed['zreview_count_all'] = np.nan
df_unprocessed['zreview_per_week_all'] = np.nan
df_unprocessed['zstar_all'] = np.nan
df_unprocessed['is_chain'] = np.nan

for i, business in df_unprocessed.iterrows():
    price_all = []
    review_count_all = []
    review_per_week_all = []
    star_all = []
    density_similar = []
    density_all = []
    
    ilong = business['longitude']
    ilat = business['latitude']
    
    for category in business['categories']:
        for business_category in category_dict[category]:
            jlong = df_unprocessed.loc[business_category]['longitude']
            jlat = df_unprocessed.loc[business_category]['latitude']
            dist = distance(ilat,jlat,ilong,jlong)
            # if less than 1600 meters or 1.6 km (include self)
            # certain restaurants will be double counted if they belong to the same categories but
            # that's ok as it adds a weight for very similar restaurants
            if dist <= 1609.34: # meters
                density_similar += [business_category]
    
    # Loop over all resturants to calculate the "all" properties
    for j, business_j in df_unprocessed.iterrows():
        jlong = business_j['longitude']
        jlat = business_j['latitude']
        dist = distance(ilat, jlat, ilong, jlong)
        
        if dist <= 1609.34:
            price_all += [business_j['price_tag']]
            review_count_all += [business_j['review_count']]
            review_per_week_all += [business_j['reviews_per_week']]
            star_all += [business_j['stars']]
            density_all += [j]
            
    # price and stars are not divided by standard deviation because it is often equal to 0
    df_unprocessed.loc[i,'zprice_all'] = (business['price_tag']-np.nanmean(price_all))/4.
    df_unprocessed.loc[i,'zreview_count_all'] = (business['review_count']-np.nanmean(review_count_all))/np.nanstd(review_count_all)
    df_unprocessed.loc[i,'zreview_per_week_all'] = (business['reviews_per_week']-np.nanmean(review_per_week_all))/np.nanstd(review_per_week_all)
    df_unprocessed.loc[i,'zstar_all'] = (business['stars']-np.nanmean(star_all))/5.
        
    df_unprocessed.loc[i,'business_density'] = len(density_all)
    df_unprocessed.loc[i, 'business_similar_density'] = len(density_similar)
    # True if there are more than one
    df_unprocessed.loc[i,'is_chain'] = (len(df_unprocessed[df_unprocessed['business_name'] == business['business_name']]) > 1)    

  df_unprocessed.loc[i,'zreview_count_all'] = (business['review_count']-np.nanmean(review_count_all))/np.nanstd(review_count_all)
  df_unprocessed.loc[i,'zreview_per_week_all'] = (business['reviews_per_week']-np.nanmean(review_per_week_all))/np.nanstd(review_per_week_all)


In [11]:
df_unprocessed.columns

Index(['business_id', 'business_name', 'categories', 'city', 'full_address',
       'display_phone', 'review_count', 'stars', 'price_tag', 'is_claimed',
       'is_closed', 'image', 'url', 'latitude', 'longitude',
       'reviews_per_week', 'oldest_review', 'std_of_stars', 'median_of_stars',
       'votes_per_week', 'stars_linear_coef', 'business_density',
       'business_similar_density', 'zprice_all', 'zreview_count_all',
       'zreview_per_week_all', 'zstar_all', 'is_chain'],
      dtype='object')

In [12]:
df_processed = df_unprocessed

In [13]:
df_processed[df_processed['is_chain'] == True][['business_name', 'is_chain']]

Unnamed: 0,business_name,is_chain
4,Casa della Pasta,True
20,ANDİ BAR,True
27,Kodrinat,True
33,Pizzeria Roma,True
43,Monet Restaurant,True
...,...,...
1504,Family,True
1521,Bella Vista,True
1545,Island Restaurant & Pizza,True
1572,Island Restaurant & Pizza,True


#### Save
Saving the datasets with all the features

In [14]:
os.makedirs('../datasets/processed_clean', exist_ok=True)
df_processed.to_pickle('../datasets/processed_clean/business_data_processed.pkl')