In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [6]:
from cluster_fns import def_cluster
from ipynb.fs.defs.census_and_restaurant_data_withEDA import adjoin_census

In [7]:
rd_train = pd.read_csv("/Users/dominiquekemp/Documents/GitHub/'Will It Restaurant?'/data_files/train_data_with_census.csv")
boroughs = np.sort(rd_train.borough.unique())

#adjoin census data
census_data = pd.read_csv("../'Will It Restaurant?'/data_files/census_data.csv")
rd_train = adjoin_census(rd_train, census_data)
#adjoin cuisine specificity
rd_train = pd.concat([rd_train, pd.get_dummies(rd_train.type)], axis = 1)

#load test data set and add cuisine specificity
rd_test = pd.read_csv("../'Will It Restaurant?'/data_files/test_data.csv")
rd_test = pd.concat([rd_test, pd.get_dummies(rd_test.type)], axis = 1)

#ensure that cuisine columns match for training and test set
miss_cuis = pd.DataFrame({cuis: len(rd_train)*[False] for cuis in rd_test.type.unique() if cuis not in rd_train.type.unique()})
rd_train = pd.concat([rd_train, miss_cuis], axis = 1)

miss_cuis = pd.DataFrame({cuis: len(rd_test)*[False] for cuis in rd_train.type.unique() if cuis not in rd_test.type.unique()})
rd_test = pd.concat([rd_test, miss_cuis], axis = 1)


In [8]:
#modify 'range' column for rd_test appropriately
rd_test['range'] = rd_test.range.fillna(0)
rd_test['range'] = rd_test.range.astype(int).astype(str)


In [9]:
rd_test.range.unique()

array(['1', '2', '3', '0', '4'], dtype=object)

In [10]:
rd_train.range.unique()

array(['2', '1', '3', '0', '2 stars', '3 stars', '4'], dtype=object)

In [11]:
rd_train

Unnamed: 0,name,site,subtypes,category,type,phone,full_address,borough,street,postal_code,...,Rice restaurant,Family restaurant,Jewish restaurant,Fusion restaurant,Tunisian restaurant,Indian takeaway,Taiwanese restaurant,Argentinian restaurant,Juice shop,Laotian restaurant
0,The Legends of Seafood and Soul Food,True,"Seafood restaurant, Soul food restaurant",restaurants,Seafood restaurant,True,"5352 Woodland Ave, Philadelphia, PA 19143",Kingsessing,5352 Woodland Ave,19143.0,...,False,False,False,False,False,False,False,False,False,False
1,Fette Sau,True,"Restaurant, American restaurant, Barbecue rest...",restaurants,Restaurant,True,"1208 Frankford Ave, Philadelphia, PA 19125",Fishtown,1208 Frankford Ave,19125.0,...,False,False,False,False,False,False,False,False,False,False
2,Cafe Liz,False,"Portuguese restaurant, Bar",restaurants,Portuguese restaurant,True,"5437 N Lawrence St, Philadelphia, PA 19120",Olney-Oak Lane,5437 N Lawrence St,19120.0,...,False,False,False,False,False,False,False,False,False,False
3,Plaza Pizza and Grill,True,"Pizza restaurant, Breakfast restaurant, Cheese...",restaurants,Pizza restaurant,True,"1614 Cecil B. Moore Ave, Philadelphia, PA 19121",North Philadelphia,1614 Cecil B. Moore Ave,19121.0,...,False,False,False,False,False,False,False,False,False,False
4,Little Fish BYOB,True,Seafood restaurant,restaurants,Seafood restaurant,True,"746 S 6th St, Philadelphia, PA 19147",Bella Vista,746 S 6th St,19147.0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2383,China House,False,"Chinese restaurant, Delivery Restaurant",restaurants,Chinese restaurant,True,"2015, 7141 Ogontz Ave, Philadelphia, PA 19138",Olney-Oak Lane,"2015, 7141 Ogontz Ave",19138.0,...,False,False,False,False,False,False,False,False,False,False
2384,Sabrina's Cafe,True,"Restaurant, Breakfast restaurant, Brunch resta...",restaurants,Restaurant,True,"1804 Callowhill St, Philadelphia, PA 19130",Center City,1804 Callowhill St,19130.0,...,False,False,False,False,False,False,False,False,False,False
2385,Dagwood's Pub,True,Bar & grill,restaurants,Bar & grill,True,"4625 Linden Ave, Philadelphia, PA 19114",Northeast Philadelphia,4625 Linden Ave,19114.0,...,False,False,False,False,False,False,False,False,False,False
2386,Malooga,True,"Middle Eastern restaurant, Falafel restaurant,...",restaurants,Middle Eastern restaurant,True,"134 Chestnut St, Philadelphia, PA 19106",Center City East,134 Chestnut St,19106.0,...,False,False,False,False,False,False,False,False,False,False


In [12]:
rd_test

Unnamed: 0,name,site,subtypes,category,type,phone,full_address,borough,street,postal_code,...,Amusement center,Ecuadorian restaurant,Traditional American restaurant,Fried chicken takeaway,Grill,Cuban restaurant,Mobile caterer,Event venue,Southeast Asian restaurant,Bubble tea store
0,Wawa,True,"Fast food restaurant, Breakfast restaurant, Ca...",restaurants,Fast food restaurant,True,"6858 Rising Sun Ave, Philadelphia, PA 19111",Northeast Philadelphia,6858 Rising Sun Ave,19111.0,...,False,False,False,False,False,False,False,False,False,False
1,Otto's Taproom & Grille,True,"Restaurant, Bar",restaurants,Restaurant,True,"1216 N 29th St, Philadelphia, PA 19121",North Philadelphia,1216 N 29th St,19121.0,...,False,False,False,False,False,False,False,False,False,False
2,Akira Ramen & Hibachi,True,Japanese restaurant,restaurants,Japanese restaurant,True,"7628 Castor Ave, Philadelphia, PA 19152",Northeast Philadelphia,7628 Castor Ave,19152.0,...,False,False,False,False,False,False,False,False,False,False
3,Standard Tap,True,"Restaurant, Cocktail bar, New American restaur...",bars,Restaurant,True,"901 N 2nd St, Philadelphia, PA 19123",Northern Liberties,901 N 2nd St,19123.0,...,False,False,False,False,False,False,False,False,False,False
4,The Sidecar Bar & Grille,True,"Bar & grill, American restaurant, Bar, Gastrop...",restaurants,Bar & grill,True,"2201 Christian St, Philadelphia, PA 19146",Southwest Center City,2201 Christian St,19146.0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
592,NAM Vietnamese Kitchen,True,Vietnamese restaurant,restaurants,Vietnamese restaurant,True,"3816 Chestnut St Unit 8 Unit 8, Philadelphia, ...",West Philadelphia,3816 Chestnut St Unit 8 Unit 8,19104.0,...,False,False,False,False,False,False,False,False,False,False
593,All Star Pizza,True,"Pizza restaurant, Delivery Restaurant",restaurants,Pizza restaurant,True,"2345 Street Rd, Bensalem, PA 19020",Bensalem,2345 Street Rd,19020.0,...,False,False,False,False,False,False,False,False,False,False
594,Bill's Breakfast & Lunch,True,"Restaurant, Breakfast restaurant, Brunch resta...",restaurants,Restaurant,True,"1312 Sansom St, Philadelphia, PA 19107",Center City East,1312 Sansom St,19107.0,...,False,False,False,False,False,False,False,False,False,False
595,Krush Lounge,True,Bar & grill,restaurants,Bar & grill,True,"6414 Rising Sun Ave, Philadelphia, PA 19111",Northeast Philadelphia,6414 Rising Sun Ave,19111.0,...,False,False,False,False,False,False,False,False,False,False


## Arrangement of Initial Model-Cluster Pairings

In [13]:
all_models = pd.read_csv("../'Will It Restaurant?'/initial_modeling/model_comparison(all_rests).csv")

In [14]:
all_models.rename(columns = {'Best k': 'Best k (geospatial, interact., cuis)', 'Best k.1': 'Best k (geospatial)', 'Best k.2': 'Best k (geospatial, bor)', 'Min. RMSE': 'CV Avg. RMSE.2'}, inplace = True)
all_models.index = boroughs
all_models.drop(labels = ['Northeast Philadelphia', 'West Philadelphia'], axis = 0, inplace = True)
all_models

Unnamed: 0,"Best k (geospatial, interact., cuis)",CV Avg. RMSE,Best k (geospatial),CV Avg. RMSE.1,"Best k (geospatial, bor)",CV Avg. RMSE.2,Number of Restaurants
Allegheny West,5,0.272305,48,0.345464,8,0.360631,22
Bella Vista,5,0.265526,23,0.190454,6,0.208949,33
Bensalem,4,0.467987,15,0.476567,4,0.705114,6
Center City,5,0.351571,18,0.327539,11,0.322179,128
Center City East,5,0.347562,47,0.339561,10,0.336999,212
Chestnut Hill,2,0.271802,38,0.301278,1,0.276455,22
East Germantown,5,0.409866,10,0.497392,20,0.534539,26
East Kensington,5,0.228961,3,0.189985,8,0.200562,10
East Passyunk Crossing,5,0.401182,7,0.304203,4,0.268043,26
Elmwood Park,2,0.387735,31,0.375245,26,0.372289,35


In [15]:
#an algorithm that selects the best performing k-neighbors model among k < 21 from above or model with smallest k 
models = {}
i=0
for bor in all_models.index:
    k_s = all_models.iloc[i, [0,2,4]].values
    k_s = np.array([[k_s[0], 0], [k_s[1], 2], [k_s[2], 4]])
    k_val = k_s[:, 0]
    k_val = k_val[k_val != 0]
    #pick smallest error among k's less than 21
    if np.any(k_val <=20):
        k_s = [k for k in k_s if (k[0] <=20) & (k[0] > 0)]
        rm_ind = np.array([k[1] for k in k_s]) + 1
        drow = all_models.iloc[i, rm_ind].values
        min = np.argmin(drow)
        loc = int(rm_ind[min] - 1)
        k = all_models.iat[i, loc]


        models[bor] = [all_models.columns[loc], 'k= ' + str(k)]
    #pick smallest k if all k's exceed 20
    else:
        min = np.argmin(k_val)
        loc = int(k_s[min][1])
        k = k_val[min]

        models[bor] = [all_models.columns[loc], 'k= ' + str(k)]
        
    i +=1

models


{'Allegheny West': ['Best k (geospatial, interact., cuis)', 'k= 5'],
 'Bella Vista': ['Best k (geospatial, bor)', 'k= 6'],
 'Bensalem': ['Best k (geospatial, interact., cuis)', 'k= 4'],
 'Center City': ['Best k (geospatial, bor)', 'k= 11'],
 'Center City East': ['Best k (geospatial, bor)', 'k= 10'],
 'Chestnut Hill': ['Best k (geospatial, interact., cuis)', 'k= 2'],
 'East Germantown': ['Best k (geospatial, interact., cuis)', 'k= 5'],
 'East Kensington': ['Best k (geospatial)', 'k= 3'],
 'East Passyunk Crossing': ['Best k (geospatial, bor)', 'k= 4'],
 'Elmwood Park': ['Best k (geospatial, interact., cuis)', 'k= 2'],
 'Essington': ['Best k (geospatial, bor)', 'k= 3'],
 'Fairhill': ['Best k (geospatial, interact., cuis)', 'k= 4'],
 'Fairmount': ['Best k (geospatial, interact., cuis)', 'k= 5'],
 'Feltonville': ['Best k (geospatial, interact., cuis)', 'k= 5'],
 'Fishtown': ['Best k (geospatial, bor)', 'k= 10'],
 'Fox Chase': ['Best k (geospatial, interact., cuis)', 'k= 5'],
 'Frankford': [

## Arrangement of Advanced Model-Cluster Pairings

In [None]:
#read in the cluster dataframe
from ast import literal_eval

def safest_literal_eval(x):
    #replace problematic substrings with acceptable strings arising from outside parameters of data
    if pd.isna(x):
        return x
    else:
        x = x.replace("np.float64", "")
        x = x.replace("(", "")
        x = x.replace(")", "")
        x = x.replace("-inf", "-123456789010") #to handle infinite endpoints, we assign a dummy value lying outside range of data
        x = x.replace("inf", '123456789010')
        #method for rendering list and number form of such styled as strings
        x = literal_eval(x)
        for j in range(len(x)):
            if x[j][0] == -123456789010:
                x[j][0] = -np.inf
            if x[j][1] == 123456789010:
                x[j][1] = np.inf

    

        return x
    
def safe_literal_eval(x):
    if pd.isna(x):
        return x
    else: 
        return literal_eval(x)
    

cluster_df = pd.read_csv("../'Will It Restaurant?'/cluster_modeling/clusters.csv")
cluster_df['origin'] = cluster_df['origin'].apply(literal_eval)
cluster_df['good_model'] = cluster_df['good_model'].apply(literal_eval)
cluster_df['bad_model'] = cluster_df['bad_model'].apply(safe_literal_eval)
cluster_df['class_features'] = cluster_df['class_features'].apply(safe_literal_eval)

cluster_df['0'] = cluster_df['0'].apply(safest_literal_eval)
cluster_df['1'] = cluster_df['1'].apply(safest_literal_eval)
cluster_df['2'] = cluster_df['2'].apply(safest_literal_eval)
cluster_df['3'] = cluster_df['3'].apply(safest_literal_eval)
cluster_df['4'] = cluster_df['4'].apply(safest_literal_eval)
cluster_df['5'] = cluster_df['5'].apply(safest_literal_eval)

In [17]:
cluster_df

Unnamed: 0,origin,good_model,bad_model,class_features,0,1,2,3,4,5
0,"(West Philadelphia, 1)","(8-neighbors (gen.), [reviews, longitude, lati...","(Linear Regression, [latitude, longitude, revi...","[latitude, longitude, photos_count, reviews, M...","[[-inf, inf], [-inf, -75.21855545043945], [-in...","[[-inf, 39.95686149597168], [-75.2185554504394...","[[-inf, 39.95686149597168], [-75.2185554504394...","[[-inf, 39.95686149597168], [-75.2185554504394...","[[39.95686149597168, inf], [-75.21855545043945...","[[-inf, inf], [-75.19183731079102, inf], [-inf..."
1,"(West Philadelphia, 2)","(17-neighbors (bor.), [reviews, Foreign Born I...","(Linear Regression, [reviews, latitude, longit...","[latitude, longitude, photos_count, reviews, M...","[[-inf, inf], [-inf, inf], [-inf, inf], [-inf,...","[[-inf, inf], [-inf, -75.25553512573242], [-in...","[[-inf, 39.94808769226074], [-75.2555351257324...","[[39.94808769226074, inf], [-75.25553512573242...",,
2,"(West Philadelphia, 0)","(Random Forest 5 (bor.), [latitude, longitude,...",,,,,,,,
3,"(Northeast Philadelphia, 1)","(6-neighbors (gen.), [latitude, longitude, rev...","(5-neighbors, [Foreign Born Immigrant %, revie...","[latitude, longitude, reviews, photos_count, M...","[[-inf, inf], [-inf, -75.03024673461914], [-in...","[[-inf, inf], [-inf, -75.03024673461914], [238...","[[-inf, inf], [-75.03024673461914, inf], [-inf...","[[-inf, inf], [-inf, inf], [-inf, inf], [-inf,...","[[-inf, inf], [-inf, -75.06171417236328], [-in...","[[40.040489196777344, inf], [-inf, inf], [-inf..."
4,"(Northeast Philadelphia, 2-3)","(14-neighbors (bor.), [latitude, longitude, re...",,,,,,,,
5,"(Northeast Philadelphia, 0)","(5-neighbors (bor.), [latitude, longitude, rev...",,,,,,,,


## Protocol for Application of Models and Clustering

### Initial Clusters

In [18]:
#organize collection of feature subsets
cuis = rd_train.type.unique().tolist()
feats_gic = ['latitude', 'longitude', 'reviews', 'photos_count'] + cuis
feats_g = feats_gic[:2]

In [19]:
#instantiate the models to be used
def knn_pipe_var(k):
    knn_pipe = Pipeline([('scale', StandardScaler()), ('knn', KNeighborsRegressor(k))])
    return knn_pipe

mlr = LinearRegression()

def rf_var(feats):
    rf = RandomForestRegressor(max_depth = 5, random_state= 50492, max_features = len(feats))
    return rf


In [20]:
models

{'Allegheny West': ['Best k (geospatial, interact., cuis)', 'k= 5'],
 'Bella Vista': ['Best k (geospatial, bor)', 'k= 6'],
 'Bensalem': ['Best k (geospatial, interact., cuis)', 'k= 4'],
 'Center City': ['Best k (geospatial, bor)', 'k= 11'],
 'Center City East': ['Best k (geospatial, bor)', 'k= 10'],
 'Chestnut Hill': ['Best k (geospatial, interact., cuis)', 'k= 2'],
 'East Germantown': ['Best k (geospatial, interact., cuis)', 'k= 5'],
 'East Kensington': ['Best k (geospatial)', 'k= 3'],
 'East Passyunk Crossing': ['Best k (geospatial, bor)', 'k= 4'],
 'Elmwood Park': ['Best k (geospatial, interact., cuis)', 'k= 2'],
 'Essington': ['Best k (geospatial, bor)', 'k= 3'],
 'Fairhill': ['Best k (geospatial, interact., cuis)', 'k= 4'],
 'Fairmount': ['Best k (geospatial, interact., cuis)', 'k= 5'],
 'Feltonville': ['Best k (geospatial, interact., cuis)', 'k= 5'],
 'Fishtown': ['Best k (geospatial, bor)', 'k= 10'],
 'Fox Chase': ['Best k (geospatial, interact., cuis)', 'k= 5'],
 'Frankford': [

In [21]:
#for later convenience, we write the application of borough-specific regression as a class

class K_Bor():
    """A class for applying borough-restricted k-neighbors regression."""

    def __init__(self, k):
       
        self.bor = None
        self.k = k
        self.model = knn_pipe_var(k)
    "An 'amb_data' dataset with a 'borough' column and a 'rating' column must be used"
    def fit(self, data, bor, feats):
        
        self.bor = bor
        data = data[data.borough == bor]
        
        self.model.fit(data[feats], data.rating)
        
            
    def predict(self, data, bor, feats):
        data = data[data.borough == bor]
        return self.model.predict(data[feats])
    

        
        

In [22]:
#map each model description to the model
def give_model(list_str):
    mod_descr = list_str[0]
    k_str = list_str[1]
    k = int(float(k_str.replace("k= ", "")))
    knn_pipe = knn_pipe_var(k)
    if mod_descr == 'Best k (geospatial, interact., cuis)':
         return knn_pipe, feats_gic
    if mod_descr == 'Best k (geospatial, bor)':
         return K_Bor(k), feats_g
    if mod_descr == 'Best k (geospatial)':
         return knn_pipe, feats_g
    
models = {key: give_model(value) for key, value in models.items()}

### Advanced Clusters

In [23]:
cluster_df

Unnamed: 0,origin,good_model,bad_model,class_features,0,1,2,3,4,5
0,"(West Philadelphia, 1)","(8-neighbors (gen.), [reviews, longitude, lati...","(Linear Regression, [latitude, longitude, revi...","[latitude, longitude, photos_count, reviews, M...","[[-inf, inf], [-inf, -75.21855545043945], [-in...","[[-inf, 39.95686149597168], [-75.2185554504394...","[[-inf, 39.95686149597168], [-75.2185554504394...","[[-inf, 39.95686149597168], [-75.2185554504394...","[[39.95686149597168, inf], [-75.21855545043945...","[[-inf, inf], [-75.19183731079102, inf], [-inf..."
1,"(West Philadelphia, 2)","(17-neighbors (bor.), [reviews, Foreign Born I...","(Linear Regression, [reviews, latitude, longit...","[latitude, longitude, photos_count, reviews, M...","[[-inf, inf], [-inf, inf], [-inf, inf], [-inf,...","[[-inf, inf], [-inf, -75.25553512573242], [-in...","[[-inf, 39.94808769226074], [-75.2555351257324...","[[39.94808769226074, inf], [-75.25553512573242...",,
2,"(West Philadelphia, 0)","(Random Forest 5 (bor.), [latitude, longitude,...",,,,,,,,
3,"(Northeast Philadelphia, 1)","(6-neighbors (gen.), [latitude, longitude, rev...","(5-neighbors, [Foreign Born Immigrant %, revie...","[latitude, longitude, reviews, photos_count, M...","[[-inf, inf], [-inf, -75.03024673461914], [-in...","[[-inf, inf], [-inf, -75.03024673461914], [238...","[[-inf, inf], [-75.03024673461914, inf], [-inf...","[[-inf, inf], [-inf, inf], [-inf, inf], [-inf,...","[[-inf, inf], [-inf, -75.06171417236328], [-in...","[[40.040489196777344, inf], [-inf, inf], [-inf..."
4,"(Northeast Philadelphia, 2-3)","(14-neighbors (bor.), [latitude, longitude, re...",,,,,,,,
5,"(Northeast Philadelphia, 0)","(5-neighbors (bor.), [latitude, longitude, rev...",,,,,,,,


We will go one by one through 'cluster_df' when training and predicting these models in the next section. We do however here need to write the methods for producing the clusters within the training and test sets.

In [24]:
def prod_clusters(data, subsets_rowdf, clus_feats):
    subsets = {i: subsets_rowdf.iloc[i] for i in range(len(subsets_rowdf))}
    bad = def_cluster(data, subsets, clus_feats)
    index_to_remove = set(bad.index.tolist())
    index = set(data.index.tolist())
    index = index.difference(index_to_remove)
    index = [*index]
    good = data.loc[index]
    return good, bad

## Full Training and Testing of Models

In [25]:
models

{'Allegheny West': (Pipeline(steps=[('scale', StandardScaler()), ('knn', KNeighborsRegressor())]),
  ['latitude',
   'longitude',
   'reviews',
   'photos_count',
   'Seafood restaurant',
   'Restaurant',
   'Portuguese restaurant',
   'Pizza restaurant',
   'Chinese restaurant',
   'American restaurant',
   'Barbecue restaurant',
   'Chicken restaurant',
   'Indian restaurant',
   'New American restaurant',
   'French restaurant',
   'Bar & grill',
   'Indonesian restaurant',
   'Kosher restaurant',
   'Mexican restaurant',
   'Diner',
   'Asian restaurant',
   'Bakery',
   'Coffee shop',
   'Chicken wings restaurant',
   'Entertainment agency',
   'Jamaican restaurant',
   'Sandwich shop',
   'Shabu-shabu restaurant',
   'Brunch restaurant',
   'Korean restaurant',
   'Thai restaurant',
   'Caribbean restaurant',
   'Latin American restaurant',
   'Fast food restaurant',
   'Italian restaurant',
   'Breakfast restaurant',
   'Creperie',
   'Southern restaurant (US)',
   'Salvadoran r

In [26]:
cluster_df

Unnamed: 0,origin,good_model,bad_model,class_features,0,1,2,3,4,5
0,"(West Philadelphia, 1)","(8-neighbors (gen.), [reviews, longitude, lati...","(Linear Regression, [latitude, longitude, revi...","[latitude, longitude, photos_count, reviews, M...","[[-inf, inf], [-inf, -75.21855545043945], [-in...","[[-inf, 39.95686149597168], [-75.2185554504394...","[[-inf, 39.95686149597168], [-75.2185554504394...","[[-inf, 39.95686149597168], [-75.2185554504394...","[[39.95686149597168, inf], [-75.21855545043945...","[[-inf, inf], [-75.19183731079102, inf], [-inf..."
1,"(West Philadelphia, 2)","(17-neighbors (bor.), [reviews, Foreign Born I...","(Linear Regression, [reviews, latitude, longit...","[latitude, longitude, photos_count, reviews, M...","[[-inf, inf], [-inf, inf], [-inf, inf], [-inf,...","[[-inf, inf], [-inf, -75.25553512573242], [-in...","[[-inf, 39.94808769226074], [-75.2555351257324...","[[39.94808769226074, inf], [-75.25553512573242...",,
2,"(West Philadelphia, 0)","(Random Forest 5 (bor.), [latitude, longitude,...",,,,,,,,
3,"(Northeast Philadelphia, 1)","(6-neighbors (gen.), [latitude, longitude, rev...","(5-neighbors, [Foreign Born Immigrant %, revie...","[latitude, longitude, reviews, photos_count, M...","[[-inf, inf], [-inf, -75.03024673461914], [-in...","[[-inf, inf], [-inf, -75.03024673461914], [238...","[[-inf, inf], [-75.03024673461914, inf], [-inf...","[[-inf, inf], [-inf, inf], [-inf, inf], [-inf,...","[[-inf, inf], [-inf, -75.06171417236328], [-in...","[[40.040489196777344, inf], [-inf, inf], [-inf..."
4,"(Northeast Philadelphia, 2-3)","(14-neighbors (bor.), [latitude, longitude, re...",,,,,,,,
5,"(Northeast Philadelphia, 0)","(5-neighbors (bor.), [latitude, longitude, rev...",,,,,,,,


In [27]:
#fit models to training set
for bor in models.keys():
    
    model = models[bor][0]
    feats = models[bor][1]
    if isinstance(model, K_Bor):
        model.fit(rd_train, bor, feats)
    else:
        model.fit(rd_train[feats], rd_train.rating)


    #training on west philly
    wp = rd_train[rd_train.borough == 'West Philadelphia']
    subset_row = cluster_df.iloc[0, -6:]
    clust_feats = cluster_df.loc[0, 'class_features']
    good_feats = cluster_df.loc[0, 'good_model'][1]
    bad_feats = cluster_df.loc[0, 'bad_model'][1]
    wp_one_good, wp_one_bad = prod_clusters(wp[wp.range == '1'], subset_row, clust_feats)
    wp_one_good = pd.concat([wp_one_good, wp.loc[wp.range == '3']], axis = 0)
    knn_wp_one = KNeighborsRegressor(8)
    mlr_wp_one = LinearRegression()
    knn_wp_one.fit(rd_train[good_feats], rd_train.rating)
    mlr_wp_one.fit(wp_one_bad[bad_feats], wp_one_bad.rating)

    subset_row = cluster_df.iloc[1, -6:-2]
    clust_feats = cluster_df.loc[1, 'class_features']
    good_feats = cluster_df.loc[1, 'good_model'][1]
    bad_feats = cluster_df.loc[1, 'bad_model'][1]
    wp_two_good, wp_two_bad = prod_clusters(wp[wp.range == '2'], subset_row, clust_feats)
    knn_wp_two = KNeighborsRegressor(17)
    mlr_wp_two = LinearRegression()
    knn_wp_two.fit(wp[wp.range == '2'][good_feats], wp[wp.range == '2'].rating)
    mlr_wp_two.fit(wp_two_bad[bad_feats], wp_two_bad.rating)

    wp_zer = wp[wp.range == '0']
    feats = cluster_df.loc[2, 'good_model'][1]
    rf = rf_var(feats)
    rf.fit(wp_zer[feats], wp_zer.rating)

    #training on northeast philly
    ne = rd_train[rd_train.borough == 'Northeast Philadelphia']
    subset_row = cluster_df.iloc[3, -6:]
    clust_feats = cluster_df.loc[3, 'class_features']
    good_feats = cluster_df.loc[3, 'good_model'][1]
    bad_feats = cluster_df.loc[3, 'bad_model'][1]
    ne_one_good, ne_one_bad = prod_clusters(ne[ne.range == '1'], subset_row, clust_feats)
    knn_ne_one_g = KNeighborsRegressor(6)
    knn_ne_one_b = KNeighborsRegressor(5)
    knn_ne_one_g.fit(rd_train[good_feats], rd_train.rating)
    knn_ne_one_b.fit(ne_one_bad[bad_feats], ne_one_bad.rating)

    ne_two = rd_train[(rd_train.borough == 'Northeast Philadelphia') & ((rd_train.range == '2') | (rd_train.range == "3"))]
    ne_zer = rd_train[(rd_train.borough == 'Northeast Philadelphia') & (rd_train.range == '0')]
    knn_ne_two = KNeighborsRegressor(14)
    knn_ne_zer = KNeighborsRegressor(5)
    two_feats = cluster_df.loc[4, 'good_model'][1]
    zer_feats = cluster_df.loc[5, 'good_model'][1]
    knn_ne_two.fit(ne_two[two_feats], ne_two.rating)
    knn_ne_zer.fit(ne_zer[zer_feats], ne_zer.rating)




In [28]:
#test whether rd_test contains any price levels outside of what's considered for NE and W Philly
spec_bors = rd_test[(rd_test.borough == 'Northeast Philadelphia') | (rd_test.borough == 'West Philadelphia')]

spec_bors.range.unique()

array(['1', '2', '0', '3'], dtype=object)

In [29]:
#obtain the predictions on the test set
preds = np.array([])
ratings = np.array([])
bor_rmses = {}
boroughs = rd_test.borough.unique()
for bor in boroughs:
    if bor not in ['Northeast Philadelphia', 'West Philadelphia']: 
        model = models[bor][0]
        feats = models[bor][1]
        bor_df = rd_test[rd_test.borough == bor]
        ratings = np.concatenate((ratings, bor_df.rating))
        if isinstance(model, K_Bor):
            bor_preds = model.predict(rd_test, bor, feats)
            preds = np.concatenate((preds, bor_preds))
            bor_rmses[bor] = root_mean_squared_error(bor_preds, bor_df.rating)
        else:
            bor_preds= model.predict(bor_df[feats])
            preds = np.concatenate((preds, bor_preds))
            bor_rmses[bor] = root_mean_squared_error(bor_preds, bor_df.rating)
    
    #prediction on west philly
    wp = rd_test[rd_test.borough == 'West Philadelphia']
    subset_row = cluster_df.iloc[0, -6:]
    clust_feats = cluster_df.loc[0, 'class_features']
    good_feats = cluster_df.loc[0, 'good_model'][1]
    bad_feats = cluster_df.loc[0, 'bad_model'][1]
    wp_one_good, wp_one_bad = prod_clusters(wp[wp.range == '1'], subset_row, clust_feats)
    wp_one_good = pd.concat([wp_one_good, wp.loc[wp.range == '3']], axis = 0)
   
    wp_preds = knn_wp_one.predict(wp_one_good[good_feats])
    lin_preds = mlr_wp_one.predict(wp_one_bad[bad_feats])
    for j in range(len(lin_preds)):
        if lin_preds[j] > 5.0:
            lin_preds[j] = 5.0
    wp_preds = np.concatenate((wp_preds, lin_preds))
    
    subset_row = cluster_df.iloc[1, -6:-2]
    clust_feats = cluster_df.loc[1, 'class_features']
    good_feats = cluster_df.loc[1, 'good_model'][1]
    bad_feats = cluster_df.loc[1, 'bad_model'][1]
    wp_two_good, wp_two_bad = prod_clusters(wp[wp.range == '2'], subset_row, clust_feats)

    wp_preds = np.concatenate((wp_preds, knn_wp_two.predict(wp_two_good[good_feats])))
    lin_preds = mlr_wp_two.predict(wp_two_bad[bad_feats])
    for j in range(len(lin_preds)): #we ensure that the linear model only returns outputs within the allowed range
        if lin_preds[j] > 5.0:
            lin_preds[j] = 5.0
    wp_preds = np.concatenate((wp_preds, lin_preds))

    wp_zer = wp[wp.range == '0']
    feats = cluster_df.loc[2, 'good_model'][1]
    wp_preds = np.concatenate((wp_preds, rf.predict(wp_zer[feats])))
    preds = np.concatenate((preds, wp_preds))
    wp_ratings_arr = np.concatenate((wp_one_good.rating, wp_one_bad.rating, wp_two_good.rating, wp_two_bad.rating,
                                wp_zer.rating))
    ratings = np.concatenate((ratings, wp_ratings_arr))
    bor_rmses['West Philadelphia'] = root_mean_squared_error(wp_preds, wp_ratings_arr)

    #prediction on ne philly
    ne = rd_test[rd_test.borough == 'Northeast Philadelphia']
    subset_row = cluster_df.iloc[3, -6:]
    clust_feats = cluster_df.loc[3, 'class_features']
    good_feats = cluster_df.loc[3, 'good_model'][1]
    bad_feats = cluster_df.loc[3, 'bad_model'][1]
    ne_one_good, ne_one_bad = prod_clusters(ne[ne.range == '1'], subset_row, clust_feats)
  
    ne_preds = knn_ne_one_g.predict(ne_one_good[good_feats])
    ne_preds = np.concatenate((ne_preds, knn_ne_one_b.predict(ne_one_bad[bad_feats])))

    ne_two = rd_test[(rd_test.borough == 'Northeast Philadelphia') & ((rd_test.range == '2') | (rd_test.range == "3"))]
    ne_zer = rd_test[(rd_test.borough == 'Northeast Philadelphia') & (rd_test.range == '0')]
    two_feats = cluster_df.loc[4, 'good_model'][1]
    zer_feats = cluster_df.loc[5, 'good_model'][1]

    ne_preds = np.concatenate((ne_preds, knn_ne_two.predict(ne_two[two_feats])))
    ne_preds = np.concatenate((ne_preds, knn_ne_zer.predict(ne_zer[zer_feats])))
    preds = np.concatenate((preds, ne_preds))
    ne_ratings_arr = np.concatenate((ne_one_good.rating, ne_one_bad.rating, ne_two.rating, ne_zer.rating))
    ratings = np.concatenate((ratings, ne_ratings_arr))
    bor_rmses['Northeast Philadelphia'] = root_mean_squared_error(ne_preds, ne_ratings_arr)

rmse = root_mean_squared_error(ratings, preds)
print(f"The RMSE for our models' predictions on the test set is {rmse: .3f}.")

    

The RMSE for our models' predictions on the test set is  0.731.


In [30]:
dict(sorted(bor_rmses.items()))

{'Allegheny West': 0.6331403214664716,
 'Bella Vista': 0.3754626775356267,
 'Bensalem': 0.251246890528022,
 'Center City': 0.34715490623872775,
 'Center City East': 0.3347006547676399,
 'Chestnut Hill': 0.33837848631377254,
 'East Germantown': 0.7465029582437479,
 'East Kensington': 0.7039570693980957,
 'East Passyunk Crossing': 0.18850918886280893,
 'Elmwood Park': 0.23213980461973538,
 'Essington': 0.19002923751652234,
 'Fairhill': 0.20891585227869464,
 'Fairmount': 0.42308391602612366,
 'Feltonville': 0.39070449191172596,
 'Fishtown': 0.3901676168402104,
 'Fox Chase': 0.23966643486312403,
 'Frankford': 0.5395235993779273,
 'Germantown': 0.22684429314693677,
 'Harrowgate': 0.26419689627245846,
 'Kingsessing': 0.4496912521077347,
 'Lester': 0.10000000000000053,
 'Lower Moyamensing': 0.22079402165819573,
 'Mount Airy': 0.3320240955111541,
 'Newbold': 0.3560196623783581,
 'Nicetown–Tioga': 0.3577708763999663,
 'North Philadelphia': 0.7022696125389822,
 'North Philadelphia East': 0.43949

## Assessment

It appears that our deconstruction strategy failed spectacularly, at least for West Philly. We briefly examine whether worse results were also returned by the deconstruction on NE Philly.

In [31]:
cluster_df

Unnamed: 0,origin,good_model,bad_model,class_features,0,1,2,3,4,5
0,"(West Philadelphia, 1)","(8-neighbors (gen.), [reviews, longitude, lati...","(Linear Regression, [latitude, longitude, revi...","[latitude, longitude, photos_count, reviews, M...","[[-inf, inf], [-inf, -75.21855545043945], [-in...","[[-inf, 39.95686149597168], [-75.2185554504394...","[[-inf, 39.95686149597168], [-75.2185554504394...","[[-inf, 39.95686149597168], [-75.2185554504394...","[[39.95686149597168, inf], [-75.21855545043945...","[[-inf, inf], [-75.19183731079102, inf], [-inf..."
1,"(West Philadelphia, 2)","(17-neighbors (bor.), [reviews, Foreign Born I...","(Linear Regression, [reviews, latitude, longit...","[latitude, longitude, photos_count, reviews, M...","[[-inf, inf], [-inf, inf], [-inf, inf], [-inf,...","[[-inf, inf], [-inf, -75.25553512573242], [-in...","[[-inf, 39.94808769226074], [-75.2555351257324...","[[39.94808769226074, inf], [-75.25553512573242...",,
2,"(West Philadelphia, 0)","(Random Forest 5 (bor.), [latitude, longitude,...",,,,,,,,
3,"(Northeast Philadelphia, 1)","(6-neighbors (gen.), [latitude, longitude, rev...","(5-neighbors, [Foreign Born Immigrant %, revie...","[latitude, longitude, reviews, photos_count, M...","[[-inf, inf], [-inf, -75.03024673461914], [-in...","[[-inf, inf], [-inf, -75.03024673461914], [238...","[[-inf, inf], [-75.03024673461914, inf], [-inf...","[[-inf, inf], [-inf, inf], [-inf, inf], [-inf,...","[[-inf, inf], [-inf, -75.06171417236328], [-in...","[[40.040489196777344, inf], [-inf, inf], [-inf..."
4,"(Northeast Philadelphia, 2-3)","(14-neighbors (bor.), [latitude, longitude, re...",,,,,,,,
5,"(Northeast Philadelphia, 0)","(5-neighbors (bor.), [latitude, longitude, rev...",,,,,,,,


In [32]:
knn_rmse = root_mean_squared_error(ne_one_bad.rating, knn_ne_one_g.predict(ne_one_bad[good_feats]))

knn_b_rmse = root_mean_squared_error(ne_one_bad.rating, knn_ne_one_b.predict(ne_one_bad[bad_feats]))


knn_rmse, knn_b_rmse



(0.39680436452199425, 0.5880476171195664)

Indeed the deconstruction estimate is worse here too. In hindsight, there may be a rationale for why deconstruction performed so badly on the test set, namely the sway of outliers. When determining the 'large error' set, for lack of immediate insight, we compiled from every fold of the associated cross-validation. But this runs counter to the core philosophy of cross-validation. Additionally, the idea underlying deconstruction may not be robust itself as posed. To simply assume that all sizable errors together point to a divergent, objective statistical trend ignores the effect of outliers. In hindsight, it would have been better if we had incorporated bootstrapping when using the model errors to derive a feature-defined cluster.

For the sake of completeness, we record the test results when deconstruction is not applied.

In [33]:
#fit models to training set
for bor in models.keys():
    
    model = models[bor][0]
    feats = models[bor][1]
    if isinstance(model, K_Bor):
        model.fit(rd_train, bor, feats)
    else:
        model.fit(rd_train[feats], rd_train.rating)


    #training on west philly
    wp = rd_train[rd_train.borough == 'West Philadelphia']
    
    good_feats = cluster_df.loc[0, 'good_model'][1]
    
    wp_one = wp[(wp.range == '1') | (wp.range == '3')]
    
    knn_wp_one = KNeighborsRegressor(8)
 
    knn_wp_one.fit(rd_train[good_feats], rd_train.rating)
    

    good_feats = cluster_df.loc[1, 'good_model'][1]
    
    wp_two = wp[wp.range == '2']
    knn_wp_two = KNeighborsRegressor(17)
    
    knn_wp_two.fit(wp_two[good_feats], wp_two.rating)
    

    wp_zer = wp[wp.range == '0']
    feats = cluster_df.loc[2, 'good_model'][1]
    rf = rf_var(feats)
    rf.fit(wp_zer[feats], wp_zer.rating)

    #training on northeast philly
    ne = rd_train[rd_train.borough == 'Northeast Philadelphia']
    
    good_feats = cluster_df.loc[3, 'good_model'][1]
    
    ne_one = ne[ne.range == '1']
    knn_ne_one_g = KNeighborsRegressor(6)
    
    knn_ne_one_g.fit(rd_train[good_feats], rd_train.rating)
    

    ne_two = rd_train[(rd_train.borough == 'Northeast Philadelphia') & ((rd_train.range == '2') | (rd_train.range == "3"))]
    ne_zer = rd_train[(rd_train.borough == 'Northeast Philadelphia') & (rd_train.range == '0')]
    knn_ne_two = KNeighborsRegressor(14)
    knn_ne_zer = KNeighborsRegressor(5)
    two_feats = cluster_df.loc[4, 'good_model'][1]
    zer_feats = cluster_df.loc[5, 'good_model'][1]
    knn_ne_two.fit(ne_two[two_feats], ne_two.rating)
    knn_ne_zer.fit(ne_zer[zer_feats], ne_zer.rating)




In [34]:
#obtain the predictions on the test set
preds = np.array([])
ratings = np.array([])
bor_rmses = {}
boroughs = rd_test.borough.unique()
for bor in boroughs:
    if bor not in ['Northeast Philadelphia', 'West Philadelphia']: 
        model = models[bor][0]
        feats = models[bor][1]
        bor_df = rd_test[rd_test.borough == bor]
        ratings = np.concatenate((ratings, bor_df.rating))
        if isinstance(model, K_Bor):
            bor_preds = model.predict(rd_test, bor, feats)
            preds = np.concatenate((preds, bor_preds))
            bor_rmses[bor] = root_mean_squared_error(bor_preds, bor_df.rating)
        else:
            bor_preds= model.predict(bor_df[feats])
            preds = np.concatenate((preds, bor_preds))
            bor_rmses[bor] = root_mean_squared_error(bor_preds, bor_df.rating)
    
    #prediction on west philly
    wp = rd_test[rd_test.borough == 'West Philadelphia']
    good_feats = cluster_df.loc[0, 'good_model'][1]
    wp_one = wp[(wp.range == '1') | (wp.range == '3')]
    
   
    wp_preds = knn_wp_one.predict(wp_one[good_feats])
    
    good_feats = cluster_df.loc[1, 'good_model'][1]
    wp_two = wp[wp.range == '2']

    wp_preds = np.concatenate((wp_preds, knn_wp_two.predict(wp_two[good_feats])))
    

    wp_zer = wp[wp.range == '0']
    feats = cluster_df.loc[2, 'good_model'][1]
    wp_preds = np.concatenate((wp_preds, rf.predict(wp_zer[feats])))
    preds = np.concatenate((preds, wp_preds))
    wp_ratings_arr = np.concatenate((wp_one.rating, wp_two.rating,
                                wp_zer.rating))
    ratings = np.concatenate((ratings, wp_ratings_arr))
    bor_rmses['West Philadelphia'] = root_mean_squared_error(wp_preds, wp_ratings_arr)

    #prediction on ne philly
    ne = rd_test[rd_test.borough == 'Northeast Philadelphia']
    good_feats = cluster_df.loc[3, 'good_model'][1]
    ne_one = ne[ne.range == '1']
  
    ne_preds = knn_ne_one_g.predict(ne_one[good_feats])

    ne_two = rd_test[(rd_test.borough == 'Northeast Philadelphia') & ((rd_test.range == '2') | (rd_test.range == "3"))]
    ne_zer = rd_test[(rd_test.borough == 'Northeast Philadelphia') & (rd_test.range == '0')]
    two_feats = cluster_df.loc[4, 'good_model'][1]
    zer_feats = cluster_df.loc[5, 'good_model'][1]

    ne_preds = np.concatenate((ne_preds, knn_ne_two.predict(ne_two[two_feats])))
    ne_preds = np.concatenate((ne_preds, knn_ne_zer.predict(ne_zer[zer_feats])))
    preds = np.concatenate((preds, ne_preds))
    ne_ratings_arr = np.concatenate((ne_one.rating, ne_two.rating, ne_zer.rating))
    ratings = np.concatenate((ratings, ne_ratings_arr))
    bor_rmses['Northeast Philadelphia'] = root_mean_squared_error(ne_preds, ne_ratings_arr)

rmse = root_mean_squared_error(ratings, preds)
print(f"The RMSE for our models' predictions (without deconstruction) on the test set is {rmse: .3f}.")

    

The RMSE for our models' predictions (without deconstruction) on the test set is  0.469.


In [35]:
dict(sorted(bor_rmses.items()))

{'Allegheny West': 0.6331403214664716,
 'Bella Vista': 0.3754626775356267,
 'Bensalem': 0.251246890528022,
 'Center City': 0.34715490623872775,
 'Center City East': 0.3347006547676399,
 'Chestnut Hill': 0.33837848631377254,
 'East Germantown': 0.7465029582437479,
 'East Kensington': 0.7039570693980957,
 'East Passyunk Crossing': 0.18850918886280893,
 'Elmwood Park': 0.23213980461973538,
 'Essington': 0.19002923751652234,
 'Fairhill': 0.20891585227869464,
 'Fairmount': 0.42308391602612366,
 'Feltonville': 0.39070449191172596,
 'Fishtown': 0.3901676168402104,
 'Fox Chase': 0.23966643486312403,
 'Frankford': 0.5395235993779273,
 'Germantown': 0.22684429314693677,
 'Harrowgate': 0.26419689627245846,
 'Kingsessing': 0.4496912521077347,
 'Lester': 0.10000000000000053,
 'Lower Moyamensing': 0.22079402165819573,
 'Mount Airy': 0.3320240955111541,
 'Newbold': 0.3560196623783581,
 'Nicetown–Tioga': 0.3577708763999663,
 'North Philadelphia': 0.7022696125389822,
 'North Philadelphia East': 0.43949