### Importing Needed packages

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import numpy as np
%matplotlib inline

### Loading data, and filtering out data with venues but in wrong neighborhoods

In [3]:
venues_with_details = pd.read_pickle("venues_with_details.pkl")
core_df = venues_with_details[['City','Neighborhood', 'SimpleCategory','TipsCount', 'LikesCount', 'Rating', 'PriceTier']]
nyc_neighs = ['Astoria','Downtown ','East Village','Financial District','Murray Hill','North Side','South Side','Upper West Side','West Village','Yorkville']
tor_neighs = ['Central Bay Street','Church and Wellesley','Commerce Court, Victoria Hotel','First Canadian Place, Underground city','Garden District, Ryerson','Harbourfront East, Union Station, Toronto Islands','Richmond, Adelaide, King','St. James Town','Stn A PO Boxes, 25 The Esplanade, Enclave of M5E','Toronto Dominion Centre, Design Exchange']
core_df = core_df[((core_df['City']=='New York') \
                 & (core_df['Neighborhood'].isin(nyc_neighs))) \
                 | ((core_df['City']=='Toronto') \
                 & (core_df['Neighborhood'].isin(tor_neighs))) ]

### Create filters for PriceTiers

In [11]:
# Filter out data where PriceTier or Rating are na
# use tilde in front of filter to negate this
yes_price_tier_filt = (core_df['Rating'].isnull()==False) & (core_df['PriceTier'].isnull()==False) & (core_df['PriceTier'] > 0)
no_price_tier_filt = (core_df['Rating'].isnull()==False) & ((core_df['PriceTier'].isnull()==True) | (core_df['PriceTier'] == 0))

price_df = core_df[yes_price_tier_filt]
no_price_df = core_df[no_price_tier_filt]

### Do we have any nulls? We know we have them for no_price_df

In [12]:
price_df[price_df.isna().any(axis=1)]

Unnamed: 0,City,Neighborhood,SimpleCategory,TipsCount,LikesCount,Rating,PriceTier


In [13]:
no_price_df[no_price_df.isna().any(axis=1)]
# 88 Prices we want to predict

Unnamed: 0,City,Neighborhood,SimpleCategory,TipsCount,LikesCount,Rating,PriceTier
97,Toronto,St. James Town,FunStore,3,19,8.4,
99,Toronto,"Stn A PO Boxes, 25 The Esplanade, Enclave of M5E",FunStore,3,19,8.4,
104,Toronto,"Toronto Dominion Centre, Design Exchange",Entertainment,44,264,7.5,
105,Toronto,"Stn A PO Boxes, 25 The Esplanade, Enclave of M5E",Entertainment,44,264,7.5,
112,Toronto,"Commerce Court, Victoria Hotel",Recreation,5,7,8.0,
...,...,...,...,...,...,...,...
677,Toronto,"Toronto Dominion Centre, Design Exchange",CheapMeal,15,70,7.6,
678,Toronto,"First Canadian Place, Underground city",CheapMeal,15,70,7.6,
682,Toronto,"Toronto Dominion Centre, Design Exchange",Entertainment,0,5,6.8,
687,Toronto,"Garden District, Ryerson",CoffeeDessert,2,7,6.9,


### Create one-hot encoding for neighborhoods

In [14]:
# Since dummies will prefix the column name, shorten string by changing the name to "Cat"
core_df.rename(columns={'SimpleCategory':'Cat'}, inplace=True)

In [15]:
dummies_df = pd.get_dummies(core_df, columns=['City','Neighborhood','Cat'], drop_first=True)

In [16]:
price_df = dummies_df[yes_price_tier_filt]
no_price_df = dummies_df[no_price_tier_filt]

### Lists of columns for multiple linear regression

In [25]:
# Do not include 'PriceTier'
columns = ['TipsCount','LikesCount','Rating','City_Toronto','Neighborhood_Central Bay Street','Neighborhood_Church and Wellesley','Neighborhood_Commerce Court, Victoria Hotel','Neighborhood_East Village','Neighborhood_Financial District','Neighborhood_First Canadian Place, Underground city','Neighborhood_Garden District, Ryerson','Neighborhood_Harbourfront East, Union Station, Toronto Islands','Neighborhood_Murray Hill','Neighborhood_North Side','Neighborhood_Richmond, Adelaide, King','Neighborhood_South Side','Neighborhood_St. James Town','Neighborhood_Stn A PO Boxes, 25 The Esplanade, Enclave of M5E','Neighborhood_Toronto Dominion Centre, Design Exchange','Neighborhood_Upper West Side','Neighborhood_West Village','Neighborhood_Yorkville','Cat_CheapMeal','Cat_CoffeeDessert','Cat_Entertainment','Cat_Ethnic Food','Cat_Fancy Food','Cat_FunStore','Cat_Recreation']

In [None]:
from sklearn import linear_model

In [26]:
regr = linear_model.LinearRegression()
x = np.asanyarray(price_df[columns])
y = np.asanyarray(price_df[['PriceTier']])
regr.fit (x, y)
# The coefficients
print ('Coefficients: ', regr.coef_)

Coefficients:  [[ 6.75136551e-04  2.41308131e-04  7.75082510e-02  3.00759356e-01
   7.39359050e-02 -1.13685256e-03  1.39490743e-01  8.52406156e-02
   4.27930889e-01  4.01063812e-02 -2.88683758e-01 -2.81365386e-02
   3.34994425e-01 -6.72610759e-02  1.69199005e-01  3.75455627e-02
   1.13179675e-01  3.91052335e-02  4.36995633e-02  2.97631441e-01
   5.58862378e-01  2.70617022e-01 -6.26050705e-01 -9.00491778e-01
   1.05027400e+00 -6.23640029e-02  6.79064749e-01  0.00000000e+00
   0.00000000e+00]]


### Make coefficients easier to read

In [29]:
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})

In [47]:
coef_cols = pd.DataFrame(train_coef, columns=['Coef'])
coef_cols['Source'] = columns
coef_cols.sort_values(by=['Coef'], axis=0, ascending=False, inplace=True)
print(coef_cols)

        Coef                                             Source
24  1.050274                                  Cat_Entertainment
26  0.679065                                     Cat_Fancy Food
20  0.558862                          Neighborhood_West Village
8   0.427931                    Neighborhood_Financial District
12  0.334994                           Neighborhood_Murray Hill
3   0.300759                                       City_Toronto
19  0.297631                       Neighborhood_Upper West Side
21  0.270617                             Neighborhood_Yorkville
14  0.169199              Neighborhood_Richmond, Adelaide, King
6   0.139491        Neighborhood_Commerce Court, Victoria Hotel
16  0.113180                        Neighborhood_St. James Town
7   0.085241                          Neighborhood_East Village
2   0.077508                                             Rating
4   0.073936                    Neighborhood_Central Bay Street
18  0.043700  Neighborhood_Toronto Domin

In [30]:
arr2 = np.sort(regr.coef_, axis = -1)        
print(arr2)


[[-0.900 -0.626 -0.289 -0.067 -0.062 -0.028 -0.001 0.000 0.000 0.000
  0.001 0.038 0.039 0.040 0.044 0.074 0.078 0.085 0.113 0.139 0.169 0.271
  0.298 0.301 0.335 0.428 0.559 0.679 1.050]]


In [49]:
y_hat= regr.predict(no_price_df[columns])


### I want to add the predictions to the no price data

In [50]:
no_price_df['PriceTier'] = y_hat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_price_df['PriceTier'] = y_hat


In [54]:
price_filt = no_price_df['PriceTier'] > 3

no_price_df[price_filt].sort_values(by=['PriceTier'], ascending=False).head(25)

Unnamed: 0,TipsCount,LikesCount,Rating,PriceTier,City_Toronto,Neighborhood_Central Bay Street,Neighborhood_Church and Wellesley,"Neighborhood_Commerce Court, Victoria Hotel",Neighborhood_East Village,Neighborhood_Financial District,...,Neighborhood_Upper West Side,Neighborhood_West Village,Neighborhood_Yorkville,Cat_CheapMeal,Cat_CoffeeDessert,Cat_Entertainment,Cat_Ethnic Food,Cat_Fancy Food,Cat_FunStore,Cat_Recreation
154,273,2191,8.7,3.929128,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
155,273,2191,8.7,3.924534,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1836,300,2217,8.7,3.881795,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
153,273,2191,8.7,3.857292,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1825,275,1439,8.8,3.68493,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1859,177,844,9.2,3.568563,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1398,37,140,8.8,3.497028,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
343,79,407,8.7,3.463449,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
319,42,318,8.7,3.446701,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
331,44,222,8.6,3.417135,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
