### NOTE:  In order to create a single Python file to submit, I needed to concatenate 7 files.  This is why you will see repeated sections and imports.  I also have a number of lines commented out.  This is because in my workflow, I had already updated the data in various ways and then saved it to a pickle file. So, the flow is that each file creates pickle files which are then picked up by the next python file to resume processing.  I left in the comments to show what I needed to do at various points in the process.


### Do the basic imports 

In [3]:
!pip install beautifulsoup4



In [4]:
import pandas as pd
import numpy as np

In [5]:
import requests
from bs4 import BeautifulSoup

### Load the BeautifulSoup object with Wiki data and replace line breaks for easier parsing

In [6]:
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

page = requests.get(wiki_url).text
soup = BeautifulSoup(page, "html.parser")

# REPLACE <BR> WITH PIPE FOR PARSING
delimiter = '|'                           # unambiguous string
for line_break in soup.findAll('br'):       # loop through line break tags
    line_break.replaceWith(delimiter) 

### Create a function to clean the neighborhood data

In [7]:
def special_replace(neigh):
    result = neigh.replace('(','')
    result = result.replace(')','')
    result = result.strip()
    return result

### Create a dataframe to hold data. Parse out the data, omitting cases where no borough is listed. Sort the dataframe by borough and postal code.

In [8]:
data = {"PostalCode":"XXX", "Borough":"XXX", "Neighborhood":"XXX"}
tor_neigh = pd.DataFrame(columns=["PostalCode","Borough","Neighborhood"], index = [0]) 
neigh = ""

tbl = soup.find_all('table')[0]

use_anchor_method = False
row_marker = 0

# Big Parsing Loop - Rows, then Cells
for row in tbl.find_all('tr'):
    column_marker = 0
    columns = row.find_all('td')
    # each td has a p with the postal code, a span which can say Not assigned, or a span with <a> elements,
    # the first is the borough, all the rest of the items after split are the neighborhoods
    for column in columns:
        code = column.find_all('p')[0].get_text()[:3]
        span = column.find_all('span')[0]
        a_count = len(span.find_all('a')) 
        if span.get_text() == "Not assigned":
            continue
        else: 
            special_neighs = span.get_text().replace('/','|').split('|')
            borough = special_neighs[0]
            for n in range(1, len(special_neighs)):
                newneigh = special_neighs[n]
                if neigh == np.nan or neigh == None or neigh == "":
                    neigh = special_replace(newneigh)
                else:
                    neigh += ", " + special_replace(newneigh)
                        
            new_row = {'PostalCode':code,'Borough':borough,'Neighborhood':neigh}
            tor_neigh = tor_neigh.append(new_row, ignore_index=True)
        
        # Clear these values so we can't contaminate the next pass
        borough = ""
        neigh = ""
        newneigh = ""
        column_marker += 1

tor_neigh.drop(labels=0,axis=0,inplace=True)
tor_neigh.sort_values(by=["Borough", "PostalCode"], inplace=True)
tor_neigh = tor_neigh.reset_index(drop=True)
tor_neigh[1:51]

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M4P,Central Toronto,Davisville North
2,M4R,Central Toronto,North Toronto West
3,M4S,Central Toronto,Davisville
4,M4T,Central Toronto,"Moore Park, Summerhill East"
5,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest..."
6,M5N,Central Toronto,Roselawn
7,M5P,Central Toronto,Forest Hill North & West
8,M5R,Central Toronto,"The Annex, North Midtown, Yorkville"
9,M4W,Downtown Toronto,Rosedale
10,M4X,Downtown Toronto,"St. James Town, Cabbagetown"


In [9]:
tor_neigh.shape

(103, 3)

<div>
<b>I DID NOT FIND THIS TO BE TRUE</b><br>
More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, 
you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. 
These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11  
in the above table.
</div>

In [9]:
df_pc_count = tor_neigh.groupby('PostalCode').size().reset_index()
df_pc_count.columns = ["PostalCode","CodeCount"]
df_pc_count[df_pc_count["CodeCount"] > 1]

Unnamed: 0,PostalCode,CodeCount


In [None]:
tor_neigh[tor_neigh["PostalCode"]=="M5A"]

In [1]:
tor_neigh.tail()

NameError: name 'tor_neigh' is not defined

### Save the dataframe for use in another file

In [10]:
tor_neigh.to_pickle('toronto_neighborhoods_step_1_V2.pkl') 

### Import libraries

In [1]:
import pandas as pd
import numpy as np

### Load data from files. We saved the toronto neighborhoods in the first step of this project.


In [2]:
tor_neigh = pd.read_pickle('toronto_neighborhoods_step_1_V2.pkl')
loglatdata = pd.read_csv('Geospatial_Coordinates.csv')

In [3]:
loglatdata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge the data together

In [4]:
tor_longlat = tor_neigh.merge(loglatdata,how="left",left_on=["PostalCode"],right_on=["Postal Code"])

### Drop the redundant column

In [5]:
tor_longlat.drop(columns=['Postal Code'], axis=1)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4N,Central Toronto,Lawrence Park,43.728020,-79.388790
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197
2,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
3,M4S,Central Toronto,Davisville,43.704324,-79.388790
4,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.383160
...,...,...,...,...,...
98,M6C,York,Humewood-Cedarvale,43.693781,-79.428191
99,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512
100,M6M,York,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",43.691116,-79.476013
101,M6N,York,"Runnymede, The Junction North",43.673185,-79.487262


### Do we need to remove any data?

In [6]:
tor_longlat.isnull().values.any()

False

In [7]:
tor_longlat.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M4N,Central Toronto,Lawrence Park,M4N,43.72802,-79.38879
1,M4P,Central Toronto,Davisville North,M4P,43.712751,-79.390197
2,M4R,Central Toronto,North Toronto West,M4R,43.715383,-79.405678
3,M4S,Central Toronto,Davisville,M4S,43.704324,-79.38879
4,M4T,Central Toronto,"Moore Park, Summerhill East",M4T,43.689574,-79.38316


In [34]:
tor_longlat.to_pickle('tor_longlat_data.pkl') 


### Doing all my imports


In [2]:
import pandas as pd
import numpy as np

In [3]:
import requests

In [4]:
from sklearn.cluster import KMeans

In [5]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [6]:
#!conda install -c conda-forge geopy
!pip install geopy



In [7]:
from geopy.geocoders import Nominatim

In [8]:
#!conda install -c conda-forge folium=0.5.0 --yes
!pip install folium
import folium # map rendering library




### Get saved Toronto data


In [9]:
tor_longlat = pd.read_pickle('tor_longlat_data.pkl')

### Create map of Toronto neighborhoods


In [10]:
ave_long = tor_longlat["Longitude"].mean()
ave_lat = tor_longlat["Latitude"].mean()

### Map of neigborhoods


In [11]:
map_toronto = folium.Map(location=[ave_lat, ave_long], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(tor_longlat['Latitude'], tor_longlat['Longitude'], tor_longlat['Borough'], tor_longlat['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Foursquare credentials - MODIFIED - FOR PRIVACY


In [12]:
CLIENT_ID = 'BOIJJKYCZMDIUBCUSMDOXANZCVNFNKWTNS3KJZXEB4PL9999' # your Foursquare ID
CLIENT_SECRET = '5EJRIL4HGEV0XWVIORAKGQV225IKNKGLULRAKZGPVHB49999' # your Foursquare Secret
VERSION = '20210715' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: BOIJJKYCZMDIUBCUSMDOXANZCVNFNKWTNS3KJZXEB4PLHQHI
CLIENT_SECRET:5EJRIL4HGEV0XWVIORAKGQV225IKNKGLULRAKZGPVHB4RXS0


### Method to extract data from Explore endpoint

In [13]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['id'], # GM - I need this Id so I can get details (premium call)
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue_Id',
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [14]:
# This is my dataframe for gathering detailed venue data
venue_details = pd.DataFrame(columns = ["Venue_Id",
                        "Name",
                        "CheckinsCount",
                        "UsersCount",
                        "TipsCount",
                        "VisitsCount",
                        "LikesCount",
                        "Rating",
                        "PriceTier"])

### Method to extract data from Venues Premium Endpoint

In [15]:
def getVenueDetails(venue_id):
   
    url = 'https://api.foursquare.com/v2/venues/{}?&client_id={}&client_secret={}&v={}'.format(
            venue_id,
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION
            )
    
    # make the GET request
    result = requests.get(url).json()["response"]['venue']
    
    stats = result["stats"]
    pricetier = np.nan
    checkinsCount = np.nan
    usersCount = np.nan
    tipCount = np.nan
    visitsCount = np.nan
    
    if "price" in result:
        price = result["price"]
        if "tier" in price:
            pricetier = price["tier"]
    
    if "checkinsCount" in stats:
        checkinsCount = stats["checkinsCount"]
    if "usersCount" in stats:
        usersCount = stats["usersCount"]
    if "tipCount" in stats:
        tipCount = stats["tipCount"]
    else:
        tipCount = 0
    if "visitsCount" in stats:
        visitsCount = stats["visitsCount"]

        
    venue_details.loc[len(venue_details.index)] = [result["id"],
                                                  result["name"],
                                                  checkinsCount,
                                                  usersCount,
                                                  tipCount,
                                                  visitsCount,
                                                  result["likes"]["count"],
                                                  result["rating"],
                                                  pricetier]


In [16]:
getVenueDetails('4ad69511f964a520e40721e3')
venue_details.head(10)

Unnamed: 0,Venue_Id,Name,CheckinsCount,UsersCount,TipsCount,VisitsCount,LikesCount,Rating,PriceTier
0,4ad69511f964a520e40721e3,The Keg Steakhouse + Bar - York Street,,,79,,269,8.7,2


In [17]:
# DO NOT RUN THIS CODE CASUALLY - YOU CAN LOSE DATA!
# venue_details.to_pickle("venue_details.pkl")

In [None]:
getVenueDetails('4ae6ea6ef964a52082a721e3')
venue_details.head()

In [None]:
names = tor_longlat["Neighborhood"]
longitude = tor_longlat["Longitude"]
latitude = tor_longlat["Latitude"]
toronto_venues = getNearbyVenues(names, latitude, longitude)
toronto_venues.head()

<H1>Save Toronto Venues Data</H1>

In [None]:
# DO NOT RUN THIS CODE CASUALLY - YOU CAN LOSE DATA!
# toronto_venues.to_pickle("toronto_venues_data.pkl")

In [None]:
# I want to see some really popular places to test the details results
toronto_popular = toronto_venues[(toronto_venues["Neighborhood"] == "Commerce Court, Victoria Hotel") & (toronto_venues["Venue Category"].str.contains("Restaurant"))]
toronto_popular.head()

### I believe there were <b>way too many venue categories</b> in the FourSquare data, so I manually created a much simplified set of 10 categories which I want to cluster on.


In [None]:
simple_categories = pd.read_csv('FourSquare Fun Categories - V5.csv')
simple_categories.head()

### Method to determine if a venue is Fun 

In [None]:
fun_places = ['Entertainment','Recreation','CheapMeal','CoffeeDessert','Ethnic Food','Fancy Food','Bar','FunStore']

def get_simple_category(venue_type):
    #print(venue_type)
    venue_type = str.strip(venue_type)
    column_name = [col for col in simple_categories.columns if (simple_categories[col] == venue_type).any()]
    if len(column_name) == 0:
        result="NOT FOUND"
        print("NOT FOUND: " + venue_type)
    elif column_name[0] in fun_places:
        result = column_name[0]
    else:
        result="NOT FUN!"
    return result
    

### Assign the simple categories to the main venues dataframe


In [None]:
toronto_venues["SimpleCategory"] = toronto_venues["Venue Category"].apply(get_simple_category)
toronto_venues.head()

In [None]:
toronto_venues.to_pickle("toronto_venues_fun.pkl")

In [None]:
# Remove all the venues that are not fun
toronto_venues_fun = toronto_venues[toronto_venues["SimpleCategory"] != "NOT FUN!"]
toronto_venues_fun.to_pickle("toronto_venues_fun.pkl")

### One hot encoding

In [None]:

# toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot = pd.get_dummies(toronto_venues_fun[['SimpleCategory']], prefix="", prefix_sep="")
# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues_fun['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

In [None]:
#toronto_onehot = toronto_onehot.drop(['NOT FOUND','NOT FUN!'],axis=1)
toronto_onehot = toronto_onehot.drop(['NOT FUN!'],axis=1)

In [None]:
toronto_onehot = toronto_onehot.drop(['NOT FOUND'],axis=1)

### Getting Neighborhoods with the Most Fun Venues

In [None]:
toronto_sum = toronto_onehot.groupby('Neighborhood').sum().reset_index()
#df.sum(axis=1)
toronto_sum["TotalFun"] = toronto_sum.sum(axis=1)
#df.sort_values(by=['c1'])
toronto_sum = toronto_sum.sort_values(by=["TotalFun"],ascending=False).reset_index()
toronto_sum.head(10)

In [None]:
# DO NOT RUN THIS CODE CASUALLY - YOU CAN LOSE DATA!
# toronto_sum.to_pickle("toronto_most_fun.pkl")

<h1>----------End of Initial Toronto Fun Data----------</h1>

<h1>--------Getting New York Data--------</h1>

In [None]:
ny_long_lat = pd.read_pickle("nyc_long_lat.pkl")

In [None]:
ny_long_lat.head()

In [None]:
names = ny_long_lat["Neighborhood"]
longitude = ny_long_lat["Longitude"]
latitude = ny_long_lat["Latitude"]
nyc_venues = getNearbyVenues(names, latitude, longitude)
nyc_venues.head()

In [None]:
nyc_venues["SimpleCategory"] = nyc_venues["Venue Category"].apply(get_simple_category)
nyc_venues.head()

In [None]:
nyc_venues[nyc_venues["Venue Category"].str.contains("Hotel")][0:50]

In [None]:
nyc_venues.head()

In [None]:
# Remove all not found and not fun - not found categories are all not fun - I checked
nyc_venues = nyc_venues[ (nyc_venues['SimpleCategory']!='NOT FOUND') | (nyc_venues['SimpleCategory']!='NOT FUN!') ]

### TO PREVENT ERRORS, I OFTEN NEED TO COMMENT OUT THIS CODE

In [None]:
# Save all the nyc venues data which is fun

# DO NOT RUN THIS CODE CASUALLY - YOU CAN LOSE DATA!
# nyc_venues.to_pickle("nyc venues only fun.pkl")

In [None]:
# get names of indexes for which
# column Age has value 21
#index_names = nyc_venues[ nyc_venues['SimpleCategory'] == 'NOT FOUND' ].index
  
# drop these row indexes
# from dataFrame
#nyc_venues.drop(index_names, inplace = True)

In [None]:
# one hot encoding
nyc_onehot = pd.get_dummies(nyc_venues[['SimpleCategory']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
nyc_onehot['Neighborhood'] = nyc_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [nyc_onehot.columns[-1]] + list(nyc_onehot.columns[:-1])
nyc_onehot = nyc_onehot[fixed_columns]

nyc_onehot.head()
#nyc_onehot = nyc_onehot.drop(['NOT FUN!'],axis=1)
nyc_sum = nyc_onehot.groupby('Neighborhood').sum().reset_index()
nyc_sum["TotalFun"] = nyc_sum.sum(axis=1)
nyc_sum = nyc_sum.sort_values(by=["TotalFun"],ascending=False).reset_index()

# Save all the fun data for nyc

# DO NOT RUN THIS CODE CASUALLY - YOU CAN LOSE DATA!
# nyc_sum.to_pickle("nyc_most_fun.pkl")

nyc_sum.head(30)

<h1>Fun Popular Focus Idea</h1>
<p>
I'm thinking I will take the top 20 neighborhoods based on the Fun Total for both Toronto and NYC
Then I will take all the fun places in those neighborhoods and get data for number of reviews and total score.
Then I will come up with a formula which assigns a fun weighting to each simple category and then does a calculation involving the number of reviews and the total score - and then I will aggregate these back together and get a total fun result.
</p>

In [None]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head(30)

In [None]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    # T - Transpose index and columns. Reflect the DataFrame over its main diagonal by writing rows as columns and vice-versa. The property T is an accessor to the method
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

<div>
<b>NOTE TO PEER REVIEWER: THIS WILL LOOK MUCH DIFFERENT BECAUSE I AM USING MY OWN LIST OF SIMPLIFIED VENUE CATEGORIES. I BELIEVE THIS IS A SUBSTANTIAL IMPROVEMENT FOR CLUSTERING.</b>
</div>

In [196]:
# create map
map_clusters = folium.Map(location=[ave_lat, ave_long], zoom_start=11)

# Cluster Names
cluster_types = ['Stores, Recreation, Food','Cheap food, ethnic restaurants','Recreation, Travel, Stores','Ethnic Food, Coffee, Dessert Places', 'Recreation, Travel, Miscellaneous']

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + cluster_types[cluster], parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

<h1>This is really about getting the data together, step 2</h1>

In [4]:
import requests
import pandas as pd
import numpy as np


In [40]:
CLIENT_ID = 'BOIJJKYCZMDIUBCUSMDOXANZCVNFNKWTNS3KJZXEB4PLHQHI' # your Foursquare ID
CLIENT_SECRET = '5EJRIL4HGEV0XWVIORAKGQV225IKNKGLULRAKZGPVHB4RXS0' # your Foursquare Secret
VERSION = '20210715' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: BOIJJKYCZMDIUBCUSMDOXANZCVNFNKWTNS3KJZXEB4PLHQHI
CLIENT_SECRET:5EJRIL4HGEV0XWVIORAKGQV225IKNKGLULRAKZGPVHB4RXS0


In [6]:
# Get rid of checkins users and vists
# toronto_onehot = toronto_onehot.drop(['NOT FOUND'],axis=1)
#venue_details = venue_details.drop(['CheckinsCount', 'UsersCount', 'VisitsCount'], axis=1)
#venue_details.to_pickle("venue_details.pkl")

In [161]:
# I need to keep adding details to my pickle file and then remember to save again when I'm done
# BE CAREFUL!! tHIS IS READING NOT WRITING!!!!
# venue_details = pd.read_pickle("venue_details.pkl")

In [175]:
venue_details.to_csv("venue_details.csv")

In [22]:
# This is my dataframe for gathering detailed venue data
# venue_details = pd.DataFrame(columns = ["Venue_Id",
#                        "Name",
#                        "TipsCount",
#                        "LikesCount",
#                        "Rating",
#                        "RatingSignals",
#                        "PriceTier"])

In [176]:
venue_details.to_pickle("venue_details.pkl")

In [177]:
def getVenueDetails(venue_id):
    # Don't run this again if we already have the data, since these are premium calls
    
    if(venue_details['Venue_Id'] == venue_id).any():
        return "WASTE" 
    
    url = 'https://api.foursquare.com/v2/venues/{}?&client_id={}&client_secret={}&v={}'.format(
            venue_id,
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION
            )
    
    # make the GET request
    result = requests.get(url).json()
    
    if "meta" in result:
        if "errorType" in result["meta"]:
            errorMessage = result["meta"]["errorType"]
            # print("VENUE_ID: " + venue_id + " - WE GOT THIS ERROR MESSAGE: " + errorMessage)
            return "QUOTA"
    else:
        print("meta not in result")
        
    try:
        result = result["response"]['venue']
    except Exception:
        print("Non-Excessive Requests Exception with venue_id: " + venue_id + " - REMOVE VENUE")
        return "ERROR"
    
    stats = result["stats"]
    pricetier = 0
    rating = 0
    ratingSignals = 0
    tipCount = 0
    
    # add ratingSignals
    if "price" in result:
        price = result["price"]
        if "tier" in price:
            pricetier = price["tier"]
    if "tipCount" in stats:
        tipCount = stats["tipCount"]
    if "rating" in result:
        rating = result["rating"]
    if "ratingSignals" in result:
        ratingSignals = result["ratingSignals"]

        
    venue_details.loc[len(venue_details.index)] = [result["id"],
                                                  result["name"],
                                                  tipCount,
                                                  result["likes"]["count"],
                                                  rating,
                                                  ratingSignals,
                                                  pricetier,
                                                  tipCount + (result["likes"]["count"] * 2) + ratingSignals]
    return "SUCCESS"

In [12]:
# I want a number which helps show how popular and fun a place is - giving x2 for LIKES
# df['DataFrame Column'] = df['DataFrame Column'].fillna(0)
#venue_details["TipsCount"] = venue_details["TipsCount"].fillna(0);
#venue_details["LikesCount"] = venue_details["LikesCount"].fillna(0);
#venue_details["RatingSignals"] = venue_details["RatingSignals"].fillna(0);


In [13]:
#venue_details["VoteSum"] = (venue_details["TipsCount"]) + (venue_details["LikesCount"] * 2) + (venue_details["RatingSignals"])

In [14]:
#venue_details.sort_values(by="VoteSum", ascending=False)[0:60]

In [15]:
# Restore a bunch of dataframes

# All the toronto venues
toronto_venues = pd.read_pickle("toronto_venues_data.pkl")
# Only the fun toronto venues
toronto_venues_fun = pd.read_pickle("toronto_venues_fun.pkl")
# Shows the total fun score across neighborhoods
toronto_sum = pd.read_pickle("toronto_most_fun.pkl")
# For NYC, I only have the fun for now
nyc_venues_fun = pd.read_pickle("nyc venues only fun.pkl")
# For NYC, total fun scores across neighborhoods
nyc_sum= pd.read_pickle("nyc_most_fun.pkl")

In [173]:
# Populate venue data by looping through most popular neighborhoods in order of popularity
def loadVenueData(aggdataframe, venuedataframe):
    detailcounter = 0

    for index, row in aggdataframe[0:10].iterrows():
        print("Index: " + str(index))
        neighborhood = aggdataframe.iloc[index]["Neighborhood"]
        print(neighborhood)
        print('------------------------------------------')
        
        for index, row in venuedataframe[venuedataframe["Neighborhood"] == neighborhood].iterrows():
            if detailcounter > 500:
                print("WE HAVE REACHED OUR GOAL")
                return

            # Maybe this iloc approach was wrong!!!
            #venue_id = venuedataframe.iloc[index]["Venue_Id"]
            #neighborhood = venuedataframe.iloc[index]["Neighborhood"]
            #print(venue_id + ", " + neighborhood)
            venue_id = row["Venue_Id"]
            neighborhood = row["Neighborhood"]
            #print(venue_id + ", " + neighborhood)
            
            detailSuccess = getVenueDetails(venue_id)
            # WASTE, QUOTA, ERROR, SUCCESS
            
            if(detailSuccess == "SUCCESS"):
                print(str(detailcounter) + ': ' + venue_id)
                # For now, I am limiting tries even if nothing works - PUT THIS BACK UNDER SUCCESS
                detailcounter +=1
                
            elif detailSuccess == "QUOTA":
                print("WE HAVE REACHED OUR QUOTA")
                return

In [174]:
loadVenueData(toronto_sum, toronto_venues_fun)
#loadVenueData(nyc_sum, nyc_venues_fun)

Index: 0
First Canadian Place, Underground city
------------------------------------------
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
0: 50468014e4b01c18bb731df8
Would be a wasted API call
Would be a wasted

Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
29: 4bacf405f964a520881d3be3

109: 4b2d27b4f964a520a6cf24e3
110: 594ff53c2be42528bcc1bdb7
111: 55457219498ea16d16550d3d
112: 4d9b85a6913a236a733fa108
113: 4b919f56f964a520a4ca33e3
114: 53f5e2fd498ee389ab52b446
115: 4afca145f964a5208e2422e3
116: 4b746c9df964a520c1db2de3
117: 4ad88725f964a5200d1221e3
118: 4ad4c061f964a520abf720e3
119: 4ba6b463f964a520416a39e3
120: 54121f6a498e0aefc0da0ac9
121: 4bedf8b5e24d20a17b567214
122: 4bb942a7cf2fc9b6596ca002
123: 4af36863f964a52053ed21e3
124: 4ab17387f964a520866920e3
125: 4b654efaf964a520aeec2ae3
126: 58a0962975e13747e99b1fa9
127: 52423fb211d22dbd0eea431b
128: 52d4578e498e5273f7099477
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
Would be a wasted API call
129: 5acd155ea879216dfece538d
Would be a wasted API call
Would be a wasted API call
Would be a wasted A

In [150]:
venue_details.shape

(1481, 8)

In [127]:
# For now, try always updating pkl file
venue_details.to_pickle("venue_details.pkl")

In [137]:
# I want to filter down all my data now to the top 10 neighborhoods
top_tor_neigh = toronto_sum["Neighborhood"][0:10]
top_nyc_neigh = nyc_sum["Neighborhood"][0:10]
print(top_tor_neigh)
print(top_nyc_neigh)

0               First Canadian Place, Underground city
1                       Commerce Court, Victoria Hotel
2    Harbourfront East, Union Station, Toronto Islands
3             Toronto Dominion Centre, Design Exchange
4     Stn A PO Boxes, 25 The Esplanade, Enclave of M5E
5                             Richmond, Adelaide, King
6                             Garden District, Ryerson
7                                 Church and Wellesley
8                                       St. James Town
9                                   Central Bay Street
Name: Neighborhood, dtype: object
0           Murray Hill
1            South Side
2               Astoria
3          East Village
4            North Side
5             Yorkville
6          West Village
7       Upper West Side
8    Financial District
9              Downtown
Name: Neighborhood, dtype: object


In [152]:
all_top_venues["Neighborhood"].unique()

array(['Church and Wellesley', 'Garden District, Ryerson',
       'St. James Town', 'Central Bay Street', 'Richmond, Adelaide, King',
       'Harbourfront East, Union Station, Toronto Islands',
       'Toronto Dominion Centre, Design Exchange',
       'Commerce Court, Victoria Hotel',
       'Stn A PO Boxes, 25 The Esplanade, Enclave of M5E',
       'First Canadian Place, Underground city', 'Downtown', 'North Side',
       'South Side', 'Yorkville', 'Upper West Side', 'Murray Hill',
       'East Village', 'West Village', 'Financial District', 'Astoria'],
      dtype=object)

<h1>This is where I limit all the fun venues data to the Top 10 Neighborhoods</h1>

In [140]:
# df[df['col1'].isin(['a', 'c', 'h'])]
all_top_venues = all_venues_fun[all_venues_fun["Neighborhood"].isin(top_tor_neigh) | all_venues_fun["Neighborhood"].isin(top_nyc_neigh)]


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue_Id,Venue,Venue Latitude,Venue Longitude,Venue Category,SimpleCategory,City
123,Church and Wellesley,43.66586,-79.38316,5bd2379cbcbf7a0039a2d7b9,Storm Crow Manor,43.66684,-79.381593,Theme Restaurant,Ethnic Food,Toronto
124,Church and Wellesley,43.66586,-79.38316,51a11515498ef97667980b8c,DanceLifeX Centre,43.666956,-79.385297,Dance Studio,Recreation,Toronto
125,Church and Wellesley,43.66586,-79.38316,59ecf741da2e006b9e11a3d3,The Alley,43.665922,-79.385567,Bubble Tea Shop,CoffeeDessert,Toronto
126,Church and Wellesley,43.66586,-79.38316,5d9399e49b61d90008bac7b0,Bar Volo,43.665462,-79.385692,Beer Bar,Bar,Toronto
127,Church and Wellesley,43.66586,-79.38316,4df4456052b100c2d7fdca23,Smith,43.666927,-79.381421,Breakfast Spot,CheapMeal,Toronto
128,Church and Wellesley,43.66586,-79.38316,4fb593c0e4b05a76e2ab2951,Como En Casa,43.66516,-79.384796,Mexican Restaurant,Ethnic Food,Toronto
129,Church and Wellesley,43.66586,-79.38316,4b7af787f964a520f9482fe3,Glad Day Bookshop,43.665271,-79.380785,Bookstore,FunStore,Toronto
130,Church and Wellesley,43.66586,-79.38316,5966b520fc9e94307406dafe,Si Lom,43.66501,-79.380683,Thai Restaurant,Ethnic Food,Toronto
131,Church and Wellesley,43.66586,-79.38316,4caf69ae9b34199c9a2ab763,Fabarnak,43.666377,-79.380964,Restaurant,CheapMeal,Toronto
132,Church and Wellesley,43.66586,-79.38316,534c20fa498ee7bc010834ce,Sansotei Ramen 三草亭,43.666735,-79.385353,Ramen Restaurant,Ethnic Food,Toronto


In [151]:
all_top_venues[0:50]

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue_Id,Venue,Venue Latitude,Venue Longitude,Venue Category,SimpleCategory,City
123,Church and Wellesley,43.66586,-79.38316,5bd2379cbcbf7a0039a2d7b9,Storm Crow Manor,43.66684,-79.381593,Theme Restaurant,Ethnic Food,Toronto
124,Church and Wellesley,43.66586,-79.38316,51a11515498ef97667980b8c,DanceLifeX Centre,43.666956,-79.385297,Dance Studio,Recreation,Toronto
125,Church and Wellesley,43.66586,-79.38316,59ecf741da2e006b9e11a3d3,The Alley,43.665922,-79.385567,Bubble Tea Shop,CoffeeDessert,Toronto
126,Church and Wellesley,43.66586,-79.38316,5d9399e49b61d90008bac7b0,Bar Volo,43.665462,-79.385692,Beer Bar,Bar,Toronto
127,Church and Wellesley,43.66586,-79.38316,4df4456052b100c2d7fdca23,Smith,43.666927,-79.381421,Breakfast Spot,CheapMeal,Toronto
128,Church and Wellesley,43.66586,-79.38316,4fb593c0e4b05a76e2ab2951,Como En Casa,43.66516,-79.384796,Mexican Restaurant,Ethnic Food,Toronto
129,Church and Wellesley,43.66586,-79.38316,4b7af787f964a520f9482fe3,Glad Day Bookshop,43.665271,-79.380785,Bookstore,FunStore,Toronto
130,Church and Wellesley,43.66586,-79.38316,5966b520fc9e94307406dafe,Si Lom,43.66501,-79.380683,Thai Restaurant,Ethnic Food,Toronto
131,Church and Wellesley,43.66586,-79.38316,4caf69ae9b34199c9a2ab763,Fabarnak,43.666377,-79.380964,Restaurant,CheapMeal,Toronto
132,Church and Wellesley,43.66586,-79.38316,534c20fa498ee7bc010834ce,Sansotei Ramen 三草亭,43.666735,-79.385353,Ramen Restaurant,Ethnic Food,Toronto


In [141]:
# 8-16-2021 - all_top_venues is all the nyc and tor venue data filtered to the top 10 neighs in each city
all_top_venues.to_pickle("all_top_venues.pkl")

In [65]:
# ADD CITY COLUMNS TO EACH OF THE CITY DATAFRAMES
toronto_venues_fun['City'] = 'Toronto'
nyc_venues_fun['City'] = 'New York'

In [108]:
#Combine the venues data from toronto and nyc so I can merge the venue_details
all_venues_fun = toronto_venues_fun.append(nyc_venues_fun, ignore_index=True)
all_venues_fun.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue_Id,Venue,Venue Latitude,Venue Longitude,Venue Category,SimpleCategory,City
0,Lawrence Park,43.72802,-79.38879,50e6da19e4b0d8a78a0e9794,Lawrence Park Ravine,43.726963,-79.394382,Park,Recreation,Toronto
1,Lawrence Park,43.72802,-79.38879,5082ef77e4b0a7491cf7b022,Zodiac Swim School,43.728532,-79.38286,Swim School,Recreation,Toronto
2,Davisville North,43.712751,-79.390197,4ba011c2f964a5204a5737e3,Sherwood Park,43.716551,-79.387776,Park,Recreation,Toronto
3,Davisville North,43.712751,-79.390197,4adb2fd3f964a520c42421e3,Homeway Restaurant & Brunch,43.712641,-79.391557,Breakfast Spot,CheapMeal,Toronto
4,Davisville North,43.712751,-79.390197,4b0b3691f964a520c62e23e3,Subway,43.708474,-79.390674,Sandwich Place,CheapMeal,Toronto


In [109]:
all_venues_fun.to_pickle("all_venues_fun.pkl")

<h1>Merge all venues with all venue details now
BUT - I think this should be focused on the top 10 neighborhoods for Tor and Nyc!</h1>

In [130]:

all_venues_and_details = all_venues_fun.merge(venue_details,how="left", on="Venue_Id")

In [135]:
all_venues_and_details[1000:1040] #[1370:1415]

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue_Id,Venue,Venue Latitude,Venue Longitude,Venue Category,SimpleCategory,City,Name,TipsCount,LikesCount,Rating,RatingSignals,PriceTier,VoteSum
1000,"First Canadian Place, Underground city",43.648429,-79.38228,51ae5eaa498e5c629f21f431,Olly Fresco's,43.646912,-79.379597,Deli / Bodega,CheapMeal,Toronto,Olly Fresco's,4.0,18.0,7.8,23.0,1.0,63.0
1001,"First Canadian Place, Underground city",43.648429,-79.38228,5a4fdf56772fbc5e9fa73c7f,Chotto Matte,43.646473,-79.378782,Japanese Restaurant,Ethnic Food,Toronto,Chotto Matte,2.0,13.0,8.2,16.0,2.0,44.0
1002,"First Canadian Place, Underground city",43.648429,-79.38228,5214e7c111d2a83379eae21f,The Chase,43.650952,-79.379422,New American Restaurant,Fancy Food,Toronto,The Chase,59.0,145.0,8.2,212.0,3.0,561.0
1003,"First Canadian Place, Underground city",43.648429,-79.38228,4ad4c05ef964a520d8f620e3,Hockey Hall Of Fame (Hockey Hall of Fame),43.646974,-79.377323,Museum,Entertainment,Toronto,Hockey Hall Of Fame (Hockey Hall of Fame),79.0,407.0,8.7,501.0,,1394.0
1004,"First Canadian Place, Underground city",43.648429,-79.38228,4e048b98315168be7fd7ee4d,Estiatorio Volos,43.650329,-79.384533,Greek Restaurant,Ethnic Food,Toronto,,,,,,,
1005,"First Canadian Place, Underground city",43.648429,-79.38228,56df0683498e7d0423e91b5b,iQ Food Co. (First Canadian Place),43.648357,-79.382192,Salad Place,Ethnic Food,Toronto,iQ Food Co. (First Canadian Place),1.0,7.0,7.4,9.0,1.0,24.0
1006,"First Canadian Place, Underground city",43.648429,-79.38228,4b5ca7d8f964a5207c3c29e3,Beerbistro,43.649419,-79.377237,Gastropub,Bar,Toronto,Beerbistro,128.0,323.0,8.6,478.0,2.0,1252.0
1007,"First Canadian Place, Underground city",43.648429,-79.38228,4ba90db8f964a5203f073ae3,Sweet Lulu,43.650557,-79.381175,Asian Restaurant,Ethnic Food,Toronto,Sweet Lulu,11.0,17.0,7.6,28.0,2.0,73.0
1008,"First Canadian Place, Underground city",43.648429,-79.38228,563d2f2dcd10bcf27ae37c3b,Pilot Coffee Roasters,43.645018,-79.380415,Coffee Shop,CoffeeDessert,Toronto,Pilot Coffee Roasters,12.0,50.0,8.4,69.0,1.0,181.0
1009,"First Canadian Place, Underground city",43.648429,-79.38228,5a5a59c5a423620ec1dafd41,Pi Co.,43.648651,-79.385874,Pizza Place,Ethnic Food,Toronto,,,,,,,


<H1>This is where I create the concept of a general Score for each venue</H1>

In [118]:
# all_venues_and_details.sort_values(by="VoteSum", ascending=False)[0:50]
# all_venues_and_details["Score"] = all_venues_and_details["VoteSum"] * all_venues_and_details["Rating"]
all_venues_and_details.sort_values(by="Score", ascending=False)[50:100]

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue_Id,Venue,Venue Latitude,Venue Longitude,Venue Category,SimpleCategory,City,Name,TipsCount,LikesCount,Rating,RatingSignals,PriceTier,VoteSum,Score
5606,West Village,40.734434,-74.00618,543c7c63498e0a3393da47a2,Via Carota,40.733052,-74.003573,Italian Restaurant,Ethnic Food,New York,Via Carota,161.0,636.0,9.3,812.0,3.0,2245.0,20878.5
4765,Upper West Side,40.787658,-73.977059,3fd66200f964a5203fea1ee3,Good Enough to Eat,40.785979,-73.972764,Breakfast Spot,CheapMeal,New York,Good Enough to Eat,314.0,645.0,7.8,1058.0,2.0,2662.0,20763.6
5603,West Village,40.734434,-74.00618,45379ccef964a520c13b1fe3,The Little Owl,40.732441,-74.005424,American Restaurant,Ethnic Food,New York,The Little Owl,248.0,611.0,8.8,888.0,3.0,2358.0,20750.4
5573,Soho,40.722184,-74.000657,49bd4e25f964a52067541fe3,Equinox SoHo,40.723973,-73.997042,Gym,Recreation,New York,Equinox SoHo,129.0,636.0,8.9,799.0,0.0,2200.0,19580.0
6939,Steinway,40.775923,-73.90229,49c68da1f964a5205b571fe3,Martha's Country Bakery,40.773924,-73.907555,Bakery,CoffeeDessert,New York,Martha's Country Bakery,160.0,477.0,9.0,646.0,2.0,1760.0,15840.0
5580,Soho,40.722184,-74.000657,49cebdeaf964a520815a1fe3,Macao Trading Co.,40.719759,-74.003981,Asian Restaurant,Ethnic Food,New York,Macao Trading Co.,227.0,258.0,9.0,1002.0,3.0,1745.0,15705.0
6935,Steinway,40.775923,-73.90229,4f68de6bd5fbee32e5f4f3a5,SingleCut Beersmiths,40.778387,-73.901902,Brewery,Bar,New York,SingleCut Beersmiths,95.0,484.0,9.3,593.0,1.0,1656.0,15400.8
5612,West Village,40.734434,-74.00618,4a552b26f964a520c4b31fe3,Malatesta Trattoria,40.732808,-74.008522,Italian Restaurant,Ethnic Food,New York,Malatesta Trattoria,200.0,445.0,8.7,666.0,3.0,1756.0,15277.2
4742,Upper West Side,40.787658,-73.977059,4d68839467a8f04df7ee01be,Osteria Cotta,40.785578,-73.972836,Italian Restaurant,Ethnic Food,New York,Osteria Cotta,172.0,442.0,8.5,671.0,2.0,1727.0,14679.5
206,"Regent Park, Harbourfront",43.65426,-79.360636,51ddecee498e1ffd34185d2f,El Catrin,43.650601,-79.35892,Mexican Restaurant,Ethnic Food,Toronto,El Catrin,162.0,470.0,8.1,707.0,3.0,1809.0,14652.9


In [117]:
all_venues_and_details.to_pickle("all_venues_and_details.pkl")

In [70]:
toronto_venues_fun.shape #1622
#nyc_venues_fun.shape # 7495

(1622, 10)

<H1>This is where I added the City column into the venue details</H1>

In [53]:
# Combine whatever data we have in the venue details with what we have in toronto venues
# MERGE IN TORONTO
#city_fun = venue_details.merge(toronto_venues_fun, how='left', on='Venue_Id',left_index=True)

# MERGE IN NYC
city_fun = venue_details.merge(nyc_venues_fun, how='left', on='Venue_Id',left_index=True)

In [64]:
# BE CAREFUL WITH RUNNING THIS
# city_fun['City'] = 'Toronto'
#city_fun['City'] = 'Toronto'
city_fun[city_fun['Neighborhood'].isnull()==False]
#city_fun.head(30)

Unnamed: 0,Venue_Id,Name,TipsCount,LikesCount,Rating,RatingSignals,PriceTier,VoteSum,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,SimpleCategory
6007.0,4d3e1be3ae942d43e09e9f9a,El Pequeno Coffee Shop,14,9,6.1,23,2,55,Jackson Heights,40.751981,-73.882821,El Pequeno Coffee Shop,40.748083,-73.880205,Latin American Restaurant,Ethnic Food
6009.0,5e44505ffdc6610008c7d741,Thaan,1,1,0.0,0,2,3,Jackson Heights,40.751981,-73.882821,Thaan,40.749849,-73.881865,Thai Restaurant,Ethnic Food
6010.0,4c671600aebea593c63075d0,Casa Colombia,3,4,5.8,9,2,20,Jackson Heights,40.751981,-73.882821,Casa Colombia,40.748263,-73.879485,South American Restaurant,Ethnic Food
6012.0,4b9a8df9f964a520acc035e3,La Gata Golosa Bakery,7,8,5.6,17,1,40,Jackson Heights,40.751981,-73.882821,La Gata Golosa Bakery,40.750458,-73.877701,South American Restaurant,Ethnic Food
6013.0,52f693e6498ed9d2a3fbad75,Eim Khao Mun Kai Elmhurst อิ่ม ข้าวมันไก่เอ็มเ...,26,99,8.7,128,1,352,Elmhurst,40.744049,-73.881656,Eim Khao Mun Kai Elmhurst อิ่ม ข้าวมันไก่เอ็มเ...,40.743014,-73.883057,Thai Restaurant,Ethnic Food
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9907.0,4bbe631782a2ef3b764b2bd2,Ralph's Famous Italian Ices,10,13,8.2,22,1,58,Middle Village,40.716415,-73.881143,Ralph's Famous Italian Ices,40.712856,-73.879132,Dessert Shop,CoffeeDessert
9909.0,4bd60bf4cfa7b713faec26da,Toyo,11,17,7.7,28,2,73,Middle Village,40.716415,-73.881143,Toyo,40.712611,-73.878948,Sushi Restaurant,Ethnic Food
9911.0,53372ea7498ed5a1955d86ef,Rico's Chicken,2,11,7.3,13,2,37,Middle Village,40.716415,-73.881143,Rico's Chicken,40.713009,-73.877336,South American Restaurant,Ethnic Food
9914.0,599b1f75ca18ea189fe0dfff,The Salvation Army Family Store & Donation Center,0,0,6.5,0,0,0,Middle Village,40.716415,-73.881143,The Salvation Army Family Store & Donation Center,40.712672,-73.878501,Thrift / Vintage Store,FunStore


In [52]:
city_fun.tail()

Unnamed: 0,Venue_Id,Name,TipsCount,LikesCount,Rating,RatingSignals,PriceTier,VoteSum,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,SimpleCategory,City
,4bbe631782a2ef3b764b2bd2,Ralph's Famous Italian Ices,10,13,8.2,22,1,58,,,,,,,,,Toronto
,4bd60bf4cfa7b713faec26da,Toyo,11,17,7.7,28,2,73,,,,,,,,,Toronto
,53372ea7498ed5a1955d86ef,Rico's Chicken,2,11,7.3,13,2,37,,,,,,,,,Toronto
,599b1f75ca18ea189fe0dfff,The Salvation Army Family Store & Donation Center,0,0,6.5,0,0,0,,,,,,,,,Toronto
,4bc63c3651b376b095391b6f,SUBWAY,7,1,6.3,6,1,15,,,,,,,,,Toronto


In [45]:
# To remove rows by index
# venue_details = venue_details.drop(index=[12,13])
venue_details.tail(40)

Unnamed: 0,Venue_Id,Name,TipsCount,LikesCount,Rating,RatingSignals,PriceTier,VoteSum
559,4b7ff72ff964a520684730e3,Forest Hills Bagels,24,86,9.0,113,1,309
560,4c377a13ae2da59377f4fec5,Yellowstone Park,4,21,7.9,26,0,72
561,51c3a999498e5db561b5346a,iLoveKickboxing,21,6,8.9,27,0,60
562,4b97f9ebf964a520822335e3,MacDonald Park,8,31,8.1,41,0,111
563,46b1ef0df964a520a0491fe3,Bangkok Cuisine,34,62,8.5,93,1,251
564,4b7d8e1af964a5205dc62fe3,bambooYOGA,11,13,8.1,18,0,55
565,4bb6860ef562ef3b4c513097,Empire Wines & Liquors,6,25,8.0,35,0,91
566,54568cc5498ed42af61bc55d,Tu Casa Restaurant,4,10,7.9,15,2,39
567,4c64b110ee03ef3b735dbaac,Zen and Yoga,2,14,8.0,17,0,47
568,4f6cb7a0e4b0a6b770c9b395,Gloria Pizza,20,39,7.9,65,1,163


In [34]:
nyc_sum.head()
# Murray Hill is #1

Unnamed: 0,index,Neighborhood,Bar,CheapMeal,CoffeeDessert,Entertainment,Ethnic Food,Fancy Food,FunStore,Recreation,TotalFun
0,184,Murray Hill,11,19,14,2,59,5,0,13,123
1,252,South Side,22,10,9,2,35,3,1,8,90
2,6,Astoria,14,13,13,0,38,4,0,7,89
3,81,East Village,20,7,11,2,40,1,5,3,89
4,194,North Side,17,7,21,2,29,2,2,6,86


In [46]:
nyc_venues_fun[(nyc_venues_fun["Neighborhood"] == "Clinton") & (nyc_venues_fun["SimpleCategory"] == "Entertainment")]

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue_Id,Venue,Venue Latitude,Venue Longitude,Venue Category,SimpleCategory
4342,Clinton,40.759101,-73.996119,4b6a09cff964a5206ac32be3,Pershing Square Signature Theater,40.759228,-73.995232,Theater,Entertainment
4344,Clinton,40.759101,-73.996119,4c379ecf1e06d13a4e49763e,Playwrights Horizons,40.758818,-73.993574,Theater,Entertainment
4346,Clinton,40.759101,-73.996119,5a235575e179100e0425abc7,UCB Theatre Hell’s Kitchen,40.7605,-73.997863,Comedy Club,Entertainment
4358,Clinton,40.759101,-73.996119,5711b92e498e677b825cc78d,Yotel - Rooftop Cinema Club,40.759429,-73.995047,Movie Theater,Entertainment
4359,Clinton,40.759101,-73.996119,4b6cdf69f964a520af5a2ce3,The Actors Studio,40.760339,-73.99328,Theater,Entertainment
4367,Clinton,40.759101,-73.996119,4a0b4df9f964a520e8741fe3,Baryshnikov Arts Center,40.756283,-73.997181,Theater,Entertainment
4373,Clinton,40.759101,-73.996119,4ab424b4f964a520117020e3,The Westside Theater,40.75935,-73.992765,Indie Theater,Entertainment
4388,Clinton,40.759101,-73.996119,4fa709c3e4b047f0ed7001d7,The Alice Griffin Jewel Box Theatre,40.759463,-73.995185,Theater,Entertainment
4392,Clinton,40.759101,-73.996119,58a3c4cce9dad156c1d43d93,Green Room 42,40.759448,-73.995083,Performing Arts Venue,Entertainment


In [1]:
import numpy as np
import pandas as pd
import folium # map rendering library

In [2]:
all_venues_fun = pd.read_pickle("all_venues_fun.pkl")

In [81]:
venue_details = pd.read_pickle("venue_details.pkl")
venue_details.shape

(2377, 8)

In [82]:
#  This is basically all the venues in either NYC or TOR that have detail data
venues_with_details = venue_details.merge(all_venues_fun, on="Venue_Id", how="inner")
venues_with_details.shape

(2918, 17)

In [86]:
# venues_with_details has venue details outside of my top 10 neighborhoods from each city
# df[df['col1'].isin(['a', 'c', 'h'])]

top_tor = ['First Canadian Place, Underground city','Commerce Court, Victoria Hotel','Harbourfront East, Union Station, Toronto Islands','Toronto Dominion Centre, Design Exchange','Stn A PO Boxes, 25 The Esplanade, Enclave of M5E','Richmond, Adelaide, King','Garden District, Ryerson','Church and Wellesley','St. James Town','Central Bay Street']
top_nyc = ['Murray Hill','South Side','Astoria','East Village','North Side','Yorkville','West Village','Upper West Side','Financial District','Downtown']

topvenuedetails = venues_with_details[((venues_with_details['City'] == 'Toronto')\
                                      & (venues_with_details['Neighborhood'].isin(top_tor)))\
                                      | ((venues_with_details['City'] == 'New York')\
                                      & (venues_with_details['Neighborhood'].isin(top_nyc)))]

In [87]:
# THIS IS SOME GREAT DATA HERE - All venue details for top 10 neighs in Tor and Nyc
topvenuedetails.to_pickle("topvenuedetails.pkl")

In [88]:
# 900 items for nyc
topvenuedetails[topvenuedetails['City']=='New York'].shape

(900, 17)

In [90]:
nyc_details = topvenuedetails[topvenuedetails['City']=='New York']

In [89]:
# 750 items for nyc
topvenuedetails[topvenuedetails['City']=='Toronto'].shape

(750, 17)

In [91]:
tor_details = topvenuedetails[topvenuedetails['City']=='Toronto']

In [80]:
# Save venues_with_details
venues_with_details.to_pickle("venues_with_details.pkl")

In [34]:
tor_details = venues_with_details[(venues_with_details["City"]=="Toronto") & (venues_with_details["Rating"].isnull()==False) & (venues_with_details["Rating"] > 3)]
tor_details[0:50]

Unnamed: 0,Venue_Id,Name,TipsCount,LikesCount,Rating,RatingSignals,PriceTier,VoteSum,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,SimpleCategory,City
0,537befcc498edb1da559269b,Symposium Cafe Restaurant & Lounge,11,30,6.4,52,4.0,123,"Willowdale, South",43.77012,-79.408493,Symposium Cafe Restaurant & Lounge,43.771075,-79.413396,Restaurant,CheapMeal,Toronto
1,4b75ce6af964a5201b262ee3,Subway,2,0,5.9,5,1.0,7,"Alderwood, Long Branch",43.602414,-79.543484,Subway,43.599152,-79.544395,Sandwich Place,CheapMeal,Toronto
6,4b5fcadff964a520eecc29e3,Wingporium,24,44,8.0,69,2.0,181,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Wingporium,43.630275,-79.518169,Wings Joint,CheapMeal,Toronto
7,4bc9f9b6b6c49c7469688f91,South St. Burger,11,25,8.1,35,1.0,96,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,South St. Burger,43.631314,-79.518408,Burger Joint,CheapMeal,Toronto
8,4b1d492af964a520370e24e3,Artisano Bakery Café,24,34,6.4,72,2.0,164,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Artisano Bakery Café,43.631006,-79.518172,Bakery,CoffeeDessert,Toronto
9,4c116455d41e76b09552310d,Subway,0,1,6.2,3,1.0,5,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Subway,43.631659,-79.519001,Sandwich Place,CheapMeal,Toronto
14,4b017e51f964a520d84222e3,LCBO,8,29,7.4,41,,107,"Eringate, Bloordale Gardens, Old Burnhamthorpe...",43.643515,-79.577201,LCBO,43.642099,-79.576592,Liquor Store,Bar,Toronto
15,5cebda3300b068002dca9d81,Starbucks,1,2,6.9,3,1.0,8,"Eringate, Bloordale Gardens, Old Burnhamthorpe...",43.643515,-79.577201,Starbucks,43.641312,-79.576924,Coffee Shop,CoffeeDessert,Toronto
16,4b251b4cf964a520586c24e3,Cafe Sympatico,6,7,6.5,13,1.0,33,"Eringate, Bloordale Gardens, Old Burnhamthorpe...",43.643515,-79.577201,Cafe Sympatico,43.64182,-79.576721,Café,CheapMeal,Toronto
17,4cb4e17052edb1f7745763fe,Pizza Hut,0,1,6.4,1,1.0,3,"Eringate, Bloordale Gardens, Old Burnhamthorpe...",43.643515,-79.577201,Pizza Hut,43.641514,-79.576326,Pizza Place,Ethnic Food,Toronto


<h2>Toronto map with Circle Markers based on Venue Simple Category</h2>

In [92]:
#-79.39868
map_toronto = folium.Map(location=[43.6550, -79.36], zoom_start=14,  
                         min_zoom=9, max_zoom=24)

# add markers to map
for lat, lng, rating, votesum, category, name in zip(tor_details['Venue Latitude'], 
                                           tor_details['Venue Longitude'], 
                                           tor_details['Rating'], 
                                           tor_details['VoteSum'],
                                           tor_details['SimpleCategory'],
                                           tor_details['Name']
                                              ):
    if rating < 6.0:
        ratcol = "#000000"
    elif (rating >= 6.0) & (rating < 7.0):
        ratcol = "#051094"
    elif (rating >= 7.0) & (rating < 8.0):
        ratcol = "#A32CC4"
    elif (rating >= 8.0) & (rating < 9.0):
        ratcol = "#BC544B"
    elif (rating >= 9.0) & (rating <= 10.0):
        ratcol = "#D21404"
        
    label = '{}:{}, Rating:{}, Votes:{}'.format(name,category, rating, votesum)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        weight=4,
        color=ratcol,
        fill=True,
        fill_color=ratcol,
        fill_opacity=1,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

<h2>NYC map with Circle Markers based on Venue Simple Category</h2>

In [93]:
#40.741223327240284, -73.98972094611943
map_nyc = folium.Map(location=[40.741223327240284, -73.98972094611943], zoom_start=14,  
                         min_zoom=9, max_zoom=24)

# add markers to map
for lat, lng, rating, votesum, category, name in zip(nyc_details['Venue Latitude'], 
                                           nyc_details['Venue Longitude'], 
                                           nyc_details['Rating'], 
                                           nyc_details['VoteSum'],
                                           nyc_details['SimpleCategory'],
                                           nyc_details['Name']):
    if (rating < 3.0): 
        ratcol = "#000000"                                 
    elif (rating >= 3.0) & (rating < 6.0):
        ratcol = "#65350F"
    elif (rating >= 6.0) & (rating < 7.0):
        ratcol = "#051094"
    elif (rating >= 7.0) & (rating < 8.0):
        ratcol = "#A32CC4"
    elif (rating >= 8.0) & (rating < 9.0):
        ratcol = "#BC544B"
    elif (rating >= 9.0) & (rating <= 10.0):
        ratcol = "#D21404"
        
    label = '{}:{}, Rating:{}, Votes:{}'.format(name,category, rating, votesum)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        weight=4,
        color=ratcol,
        fill=True,
        fill_color=ratcol,
        fill_opacity=1,
        parse_html=False).add_to(map_nyc)  
    
map_nyc

<h2>NYC map with ICONS based on Venue Simple Category</h2>


In [106]:
#40.741223327240284, -73.98972094611943
map_nyc = folium.Map(location=[40.741223327240284, -73.98972094611943], zoom_start=12,  
                         min_zoom=9, max_zoom=24)

# add markers to map
for lat, lng, rating, votesum, category, name, neighborhood in zip(nyc_details['Venue Latitude'], 
                                           nyc_details['Venue Longitude'], 
                                           nyc_details['Rating'], 
                                           nyc_details['VoteSum'],
                                           nyc_details['SimpleCategory'],
                                           nyc_details['Name'],
                                           nyc_details['Neighborhood']):
    
    if (rating < 3.0): 
        ratcol = 'black'
    elif (rating >= 3.0) & (rating < 6.0):
        ratcol = 'gray'
    elif (rating >= 6.0) & (rating < 7.0):
        ratcol = 'lightblue'
    elif (rating >= 7.0) & (rating < 8.0):
        ratcol = 'blue'
    elif (rating >= 8.0) & (rating < 9.0):
        ratcol = 'lightred'
    elif (rating >= 9.0) & (rating <= 10.0):
        ratcol = 'red' 
    
    if category == "Bar":
        icontype="glass"
    elif category == "Ethnic Food":
        icontype="trophy"
    elif category == "CheapMeal":
        icontype = "star-half-empty"
    elif category == "Recreation":
        icontype = "child"
    elif category == "FunStore":
        icontype == "gift"
    elif category == "Fancy Food":
        icontype = "dollar"
    elif category == "CoffeeDessert":
        icontype = "birthday-cake"

        
    label = '{} - {}:{}, Rating:{}, Votes:{}'.format(neighborhood, name,category, rating, votesum)
    label = folium.Popup(label, parse_html=True)
    
    folium.Marker(
        location=[lat, lng], 
        popup=label,
        #icon=folium.Icon(color=ratcol,icon="fa-utensils", prefix='fa')).add_to(map_nyc)
        icon=folium.Icon(color=ratcol,icon=icontype, prefix='fa')).add_to(map_nyc)
    

    
map_nyc

<h2>Toronto map with ICONS based on Venue Simple Category</h2>

In [105]:
icon_map_toronto = folium.Map(location=[43.6550, -79.38], zoom_start=15,  
                         min_zoom=9, max_zoom=24)

# add markers to map
for lat, lng, rating, votesum, category, name, neighborhood in zip(
                                           tor_details['Venue Latitude'], 
                                           tor_details['Venue Longitude'], 
                                           tor_details['Rating'], 
                                           tor_details['VoteSum'],
                                           tor_details['SimpleCategory'],
                                           tor_details['Name'],
                                           tor_details['Neighborhood']):
    
    # TEMP HACK
    if "Central Bay Street" not in neighborhood:
        continue
        
    # Colors avail:'gray', 'orange', 'cadetblue', 'beige', 'white', 'black', 'darkpurple', 'blue', 
    # 'lightgreen', 'darkgreen', 'lightred', 'pink', 'lightgray', 'green', 'darkred', 'darkblue', 'red', 'lightblue', 'purple'
    if (rating < 3.0): 
        ratcol = 'black' # "#000000"                                 
    elif (rating >= 3.0) & (rating < 6.0):
        ratcol = 'gray' # "#65350F"
    elif (rating >= 6.0) & (rating < 7.0):
        ratcol = 'lightblue' # "#051094"
    elif (rating >= 7.0) & (rating < 8.0):
        ratcol = 'blue' #  "#A32CC4"
    elif (rating >= 8.0) & (rating < 9.0):
        ratcol = 'lightred' # "#BC544B"
    elif (rating >= 9.0) & (rating <= 10.0):
        ratcol = 'red' #'red' #"#D21404"
    
    if category == "Bar":
        icontype="glass"
    elif category == "Ethnic Food":
        icontype="trophy"
    elif category == "CheapMeal":
        icontype = "star-half-empty"
    elif category == "Recreation":
        icontype = "child"
    elif category == "FunStore":
        icontype == "gift"
    elif category == "Fancy Food":
        icontype = "dollar"
    elif category == "CoffeeDessert":
        icontype = "birthday-cake"

        
    label = '{} - {}:{}, Rating:{}, Votes:{}'.format(neighborhood, name,category, rating, votesum)
    label = folium.Popup(label, parse_html=True)
    
    folium.Marker(
        location=[lat, lng], 
        popup=label,
        icon=folium.Icon(color=ratcol,icon=icontype, prefix='fa')).add_to(icon_map_toronto)
    

    
icon_map_toronto


In [1]:
import numpy as np
import pandas as pd

<h2>This pickle file has everything I needed for Mapping, and I will use this now to generate some statistics</h2>

In [3]:
venues_with_details = pd.read_pickle("venues_with_details.pkl")

In [4]:
tor_details = venues_with_details[venues_with_details["City"]=="Toronto"]
nyc_details = venues_with_details[venues_with_details["City"]=="New York"]

<h2>Removing some Columns I don't Need</h2>

In [6]:
tor_details.head(20)
drop_these = ['Neighborhood Latitude', 'Neighborhood Longitude','Venue','Venue Latitude','Venue Longitude', 'Venue_Id']
tor_details.drop(drop_these, axis=1, inplace=True)
nyc_details.drop(drop_these, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [11]:
# Toronto Rating and PriceTier have NaN
# NYC so far has NO nulls in these columns
tor_details.fillna(value=0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [44]:
nyc_details[0:10]

Unnamed: 0,Name,TipsCount,LikesCount,Rating,RatingSignals,PriceTier,VoteSum,Neighborhood,Venue Category,SimpleCategory,City
794,El Pequeno Coffee Shop,14,9,6.1,23,2,55,Jackson Heights,Latin American Restaurant,Ethnic Food,New York
795,Thaan,1,1,0.0,0,2,3,Jackson Heights,Thai Restaurant,Ethnic Food,New York
796,Casa Colombia,3,4,5.8,9,2,20,Jackson Heights,South American Restaurant,Ethnic Food,New York
797,La Gata Golosa Bakery,7,8,5.6,17,1,40,Jackson Heights,South American Restaurant,Ethnic Food,New York
798,Eim Khao Mun Kai Elmhurst อิ่ม ข้าวมันไก่เอ็มเ...,26,99,8.7,128,1,352,Elmhurst,Thai Restaurant,Ethnic Food,New York
799,Lamoon,9,35,8.6,46,2,125,Elmhurst,Thai Restaurant,Ethnic Food,New York
800,Louie's Pizzeria and Restaurant,26,63,9.1,91,1,243,Elmhurst,Pizza Place,Ethnic Food,New York
801,La Fusta,34,55,8.7,87,2,231,Elmhurst,Argentinian Restaurant,Ethnic Food,New York
802,Taste Good Malaysian Cuisine 好味,55,119,8.3,170,1,463,Elmhurst,Malay Restaurant,Ethnic Food,New York
803,Five Loaves and Two Fishes,3,5,8.2,13,1,26,Elmhurst,Chinese Restaurant,Ethnic Food,New York


In [20]:
topvenuedetails = pd.read_pickle("topvenuedetails.pkl")

In [23]:
tvd_groups = topvenuedetails.groupby(["City", "Neighborhood"])

In [25]:
tvd_group_stats = tvd_groups[["TipsCount","LikesCount","Rating","RatingSignals","PriceTier","VoteSum"]].mean()

In [26]:
tvd_group_stats = tvd_group_stats.reset_index()

In [31]:
tvd_group_stats = tvd_group_stats.sort_values(by="Rating", ascending=False,ignore_index=True)

<h1>Nearly all the most highly rated venues are in New York</h1>

In [32]:
tvd_group_stats.head(20)

Unnamed: 0,City,Neighborhood,TipsCount,LikesCount,Rating,RatingSignals,VoteSum
0,New York,West Village,109.267442,350.162791,8.681395,511.302326,1320.895349
1,New York,East Village,111.932584,297.179775,8.64382,472.258427,1178.550562
2,New York,North Side,86.976744,319.790698,8.623256,446.569767,1173.127907
3,New York,South Side,84.311111,270.544444,8.485556,376.8,1002.2
4,New York,Financial District,50.156627,195.048193,8.416867,264.036145,704.289157
5,New York,Upper West Side,60.529412,174.494118,8.247059,262.094118,671.611765
6,New York,Yorkville,28.848837,76.581395,8.096512,121.255814,303.267442
7,New York,Downtown,30.192771,119.096386,8.068675,167.855422,436.240964
8,New York,Murray Hill,39.764228,123.691057,8.021138,182.105691,469.252033
9,Toronto,"Commerce Court, Victoria Hotel",24.448276,74.057471,7.970115,105.528736,278.091954


In [38]:
# tvd_group_stats['Rating'].corr(tvd_group_stats['RatingSignals']) #0.831783606731402
# tvd_group_stats['Rating'].corr(tvd_group_stats['LikesCount']) #0.8276978097445825
nyc_stats = tvd_group_stats[tvd_group_stats['City']=='New York'] #['Rating'].corr(tvd_group_stats['LikesCount'])
tor_stats = tvd_group_stats[tvd_group_stats['City']=='Toronto']

<h2>When we adjust Toronto's venue "votes" to consider it's population, it has more than NYC</h2>

In [43]:
print('Mean NYC Rating: ' + str(nyc_stats['Rating'].mean()))
print('Mean NYC Votes: ' + str(nyc_stats['VoteSum'].mean()))
print('Mean Toronto Rating: ' + str(tor_stats['Rating'].mean()))
print('Mean Toronto Votes: ' + str(tor_stats['VoteSum'].mean()))
print('Mean Toronto Votes (Adusted for Population): ' + str(tor_stats['VoteSum'].mean() * 2.98) )


Mean NYC Rating: 8.325056990939
Mean NYC Votes: 750.0727312011767
Mean Toronto Rating: 7.782194290535228
Mean Toronto Votes: 342.5272607991777
Mean Toronto Votes (Adusted for Population): 1020.7312371815497


In [None]:
# NYC Population 8,804,190 as of April 1, 2020
# TOR Population 2,956,024
# NYC is 2.98 times larger

In [46]:
topvenuedetails.head()

Unnamed: 0,Venue_Id,Name,TipsCount,LikesCount,Rating,RatingSignals,PriceTier,VoteSum,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,SimpleCategory,City
87,52138db911d22803b334c641,Mos Mos Coffee,11,54,8.5,69,1,188,St. James Town,43.651494,-79.375418,Mos Mos Coffee,43.648159,-79.378745,Café,CheapMeal,Toronto
88,52138db911d22803b334c641,Mos Mos Coffee,11,54,8.5,69,1,188,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576,Mos Mos Coffee,43.648159,-79.378745,Café,CheapMeal,Toronto
89,52138db911d22803b334c641,Mos Mos Coffee,11,54,8.5,69,1,188,"Commerce Court, Victoria Hotel",43.648198,-79.379817,Mos Mos Coffee,43.648159,-79.378745,Café,CheapMeal,Toronto
90,52138db911d22803b334c641,Mos Mos Coffee,11,54,8.5,69,1,188,"Stn A PO Boxes, 25 The Esplanade, Enclave of M5E",43.646435,-79.374846,Mos Mos Coffee,43.648159,-79.378745,Café,CheapMeal,Toronto
91,52138db911d22803b334c641,Mos Mos Coffee,11,54,8.5,69,1,188,"First Canadian Place, Underground city",43.648429,-79.38228,Mos Mos Coffee,43.648159,-79.378745,Café,CheapMeal,Toronto


In [56]:
nyc_counts = topvenuedetails[topvenuedetails["City"] == "New York"]
nyc_counts = nyc_counts[["Neighborhood","SimpleCategory"]]

Unnamed: 0,Neighborhood,SimpleCategory
1204,Upper West Side,Entertainment
1205,Upper West Side,Ethnic Food
1206,Upper West Side,Entertainment
1207,Upper West Side,Fancy Food
1208,Upper West Side,Ethnic Food


In [63]:
# Get distinct combinations of neighborhood and city - looks like some city data is wrong
neigh_city = topvenuedetails[["City", "Neighborhood", "SimpleCategory"]]#.sort_values(by=['City','Neighborhood','SimpleCategory'], inplace=True)
neigh_values = neigh_city.value_counts()

<h1>Looking at each neighborhood, and how many venues it has by category</h1>

In [74]:
#neigh_values[neigh_values["City"]=='New York']
#data.groupby('month')['duration'].sum()
pd.set_option('display.max_rows', 500)
neigh_city.groupby(['City','Neighborhood','SimpleCategory'])['SimpleCategory'].count()

City      Neighborhood                                       SimpleCategory
New York  Astoria                                            Bar               14
                                                             CheapMeal         13
                                                             CoffeeDessert     13
                                                             Ethnic Food       38
                                                             Fancy Food         4
                                                             Recreation         7
          Downtown                                           Bar                7
                                                             CheapMeal         13
                                                             CoffeeDessert     11
                                                             Entertainment      4
                                                             Ethnic Food       28
                      

In [75]:
neigh_city.groupby(['SimpleCategory','City','Neighborhood'])['SimpleCategory'].count()

SimpleCategory  City      Neighborhood                                     
Bar             New York  Astoria                                              14
                          Downtown                                              7
                          East Village                                         20
                          Financial District                                    8
                          Murray Hill                                          11
                          North Side                                           17
                          South Side                                           22
                          Upper West Side                                      13
                          West Village                                         15
                          Yorkville                                             9
                Toronto   Central Bay Street                                    2
                      

In [79]:
neigh_city.groupby(['SimpleCategory','City'])['SimpleCategory'].count() \
    .reset_index(name='count') \
    .sort_values(['count'], ascending=False) \
    .head(40)

Unnamed: 0,SimpleCategory,City,count
8,Ethnic Food,New York,346
9,Ethnic Food,Toronto,206
5,CoffeeDessert,Toronto,146
0,Bar,New York,136
4,CoffeeDessert,New York,129
3,CheapMeal,Toronto,128
2,CheapMeal,New York,108
14,Recreation,New York,97
1,Bar,Toronto,81
15,Recreation,Toronto,70


In [78]:
neigh_city.groupby(['SimpleCategory','City','Neighborhood'])['SimpleCategory'].count() \
    .reset_index(name='count') \
    .sort_values(['count'], ascending=False) \
    .head(40)

Unnamed: 0,SimpleCategory,City,Neighborhood,count
83,Ethnic Food,New York,Murray Hill,59
81,Ethnic Food,New York,East Village,40
79,Ethnic Food,New York,Astoria,38
86,Ethnic Food,New York,Upper West Side,36
85,Ethnic Food,New York,South Side,35
88,Ethnic Food,New York,Yorkville,30
84,Ethnic Food,New York,North Side,29
87,Ethnic Food,New York,West Village,29
80,Ethnic Food,New York,Downtown,28
95,Ethnic Food,Toronto,"Richmond, Adelaide, King",26


<h1>Average Rating By Neighborhood</h1>

In [81]:
neigh_rating = topvenuedetails[["City", "Neighborhood", "Rating"]]
neigh_rating.groupby(['City','Neighborhood'])['Rating'].mean() \
    .reset_index(name='mean') \
    .sort_values(['mean'], ascending=False) \
    .head(40)

Unnamed: 0,City,Neighborhood,mean
8,New York,West Village,8.681395
2,New York,East Village,8.64382
5,New York,North Side,8.623256
6,New York,South Side,8.485556
3,New York,Financial District,8.416867
7,New York,Upper West Side,8.247059
9,New York,Yorkville,8.096512
1,New York,Downtown,8.068675
4,New York,Murray Hill,8.021138
12,Toronto,"Commerce Court, Victoria Hotel",7.970115


In [92]:
neigh_diversity = topvenuedetails[["City", "Neighborhood", "SimpleCategory", "Venue Category"]]
diversity = neigh_diversity.groupby(['City','Neighborhood', "SimpleCategory"])['Venue Category'] \
    .apply(lambda x: len(np.unique(x))).to_frame()


In [94]:
diversity.reset_index(inplace=True)

In [96]:
diversity.sort_values(by="Venue Category", ascending=False)

Unnamed: 0,City,Neighborhood,SimpleCategory,Venue Category
56,New York,Upper West Side,Ethnic Food,23
33,New York,Murray Hill,Ethnic Food,23
10,New York,Downtown,Ethnic Food,22
18,New York,East Village,Ethnic Food,22
3,New York,Astoria,Ethnic Food,22
48,New York,South Side,Ethnic Food,21
128,Toronto,"Richmond, Adelaide, King",Ethnic Food,18
40,New York,North Side,Ethnic Food,17
88,Toronto,Church and Wellesley,Ethnic Food,15
136,Toronto,St. James Town,Ethnic Food,15


### Importing Needed packages

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import numpy as np
%matplotlib inline

### Loading data, and filtering out data with venues but in wrong neighborhoods

In [3]:
venues_with_details = pd.read_pickle("venues_with_details.pkl")
core_df = venues_with_details[['City','Neighborhood', 'SimpleCategory','TipsCount', 'LikesCount', 'Rating', 'PriceTier']]
nyc_neighs = ['Astoria','Downtown ','East Village','Financial District','Murray Hill','North Side','South Side','Upper West Side','West Village','Yorkville']
tor_neighs = ['Central Bay Street','Church and Wellesley','Commerce Court, Victoria Hotel','First Canadian Place, Underground city','Garden District, Ryerson','Harbourfront East, Union Station, Toronto Islands','Richmond, Adelaide, King','St. James Town','Stn A PO Boxes, 25 The Esplanade, Enclave of M5E','Toronto Dominion Centre, Design Exchange']
core_df = core_df[((core_df['City']=='New York') \
                 & (core_df['Neighborhood'].isin(nyc_neighs))) \
                 | ((core_df['City']=='Toronto') \
                 & (core_df['Neighborhood'].isin(tor_neighs))) ]

### Create filters for PriceTiers

In [11]:
# Filter out data where PriceTier or Rating are na
# use tilde in front of filter to negate this
yes_price_tier_filt = (core_df['Rating'].isnull()==False) & (core_df['PriceTier'].isnull()==False) & (core_df['PriceTier'] > 0)
no_price_tier_filt = (core_df['Rating'].isnull()==False) & ((core_df['PriceTier'].isnull()==True) | (core_df['PriceTier'] == 0))

price_df = core_df[yes_price_tier_filt]
no_price_df = core_df[no_price_tier_filt]

### Do we have any nulls? We know we have them for no_price_df

In [12]:
price_df[price_df.isna().any(axis=1)]

Unnamed: 0,City,Neighborhood,SimpleCategory,TipsCount,LikesCount,Rating,PriceTier


In [13]:
no_price_df[no_price_df.isna().any(axis=1)]
# 88 Prices we want to predict

Unnamed: 0,City,Neighborhood,SimpleCategory,TipsCount,LikesCount,Rating,PriceTier
97,Toronto,St. James Town,FunStore,3,19,8.4,
99,Toronto,"Stn A PO Boxes, 25 The Esplanade, Enclave of M5E",FunStore,3,19,8.4,
104,Toronto,"Toronto Dominion Centre, Design Exchange",Entertainment,44,264,7.5,
105,Toronto,"Stn A PO Boxes, 25 The Esplanade, Enclave of M5E",Entertainment,44,264,7.5,
112,Toronto,"Commerce Court, Victoria Hotel",Recreation,5,7,8.0,
...,...,...,...,...,...,...,...
677,Toronto,"Toronto Dominion Centre, Design Exchange",CheapMeal,15,70,7.6,
678,Toronto,"First Canadian Place, Underground city",CheapMeal,15,70,7.6,
682,Toronto,"Toronto Dominion Centre, Design Exchange",Entertainment,0,5,6.8,
687,Toronto,"Garden District, Ryerson",CoffeeDessert,2,7,6.9,


### Create one-hot encoding for neighborhoods

In [14]:
# Since dummies will prefix the column name, shorten string by changing the name to "Cat"
core_df.rename(columns={'SimpleCategory':'Cat'}, inplace=True)

In [15]:
dummies_df = pd.get_dummies(core_df, columns=['City','Neighborhood','Cat'], drop_first=True)

In [16]:
price_df = dummies_df[yes_price_tier_filt]
no_price_df = dummies_df[no_price_tier_filt]

### Lists of columns for multiple linear regression

In [25]:
# Do not include 'PriceTier'
columns = ['TipsCount','LikesCount','Rating','City_Toronto','Neighborhood_Central Bay Street','Neighborhood_Church and Wellesley','Neighborhood_Commerce Court, Victoria Hotel','Neighborhood_East Village','Neighborhood_Financial District','Neighborhood_First Canadian Place, Underground city','Neighborhood_Garden District, Ryerson','Neighborhood_Harbourfront East, Union Station, Toronto Islands','Neighborhood_Murray Hill','Neighborhood_North Side','Neighborhood_Richmond, Adelaide, King','Neighborhood_South Side','Neighborhood_St. James Town','Neighborhood_Stn A PO Boxes, 25 The Esplanade, Enclave of M5E','Neighborhood_Toronto Dominion Centre, Design Exchange','Neighborhood_Upper West Side','Neighborhood_West Village','Neighborhood_Yorkville','Cat_CheapMeal','Cat_CoffeeDessert','Cat_Entertainment','Cat_Ethnic Food','Cat_Fancy Food','Cat_FunStore','Cat_Recreation']

In [None]:
from sklearn import linear_model

In [26]:
regr = linear_model.LinearRegression()
x = np.asanyarray(price_df[columns])
y = np.asanyarray(price_df[['PriceTier']])
regr.fit (x, y)
# The coefficients
print ('Coefficients: ', regr.coef_)

Coefficients:  [[ 6.75136551e-04  2.41308131e-04  7.75082510e-02  3.00759356e-01
   7.39359050e-02 -1.13685256e-03  1.39490743e-01  8.52406156e-02
   4.27930889e-01  4.01063812e-02 -2.88683758e-01 -2.81365386e-02
   3.34994425e-01 -6.72610759e-02  1.69199005e-01  3.75455627e-02
   1.13179675e-01  3.91052335e-02  4.36995633e-02  2.97631441e-01
   5.58862378e-01  2.70617022e-01 -6.26050705e-01 -9.00491778e-01
   1.05027400e+00 -6.23640029e-02  6.79064749e-01  0.00000000e+00
   0.00000000e+00]]


### Make coefficients easier to read

In [29]:
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})

In [47]:
coef_cols = pd.DataFrame(train_coef, columns=['Coef'])
coef_cols['Source'] = columns
coef_cols.sort_values(by=['Coef'], axis=0, ascending=False, inplace=True)
print(coef_cols)

        Coef                                             Source
24  1.050274                                  Cat_Entertainment
26  0.679065                                     Cat_Fancy Food
20  0.558862                          Neighborhood_West Village
8   0.427931                    Neighborhood_Financial District
12  0.334994                           Neighborhood_Murray Hill
3   0.300759                                       City_Toronto
19  0.297631                       Neighborhood_Upper West Side
21  0.270617                             Neighborhood_Yorkville
14  0.169199              Neighborhood_Richmond, Adelaide, King
6   0.139491        Neighborhood_Commerce Court, Victoria Hotel
16  0.113180                        Neighborhood_St. James Town
7   0.085241                          Neighborhood_East Village
2   0.077508                                             Rating
4   0.073936                    Neighborhood_Central Bay Street
18  0.043700  Neighborhood_Toronto Domin

In [30]:
arr2 = np.sort(regr.coef_, axis = -1)        
print(arr2)


[[-0.900 -0.626 -0.289 -0.067 -0.062 -0.028 -0.001 0.000 0.000 0.000
  0.001 0.038 0.039 0.040 0.044 0.074 0.078 0.085 0.113 0.139 0.169 0.271
  0.298 0.301 0.335 0.428 0.559 0.679 1.050]]


In [49]:
y_hat= regr.predict(no_price_df[columns])


### I want to add the predictions to the no price data

In [50]:
no_price_df['PriceTier'] = y_hat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_price_df['PriceTier'] = y_hat


In [54]:
price_filt = no_price_df['PriceTier'] > 3

no_price_df[price_filt].sort_values(by=['PriceTier'], ascending=False).head(25)

Unnamed: 0,TipsCount,LikesCount,Rating,PriceTier,City_Toronto,Neighborhood_Central Bay Street,Neighborhood_Church and Wellesley,"Neighborhood_Commerce Court, Victoria Hotel",Neighborhood_East Village,Neighborhood_Financial District,...,Neighborhood_Upper West Side,Neighborhood_West Village,Neighborhood_Yorkville,Cat_CheapMeal,Cat_CoffeeDessert,Cat_Entertainment,Cat_Ethnic Food,Cat_Fancy Food,Cat_FunStore,Cat_Recreation
154,273,2191,8.7,3.929128,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
155,273,2191,8.7,3.924534,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1836,300,2217,8.7,3.881795,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
153,273,2191,8.7,3.857292,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1825,275,1439,8.8,3.68493,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1859,177,844,9.2,3.568563,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1398,37,140,8.8,3.497028,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
343,79,407,8.7,3.463449,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
319,42,318,8.7,3.446701,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
331,44,222,8.6,3.417135,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
