# Introduction/Business Problem

I will soon be moving to London to start a new job, and I would like to build a recommender system that suggests areas to live in based on user-supplied criteria (e.g. types of nearby venues, average rent prices).

The stakeholders will be people who are moving to London and want to find a neighbourhood that fits their lifestyle.

# Data

I will need the following data:

- Foursquare data on nearby venues
- User criteria
- Average rent prices by postcode

# Gathering Data

In [256]:
import os
import re
import time

import folium
import numpy as np
import pandas as pd
import requests
from geopy.geocoders import Nominatim

### Data on Rent Prices in London by Area

In [2]:
# Scrape the rent tables from the Rent Barometer "all prices, by name" page.
# pd.read_html returns a list with one DataFrame per HTML table it finds.
RENT_PRICES_URL = 'https://www.rentbarometer.com/london/all-prices/by-name.html'
dataframes = pd.read_html(RENT_PRICES_URL)

In [3]:
# Stack the scraped tables into one frame and renumber the rows 0..n-1.
prices = pd.concat(dataframes, ignore_index=True)

In [4]:
prices.head()

Unnamed: 0,Place,Studios,One beds,Two beds,Three beds,Four beds,Five beds
0,"Acton, W3",£232,£302,£372,£513,£618,
1,"Anerley, SE20",£208,£300,£340,£444,,
2,"Angel, EC1V",£387,£439,£578,£707,£778,
3,"Baker street, NW1",£330,£482,£667,"£1,171","£1,350",
4,"Balham, SW12",£267,£348,£435,£507,£622,"£1,063"


### Cleaning Prices Data

In [5]:
# "Place" looks like "Acton, W3": the area name, a comma, then the postcode.
# The first comma-separated part is the area and the last one is the postcode.
# Both parts are stripped, otherwise the postcode keeps the space that follows
# the comma (" W3") and the address ends up with a double space.
# The guard makes the cell safe to re-run after 'Place' has been dropped.
if 'Place' in prices.columns:
    place_parts = prices['Place'].str.split(',')
    prices['Area'] = place_parts.str[0].str.strip()
    prices['Postcode'] = place_parts.str[-1].str.strip()
    # Free-text address handed to the geocoder, e.g. "Acton, W3, London".
    prices['Address'] = prices['Area'] + ', ' + prices['Postcode'] + ', London'
    prices = prices.drop(columns='Place')

In [6]:
def clean_prices(x):
    """Convert a scraped price such as '£1,171' into a float.

    Parameters
    ----------
    x : str, float or None
        Raw cell value. The pound sign and thousands separators are removed
        before conversion.

    Returns
    -------
    float or None
        The numeric price. Missing inputs (``None`` or NaN) are returned
        unchanged, and a blank string yields NaN.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    # `x != x` is only true for NaN, so missing values pass straight through
    # instead of relying on float('nan') happening to parse.
    if x is None or x != x:
        return x
    text = str(x).replace('£', '').replace(',', '').strip()
    # A blank cell carries no price: report it as missing rather than crash.
    return float(text) if text else float('nan')

In [7]:
# Every column that is not an identifier/location column holds a rent price.
# Selecting by name instead of by position keeps this cell correct when it is
# re-run after further columns (e.g. Latitude/Longitude) have been appended.
non_price_columns = ['Place', 'Area', 'Postcode', 'Address', 'Latitude', 'Longitude']
property_types = [column for column in prices.columns if column not in non_price_columns]

# Column-wise Series.map is used instead of DataFrame.applymap, which is
# deprecated in recent pandas releases.
prices[property_types] = prices[property_types].apply(
    lambda column: column.map(clean_prices)
)

In [8]:
prices.dtypes

Studios       float64
One beds      float64
Two beds      float64
Three beds    float64
Four beds     float64
Five beds     float64
Area           object
Postcode       object
Address        object
dtype: object

In [9]:
# Keep only the rows that have an address to geocode.
prices = prices.dropna(subset=['Address'])

In [10]:
prices.head()

Unnamed: 0,Studios,One beds,Two beds,Three beds,Four beds,Five beds,Area,Postcode,Address
0,232.0,302.0,372.0,513.0,618.0,,Acton,W3,"Acton, W3, London"
1,208.0,300.0,340.0,444.0,,,Anerley,SE20,"Anerley, SE20, London"
2,387.0,439.0,578.0,707.0,778.0,,Angel,EC1V,"Angel, EC1V, London"
3,330.0,482.0,667.0,1171.0,1350.0,,Baker street,NW1,"Baker street, NW1, London"
4,267.0,348.0,435.0,507.0,622.0,1063.0,Balham,SW12,"Balham, SW12, London"


### Get coordinates of each area

In [11]:
def _geocode_or_nan(geolocator, query):
    """Return (latitude, longitude) for `query`, or (nan, nan) if it cannot be resolved."""
    try:
        location = geolocator.geocode(query)
    except Exception as error:
        # `Exception` (not a bare except) so that Ctrl-C / kernel interrupt
        # still stops a long geocoding run.
        print(f'Geocoding failed for {query!r}: {error}')
        return np.nan, np.nan
    # geocode() returns None when the service finds no match.
    if location is None:
        return np.nan, np.nan
    return location.latitude, location.longitude


def get_coordinates(address):
    """Geocode an address with Nominatim, falling back to its postcode part.

    Parameters
    ----------
    address : str
        Address of the form "<area>, <postcode>, London".

    Returns
    -------
    tuple of (float, float)
        ``(latitude, longitude)``. Both are NaN when neither the full address
        nor the fallback query can be geocoded, or when `address` is not a
        string.
    """
    if not isinstance(address, str):
        return np.nan, np.nan

    geolocator = Nominatim(user_agent='london_explorer')

    latitude, longitude = _geocode_or_nan(geolocator, address)
    if np.isnan(latitude):
        # Drop the leading area name and retry with "<postcode>, London".
        postcode = ', '.join(address.split(', ')[1:])
        if postcode:
            latitude, longitude = _geocode_or_nan(geolocator, postcode)

    return latitude, longitude

In [12]:
# Geocode every address once and collect the results, then attach both
# coordinate columns in one go instead of writing cell-by-cell with .loc.
coordinates = []
for address in prices['Address']:
    coordinates.append(get_coordinates(address))
    # Nominatim's public service asks for at most one request per second.
    time.sleep(1)

prices['Latitude'] = [lat for lat, _ in coordinates]
prices['Longitude'] = [lng for _, lng in coordinates]

In [13]:
prices.to_csv('prices.csv', index=False)

### Get FourSquare Data on density of venues

In [43]:
# Foursquare credentials are read from the environment so that they never end
# up in the notebook or in version control. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hardcoded in this cell has been
# published with the notebook and should be revoked and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET')
if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
        'environment variables before running this notebook.'
    )

VERSION = '20180323'  # sent as the `v` (API version date) query parameter
radius = 2000  # sent as the `radius` query parameter (metres, per the Foursquare API docs)
LIMIT = 100  # sent as the `limit` query parameter: max venues requested per area

In [44]:
def getNearbyVenues(names, latitudes, longitudes, radius=radius):
    """Query the Foursquare "explore" endpoint around each area and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Area name and its coordinates; iterated in lockstep.
    radius : int, optional
        Search radius sent to the API. Defaults to the notebook-level `radius`.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Area', 'Area Latitude',
        'Area Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. The frame is empty (but still has these columns)
        when no area returned any venue.

    Notes
    -----
    Areas whose coordinates are missing, whose request fails, or whose
    response lacks the expected structure are reported with a printed message
    and skipped, so one bad area does not abort the whole run.
    """
    columns = ['Area',
               'Area Latitude',
               'Area Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    explore_url = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Areas that could not be geocoded carry NaN coordinates; do not spend
        # an API call on them.
        if pd.isna(lat) or pd.isna(lng):
            print(f'No results found for {name}')
            continue

        # Passing the query as `params` lets requests handle the URL encoding
        # and keeps the credentials out of a hand-built URL string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': f'{lat},{lng}',
            'radius': radius,
            'limit': LIMIT,
        }

        try:
            response = requests.get(explore_url, params=params, timeout=30)
            items = response.json()['response']['groups'][0]['items']
            area_rows = []
            for item in items:
                venue = item['venue']
                # Some venues come back without a category; keep them with a
                # missing category instead of failing on categories[0].
                categories = venue.get('categories') or []
                area_rows.append((
                    name,
                    lat,
                    lng,
                    venue['name'],
                    venue['location']['lat'],
                    venue['location']['lng'],
                    categories[0]['name'] if categories else None,
                ))
        except requests.RequestException as error:
            print(f'Request failed for {name}: {error}')
            continue
        except (KeyError, IndexError, ValueError):
            # Missing keys / empty groups / undecodable body: nothing usable.
            print(f'No results found for {name}')
            continue

        rows.extend(area_rows)

    # Supplying `columns` here keeps the schema even when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [45]:
# Collect the venues around every area in the prices table.
london_venues = getNearbyVenues(
    prices['Area'],
    prices['Latitude'],
    prices['Longitude'],
)

No results found for Boston Manor
No results found for St James's Park
No results found for The City
No results found for Walworth


In [46]:
london_venues.head()

Unnamed: 0,Area,Area Latitude,Area Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Acton,51.50814,-0.273261,London Star Hotel,51.509624,-0.272456,Hotel
1,Acton,51.50814,-0.273261,Acton Centre,51.506608,-0.266878,Gym / Fitness Center
2,Acton,51.50814,-0.273261,MrBakeme,51.508452,-0.268543,Creperie
3,Acton,51.50814,-0.273261,Dragonfly Brewery at George & Dragon,51.507378,-0.271702,Brewery
4,Acton,51.50814,-0.273261,The Aeronaut,51.508376,-0.275216,Pub


In [47]:
london_venues.to_csv('london_venues.csv', index=False)

### Creating Dummy variables for London venue types

In [48]:
# One-hot encode the venue category: one indicator column per category,
# named exactly after the category (no prefix).
london_onehot = pd.get_dummies(london_venues[['Venue Category']], prefix="", prefix_sep="")

# Put the area name in front so each row still says which area the venue belongs to.
london_onehot.insert(0, 'Area', london_venues['Area'])

london_onehot.head()

Unnamed: 0,Area,Afghan Restaurant,African Restaurant,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Arcade,Arepa Restaurant,Argentinian Restaurant,...,Windmill,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,Acton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Acton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Acton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Acton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Acton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Grouping by Area

In [49]:
# Average the indicators per area: each value becomes the share of that
# area's returned venues that fall in the category.
london_grouped = london_onehot.groupby('Area', as_index=False).mean()

In [50]:
london_grouped.head()

Unnamed: 0,Area,Afghan Restaurant,African Restaurant,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Arcade,Arepa Restaurant,Argentinian Restaurant,...,Windmill,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,Acton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.01,0.0,0.0
1,Anerley,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Angel,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0
3,Baker street,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,...,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.02
4,Balham,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.02,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0


### Merge with London Rent Prices

In [51]:
# Inner join on the area name: areas without any venue data drop out here.
london_areas = pd.merge(prices, london_grouped, on='Area', how='inner')

In [52]:
london_areas.head()

Unnamed: 0,Studios,One beds,Two beds,Three beds,Four beds,Five beds,Area,Postcode,Address,Latitude,...,Windmill,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,232.0,302.0,372.0,513.0,618.0,,Acton,W3,"Acton, W3, London",51.50814,...,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.01,0.0,0.0
1,208.0,300.0,340.0,444.0,,,Anerley,SE20,"Anerley, SE20, London",51.407599,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,387.0,439.0,578.0,707.0,778.0,,Angel,EC1V,"Angel, EC1V, London",51.526708,...,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0
3,330.0,482.0,667.0,1171.0,1350.0,,Baker street,NW1,"Baker street, NW1, London",51.524767,...,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.02
4,267.0,348.0,435.0,507.0,622.0,1063.0,Balham,SW12,"Balham, SW12, London",51.445645,...,0.0,0.02,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0


In [86]:
# Every column after the leading 'Area' column is a venue category name.
venue_names = london_grouped.columns[1:].tolist()

### Set some criteria

In [283]:
criteria = {'price_max': 250,
            'property_type': 'Studios',
            'must_have_venues' : ['Gym', 'Restaurant', 'Club', 'Bar']}

In [195]:
# Keep the areas whose rent for the chosen property type is strictly below
# the budget; areas with no price for that type compare as False and drop out.
rent_for_type = london_areas[criteria['property_type']]
property_filt = rent_for_type.lt(criteria['price_max'])

filtered_areas = london_areas.loc[property_filt].reset_index(drop=True)

In [244]:
def get_similar_venues(criteria, venue_names, match_whole_words=False):
    """Map each must-have venue keyword to the venue categories that mention it.

    Parameters
    ----------
    criteria : dict
        Must contain 'must_have_venues', a list of keywords such as
        ['Gym', 'Bar'].
    venue_names : list of str
        Venue category names to search through.
    match_whole_words : bool, optional
        ``False`` (default) keeps the original case-insensitive substring
        match, which also returns look-alikes: 'Bar' matches
        'Salon / Barbershop'. ``True`` only accepts categories in which the
        keyword appears as a whole word, which removes such false positives
        but also stops 'Club' from matching 'Nightclub'.

    Returns
    -------
    dict
        ``{keyword: [matching category names]}``, with the categories in the
        order they appear in `venue_names`. A keyword without matches maps to
        an empty list.
    """
    similar_venues_dict = {}

    for venue in criteria['must_have_venues']:
        if match_whole_words:
            # re.escape so that punctuation in the keyword is taken literally.
            pattern = re.compile(r'\b' + re.escape(venue) + r'\b', re.IGNORECASE)
            matches = [name for name in venue_names if pattern.search(name)]
        else:
            keyword = venue.lower()
            matches = [name for name in venue_names if keyword in name.lower()]
        similar_venues_dict[venue] = matches

    return similar_venues_dict


In [245]:
similar_venues_dict = get_similar_venues(criteria, venue_names)

In [284]:
boolean_list = []
for venue, similar_venues in similar_venues_dict.items():
    # True wherever an area has a non-zero share of a category that is
    # similar to this 'Must Have' venue.
    boolean_df = filtered_areas[similar_venues].gt(0)
    # Count the similar categories present in each area (row-wise sum of the
    # Booleans). A count of 0 means the area has no similar venue at all.
    boolean_list.append(boolean_df.sum(axis=1))

In [219]:
# Example Boolean Dataframe
boolean_df.head()

Unnamed: 0,Bar,Beer Bar,Cocktail Bar,Gay Bar,Hookah Bar,Hotel Bar,Juice Bar,Sake Bar,Salon / Barbershop,Sports Bar,Whisky Bar,Wine Bar
0,False,False,False,False,False,False,False,False,False,False,False,False
1,True,False,True,False,False,False,False,False,False,False,False,False
2,True,True,True,False,False,True,False,False,False,False,False,True
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False


In [220]:
# Example of a series which is the sum of each row in the Boolean Dataframe.
# It is computed here because no earlier cell defines `boolean_df_sum`; on a
# fresh kernel the bare name would raise a NameError.
boolean_df_sum = boolean_df.sum(axis=1)
boolean_df_sum

0     0
1     2
2     5
3     0
4     0
5     2
6     3
7     0
8     2
9     2
10    3
11    2
12    4
13    3
14    4
15    4
16    5
17    2
18    2
19    4
20    4
21    2
22    3
23    1
24    4
25    1
dtype: int64

In [228]:
# Put the per-venue counts side by side (one column per 'Must Have' venue)
# and mark with True every area/venue pair whose count is zero, i.e. where
# the venue type is missing from the area.
per_venue_counts = pd.concat(boolean_list, axis=1)
venues_boolean_df = per_venue_counts.eq(0)

venues_boolean_df.head()

Unnamed: 0,0,1,2,3
0,False,False,False,True
1,False,False,True,False
2,False,False,False,False
3,False,False,True,True
4,False,False,False,True


In [229]:
# Add a Boolean Sum column to our Filtered Areas dataframe to easily filter
# for areas which match our criteria.
# In venues_boolean_df, True means "this must-have venue is missing", so the
# row-wise sum is the number of missing must-have venues per area and a value
# of 0 means that every venue criterion is satisfied.
filtered_areas['boolean_sum'] = venues_boolean_df.sum(axis=1)

In [240]:
# An area qualifies only when none of the must-have venues is missing.
meets_all_criteria = filtered_areas['boolean_sum'].eq(0)
suggested_areas = filtered_areas.loc[meets_all_criteria]
suggested_areas.head()

Unnamed: 0,Studios,One beds,Two beds,Three beds,Four beds,Five beds,Area,Postcode,Address,Latitude,...,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit,boolean_sum
2,248.0,335.0,442.0,684.0,809.0,1473.0,Brook Green,W6,"Brook Green, W6, London",51.496021,...,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
14,248.0,272.0,347.0,374.0,,,Lewisham,SE13,"Lewisham, SE13, London",51.465633,...,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0
18,232.0,274.0,352.0,505.0,585.0,,Southfields,SE14,"Southfields, SE14, London",51.445775,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0
20,246.0,356.0,411.0,480.0,526.0,369.0,Stratford,E15,"Stratford, E15, London",51.541289,...,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0
21,213.0,288.0,345.0,425.0,537.0,,Streatham,SW16,"Streatham, SW16, London",51.429769,...,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [241]:
suggested_areas[similar_venues_dict['Gym']]

Unnamed: 0,Boxing Gym,Climbing Gym,Gym,Gym / Fitness Center,Gym Pool,Gymnastics Gym
2,0.0,0.0,0.0,0.03,0.0,0.0
14,0.0,0.0,0.01,0.02,0.0,0.0
18,0.0,0.0,0.03,0.02,0.0,0.0
20,0.0,0.0,0.01,0.01,0.0,0.0
21,0.0,0.0,0.0,0.03,0.0,0.0


In [243]:
suggested_areas[similar_venues_dict['Bar']]

Unnamed: 0,Bar,Beer Bar,Cocktail Bar,Gay Bar,Hookah Bar,Hotel Bar,Juice Bar,Sake Bar,Salon / Barbershop,Sports Bar,Whisky Bar,Wine Bar
2,0.02,0.02,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.01
14,0.02,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01
18,0.03,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20,0.04,0.02,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0
21,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01


## Let's create a function that suggests areas given a set of criteria

In [246]:
def get_suggested_areas(criteria: dict, venue_names: list, areas=None):
    """Return the areas that fit the budget and contain every must-have venue type.

    Parameters
    ----------
    criteria : dict
        Must be in this form:

        {'price_max': 250,
        'property_type': 'Studios',
        'must_have_venues' : ['Gym', 'Restaurant', 'Club', 'Bar']}

        'property_type' has to be one of the price columns of the areas
        table. The price comparison is strict: rent < price_max.
    venue_names : list
        Venue category column names used to look up categories similar to
        each must-have venue.
    areas : pandas.DataFrame, optional
        Table holding the price columns and the venue category columns.
        Defaults to the notebook-level `london_areas`.

    Returns
    -------
    pandas.DataFrame
        The qualifying rows, indexed by their position in the price-filtered
        table, with an extra 'boolean_sum' column (number of missing
        must-have venue types, always 0 in the returned rows).

    Raises
    ------
    KeyError
        If a required key is missing from `criteria` or 'property_type' is
        not a column of the areas table.
    """
    if areas is None:
        areas = london_areas

    property_filt = areas[criteria['property_type']] < criteria['price_max']
    filtered_areas = areas[property_filt].reset_index(drop=True)

    similar_venues_dict = get_similar_venues(criteria, venue_names)

    # Count, per area, how many must-have venue types have no similar category
    # present. Starting from zero also covers an empty 'must_have_venues'
    # list, for which every budget-compatible area qualifies.
    missing_venue_count = pd.Series(0, index=filtered_areas.index)
    for similar_venues in similar_venues_dict.values():
        similar_present = (filtered_areas[similar_venues] > 0).sum(axis=1)
        missing_venue_count += similar_present.eq(0).astype(int)

    filtered_areas['boolean_sum'] = missing_venue_count
    suggested_areas = filtered_areas[filtered_areas['boolean_sum'] == 0]

    return suggested_areas

    

In [253]:
criteria = {'price_max': 300,
            'property_type': 'One beds',
            'must_have_venues' : ['Gym', 'Restaurant', 'Store', 'Bar']}

In [270]:
suggested_areas = get_suggested_areas(criteria, venue_names)

suggested_areas.head()

Unnamed: 0,Studios,One beds,Two beds,Three beds,Four beds,Five beds,Area,Postcode,Address,Latitude,...,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit,boolean_sum
0,,296.0,368.0,486.0,561.0,,Bow,E3,"Bow, E3, London",51.530938,...,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,,284.0,348.0,465.0,531.0,,Brockley,SE4,"Brockley, SE4, London",51.464503,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,,248.0,325.0,370.0,485.0,,Bromley,BR1,"Bromley, BR1, London",51.402805,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,,287.0,369.0,438.0,513.0,,Charlton,SE3,"Charlton, SE3, London",51.486755,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,209.0,292.0,334.0,385.0,606.0,531.0,Crystal Palace,SE19,"Crystal Palace, SE19, London",51.421406,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


## Plot the points on a map!

In [281]:
# Centre point of the map: these two values are passed to plot_map, which
# uses them as the initial `location` of the folium map.
latitude = 51.4975
longitude = -0.1357

def plot_map(latitude, longitude, df, label_column, zoom_start=11):
    """Draw one blue circle marker per row of `df` on a folium map.

    Parameters
    ----------
    latitude, longitude : float
        Point on which the map is initially centred.
    df : pandas.DataFrame
        Must contain 'Latitude' and 'Longitude' columns plus `label_column`.
    label_column : str
        Column whose value is shown in each marker's popup.
    zoom_start : int, optional
        Initial zoom level of the map (default 11).

    Returns
    -------
    folium.Map
        The map with all markers added.
    """
    # create map
    map_clusters = folium.Map(location=[latitude, longitude], zoom_start=zoom_start)

    # folium refuses NaN locations, so rows without coordinates are left off
    # the map instead of aborting the whole plot.
    plottable = df.dropna(subset=['Latitude', 'Longitude'])

    # add markers to the map
    marker_rows = zip(plottable['Latitude'], plottable['Longitude'], plottable[label_column])
    for lat, lon, point in marker_rows:
        label = folium.Popup(str(point), parse_html=True)
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color='blue',
            fill=True,
            fill_color='blue',
            fill_opacity=1).add_to(map_clusters)

    return map_clusters

In [282]:
plot_map(latitude, longitude, suggested_areas, 'Area')