Content-Based Recommender System

Advantages:
* Able to make recommendation even with insufficient rating data
* Recommendations can still be made even if there is no historical ratings for a particular item

Disadvantages:
* Specific to user needs, and does not account for community view.
* Not effective for making decisions for new users
* Requires user to have large number of ratings available for the target user to prevent overfitting

In [4]:
import yelp_api
import pandas as pd
from tqdm import tqdm

In [5]:
# list of locations - Singapore
locations = [
  "Bishan",
  "Bishan East",
  "Marymount",
  "Upper Thomson",
  "Bukit Merah",
  "Alexandra Hill",
  "Alexandra North",
  "Bukit Ho Swee",
  "Bukit Merah",
  "Depot Road",
  "Everton Park",
  "Henderson Hill",
  "Kampong Tiong Bahru",
  "Maritime Square",
  "HarbourFront",
  "Singapore General Hospital",
  "Telok Blangah",
  "Telok Blangah",
  "Telok Blangah",
  "Tiong Bahru",
  "Tiong Bahru",
  "Bukit Timah",
  "Holland Road",
  "Downtown Core",
  "Anson",
  "Bayfront Subzone",
  "Bugis",
  "City Hall",
  "Clifford Pier",
  "Marina Centre",
  "Raffles Place",
  "Tanjong Pagar",
  "Geylang",
  "Aljunied",
  "Geylang East",
  "Kampong Ubi",
  "MacPherson",
  "Kallang",
  "Bendemeer",
  "Boon Keng",
  "Crawford",
  "Geylang Bahru",
  "Kallang Bahru",
  "Kampong Bugis",
  "Kampong Java",
  "Lavender",
  "Tanjong Rhu",
  "Marina East",
  "Marina East",
  "Marina South",
  "Marina South",
  "Marine Parade",
  "East Coast",
  "Katong",
  "Mountbatten",
  "Museum",
  "Bras Basah",
  "Dhoby Ghaut",
  "Fort Canning",
  "Newton",
  "Goodwood Park",
  "Istana Negara",
  "Novena",
  "Balestier",
  "Orchard",
  "Outram",
  "Chinatown",
  "Pearl's Hill",
  "Queenstown",
  "Commonwealth",
  "Dover",
  "Ghim Moh",
  "Holland Drive",
  "Kent Ridge",
  "National University of Singapore",
  "one-north",
  "Pasir Panjang",
  "Pasir Panjang",
  "Port",
  "Singapore Polytechnic",
  "River Valley",
  "Rochor",
  "Bencoolen",
  "Farrer Park",
  "Kampong Glam",
  "Little India",
  "Mount Emily",
  "Rochor Canal",
  "Sungei Road",
  "Victoria",
  "Singapore River",
  "Boat Quay",
  "Clarke Quay",
  "Robertson Quay",
  "Southern Islands",
  "Sentosa",
  "Straits View",
  "Straits View",
  "Tanglin",
  "Tyersall",
  "Toa Payoh",
  "Bidadari",
  "Joo Seng",
  "Potong Pasir",
  "Bedok",
  "Bayshore",
  "Bedok North",
  "Bedok Reservoir",
  "Bedok South",
  "Frankel",
  "Kaki Bukit",
  "Kembangan",
  "Siglap",
  "Changi",
  "Changi Airport",
  "Changi Point",
  "Changi West",
  "Changi Bay",
  "Pasir Ris",
  "Flora Drive",
  "Loyang East",
  "Loyang West",
  'Pasir Ris Central (Formerly called "Town" subzone.)',
  "Pasir Ris Drive",
  "Pasir Ris Park",
  'Pasir Ris Wafer Fab Park (Formerly called "Pasir Ris West" subzone.)',
  'Pasir Ris West (Formerly called "Elias" subzone.)',
  "Paya Lebar",
  "Airport Road",
  "Paya Lebar East",
  "Paya Lebar North",
  "Paya Lebar West",
  "PLAB",
  "Tampines",
  "Simei",
  "Tampines East",
  "Tampines North",
  "Tampines West",
  "Xilin",
  "Central Water Catchment",
  "Lim Chu Kang",
  "Mandai",
  "Mandai East",
  "Mandai Estate",
  "Mandai West",
  "Sembawang",
  "Admiralty",
  "Sembawang Central",
  "Sembawang East",
  "Sembawang North",
  "Sembawang Spring",
  "Sembawang Straits",
  "Senoko North",
  "Senoko South",
  "The Wharves",
  "Simpang",
  "Pulau Seletar",
  "Simpang North",
  "Simpang South",
  "Tanjong Irau",
  "Sungei Kadut",
  'Gali Batu (Formerly called "Mandai" subzone.)',
  "Kranji",
  "Pang Sua",
  "Reservoir View",
  "Turf Club",
  "Woodlands",
  "Greenwood Park",
  "Midview",
  "North Coast",
  "Senoko West",
  "Woodgrove",
  "Woodlands East",
  "Woodlands Regional Centre",
  "Woodlands South",
  "Woodlands West",
  "Yishun",
  "Khatib",
  "Lower Seletar",
  "Nee Soon",
  "Northland",
  "Springleaf",
  "Yishun Central",
  "Yishun East",
  "Yishun South",
  "Yishun West",
  "Ang Mo Kio",
  "Ang Mo Kio Town Centre",
  "Cheng San",
  "Chong Boon",
  'Kebun Baru (Also spelled as "Kebun Bahru".)',
  "Sembawang Hills",
  "Shangri-La",
  'Tagore (Formerly called "Sindo" subzone.)',
  "Townsville",
  "Yio Chu Kang",
  "Yio Chu Kang East",
  'Yio Chu Kang North (Formerly called "Seletar" subzone.)',
  "Yio Chu Kang West",
  "Hougang",
  "Defu Industrial Park",
  "Hougang Central",
  "Hougang East",
  "Hougang West",
  "Kangkar",
  'Kovan (Formerly called "Rosyth" subzone.)',
  "Lorong Ah Soo",
  'Lorong Halus (Formerly called "Sungei Serangoon" subzone.)',
  'Tai Seng (Formerly called "Tai Keng" subzone.)',
  "Trafalgar",
  "North-Eastern Islands",
  "Punggol",
  "Coney Island",
  "Matilda",
  "Northshore",
  "Punggol Canal",
  "Punggol Field",
  "Punggol Town Centre",
  "Waterway East",
  "Seletar",
  "Pulau Punggol Barat",
  "Pulau Punggol Timor",
  "Seletar (Not to be confused with Seletar planning area.)",
  "Seletar Aerospace Park",
  "Sengkang",
  'Anchorvale (Formerly called "Buangkok" subzone.)',
  'Compassvale (Formerly called "Trafalgar" subzone.)',
  'Fernvale (Formerly called "Jalan Kayu East" subzone.)',
  'Lorong Halus North (Formerly called "Sungei Serangoon East" subzone.)',
  'Rivervale (Formerly called "Sungei Serangoon West" subzone.)',
  "Sengkang Town Centre",
  'Sengkang West (Formerly called "Jalan Kayu West" subzone.)',
  "Serangoon",
  "Lorong Chuan",
  "Seletar Hills",
  "Serangoon Central",
  "Serangoon Garden",
  "Serangoon North",
  "Serangoon North Industrial Estate",
  "Upper Paya Lebar",
  "Boon Lay",
  "Liu Fang",
  "Samulun",
  "Shipyard",
  "Tukang",
  "Bukit Batok",
  "Brickworks",
  "Bukit Batok Central",
  "Bukit Batok East",
  "Bukit Batok West",
  "Gombak",
  "Guilin",
  "Hillview",
  "Hong Kah North",
  "Bukit Panjang",
  "Bangkit",
  "Dairy Farm",
  "Fajar",
  "Jelebu",
  "Nature Reserve",
  "Saujana",
  "Senja",
  "Choa Chu Kang",
  "Choa Chu Kang Central",
  'Choa Chu Kang North (Formerly called "Kranji North" subzone and "Pang Sua" subzone.)',
  "Keat Hong",
  "Peng Siang",
  "Teck Whye",
  "Yew Tee",
  "Clementi",
  "Clementi Central",
  "Clementi North",
  "Clementi West",
  "Clementi Woods",
  "Faber",
  "Pandan",
  "Sunset Way",
  "Toh Tuck",
  "West Coast",
  "Jurong East",
  "International Business Park",
  'Jurong Gateway (Formerly called "Regional Centre" subzone.)',
  "Jurong Port",
  "Jurong River",
  "Lakeside",
  "Penjuru Crescent",
  "Teban Gardens",
  "Toh Guan",
  "Yuhua East",
  'Yuhua West (Formerly called "Boon Lay" subzone.)',
  "Jurong West",
  "Boon Lay Place",
  "Chin Bee",
  "Hong Kah",
  "Jurong West Central",
  "Kian Teck",
  "Safti",
  "Taman Jurong",
  "Wenya",
  "Yunnan",
  "Pioneer",
  "Benoi Sector",
  "Gul Basin",
  "Gul Circle",
  "Joo Koon",
  "Pioneer Sector",
  "Tengah",
  "Tuas",
  "Tengeh",
  "Tuas Bay",
  'Tuas North (Formerly called "Pioneer" subzone.)',
  "Tuas Promenade",
  "Tuas View",
  'Tuas View Extension (Formerly called "Coast" subzone.)',
  "Western Islands",
  "Jurong Island and Bukom",
  "Semakau",
  "Sudong",
  "Western Water Catchment",
]

print(f"Total number of locations within Singapore: {len(locations)}")

Total number of locations within Singapore: 319


In [10]:
# retrieving restaurant data from all locations

data = []

pbar = tqdm(total = len(locations))
for location in locations[::-1]:
    data.extend(yelp_api.business_search(location))
    pbar.update(1)

pbar.close()    

print(f'Total number of restaurants extracted: {len(data)}')

100%|██████████| 319/319 [03:51<00:00,  1.38it/s]

Total number of restaurants extracted: 13962





In [11]:
# remove duplicates and nested dictionary
seen = []
errors = []
clean_data = []

pbar = tqdm(total = len(data))
for index, entry in enumerate(data):
    pbar.update(1)

    try:
        entry_id = entry['id']
    except:
        errors.append(index)
        continue
    
    if entry_id in seen:
        continue
    else:
        seen.append(entry_id)

    # extracting relevant categories (with key 'alias')
    new_categories = []
    try:
        for category in entry['categories']:
            for key, value in category.items():
                if key == "alias":
                    new_categories.append(value)
        # for key, value in entry['categories'][0].items():
        #     new_categories.append(value)
            # if key == 'alias':
            #     new_categories.append(value)
        entry['categories'] = new_categories
    except:
        entry['categories'] = []

    # reformatting coordinates
    entry['coordinates'] = [entry['coordinates']['latitude'], entry['coordinates']['longitude']]

    # reformatting location
    try:
        entry['location'] = entry['location']['address1']
    except:
        entry['location'] = ''

    # appending to new data
    clean_data.append(entry)

pbar.close()

print(f'Total number of errors found: {len(errors)}')
print(f'New number of entries in dataset: {len(clean_data)}')



100%|██████████| 13962/13962 [00:00<00:00, 17305.86it/s]

Total number of errors found: 1
New number of entries in dataset: 2645





In [12]:
# storing the data

df = pd.DataFrame(clean_data)

df.to_csv('../data/resturants_singapore_yelp.csv', index=False)