In [1]:
import os, sys

import pandas as pd

# Base directory taken from the STLARM_BASE_DIR environment variable; when the
# variable is unset this falls back to an empty string.
STLARM_BASE_DIR = os.environ.get('STLARM_BASE_DIR', '')
# Put that directory at the end of the import search path (presumably so the
# `stlarm` package imported below can be resolved).
sys.path.extend([STLARM_BASE_DIR])

from stlarm.utils import normalize_string_series

## Venue's category taxonomy

Create a mapping from each category's `foursquare_id` to the name of its root category.

In [2]:
# Load the category table from disk and preview its first five rows.
categories_df = pd.read_csv(filepath_or_buffer='fq_category.csv')
categories_df.head(n=5)

Unnamed: 0,id,parent_category,foursquare_id,name,plural_name,short_name,root_category
0,8,1.0,52e81612bcbc57f1066b79e7,Circus,Circuses,Circus,1
1,9,1.0,4bf58dd8d48988d18e941735,Comedy Club,Comedy Clubs,Comedy Club,1
2,10,1.0,5032792091d4c4b30a586d5c,Concert Hall,Concert Halls,Concert Hall,1
3,11,1.0,52e81612bcbc57f1066b79ef,Country Dance Club,Country Dance Clubs,Country Dance Club,1
4,12,1.0,52e81612bcbc57f1066b79e8,Disc Golf,Disc Golf Courses,Disc Golf,1


In [3]:
# Index the category table by its `id` column once, then derive both lookups
# from that view: id -> category name, and id -> foursquare_id.
categories_by_id = categories_df.set_index('id')
id_to_name = categories_by_id['name'].to_dict()
id_to_foursquare_id = categories_by_id['foursquare_id'].to_dict()

In [4]:
# One row per category: its own foursquare_id and name, plus the foursquare_id
# and name of its root category. `root_category` holds values of the `id`
# column, so it is translated through the two id-keyed lookups.
df = pd.DataFrame({
    'category_id': categories_df['foursquare_id'],
    'category_name': categories_df['name'],
    'root_category_id': categories_df['root_category'].map(id_to_foursquare_id),
    'root_category_name': categories_df['root_category'].map(id_to_name),
})
df

Unnamed: 0,category_id,category_name,root_category_id,root_category_name
0,52e81612bcbc57f1066b79e7,Circus,4d4b7104d754a06370d81259,Arts & Entertainment
1,4bf58dd8d48988d18e941735,Comedy Club,4d4b7104d754a06370d81259,Arts & Entertainment
2,5032792091d4c4b30a586d5c,Concert Hall,4d4b7104d754a06370d81259,Arts & Entertainment
3,52e81612bcbc57f1066b79ef,Country Dance Club,4d4b7104d754a06370d81259,Arts & Entertainment
4,52e81612bcbc57f1066b79e8,Disc Golf,4d4b7104d754a06370d81259,Arts & Entertainment
...,...,...,...,...
915,4bf58dd8d48988d12a951735,Train,4d4b7105d754a06379d81259,Travel & Transport
916,52f2ab2ebcbc57f1066b8b51,Tram Station,4d4b7105d754a06379d81259,Travel & Transport
917,54541b70498ea6ccd0204bff,Transportation Service,4d4b7105d754a06379d81259,Travel & Transport
918,4f04b25d2fb6e1c99f3db0c0,Travel Lounge,4d4b7105d754a06379d81259,Travel & Transport


In [5]:
import json

# Persist the category_id -> root_category_name lookup as a JSON object.
with open('category_id_to_root_category_name.json', 'w') as out_file:
    root_name_by_category = df.set_index('category_id')['root_category_name'].to_dict()
    json.dump(root_name_by_category, out_file)

## Venue's data

Create a mapping from each venue's `foursquare_id` to a custom venue id derived from its name.

In [6]:
# Only the id and the name are needed here, so every other column is skipped.
venue_columns = ['foursquare_id', 'name']
venues_df = pd.read_csv('fq_venue.csv', usecols=venue_columns)
venues_df.head()

Unnamed: 0,foursquare_id,name
0,4e5fce33b0fb754192cca549,Amtrak Waiting Area
1,4bec595cc43f2d7f3fc3dbd9,FedEx Office Print & Ship Center
2,4a9d8a2df964a520703820e3,US Open Tennis Championships
3,4e67954f52b1ca65dc3778fd,BAK | Korean Kitchen & Craft Beer
4,4b004499f964a520253c22e3,Target


In [7]:
# Many venue names contain non-ASCII characters: print the 20 distinct names
# that come last once the names are sorted.
sorted_names = venues_df['name'].sort_values()
print(sorted_names.unique()[-20:])

['ñ' 'ΖΩΗ & COTA' 'ΦIA Mu Chapter' 'ΦIA Omicron Chapter'
 'ΦIA Sigma Chapter' 'Пельменный Цех'
 "Пол'с Place \uf1fa\uf1f8\uf1f7\uf1fa" '上海人家 Shanghai Family Dumpling'
 '冉 Ran Tea House' '天山羊莊小肥羊 Happy Family Hotpot Restaurant' '我的房间'
 '龍腾食坊 Wu Chinese Restaurant' '내 자신을 채찍질 하는 시간'
 '돼지쏜데이 (Pig Ssonday) (Pig Ssonday)' '불판' '사또통족발보쌈' '속편한내과' '주랑'
 '\ue11dHeLL\ue11d' '\ue328 Home Sweet Home \ue328']


#### Name pre-processing

In [8]:
# Snapshot the names exactly as loaded the first time this cell runs, so the
# steps below always start from the raw values. Without the snapshot the cell
# is not idempotent: re-running it would normalize the already-suffixed names
# and append the foursquare_id a second time.
if 'raw_name' not in venues_df.columns:
    venues_df['raw_name'] = venues_df['name']

# remove non-ascii, special chars, and spaces
normalized_names = normalize_string_series(venues_df['raw_name'])
# add foursquare_id so we have unique names
venues_df['name'] = normalized_names.str.cat(venues_df['foursquare_id'], sep='_')

#### Save mapping

In [9]:
import json

# Write the foursquare_id -> `name` column lookup to disk as a JSON object.
with open('venue_id_to_name_id.json', 'w') as out_file:
    venue_name_by_id = venues_df.set_index('foursquare_id')['name'].to_dict()
    json.dump(venue_name_by_id, out_file)