# Data Collecting & Pre-processing

In [1]:
import pandas as pd
import numpy as np
import random as rd

## K Nearest Neighbors

We use a nearest-neighbors algorithm to determine the city of each venue from its latitude and longitude.

Start by reading the file dataset_TIST2015_Cities.txt, which contains data for all 415 cities in 6 columns:
1. City name
2. Latitude (of City center)
3. Longitude (of City center)
4. Country code (ISO 3166-1 alpha-2 two-letter country codes)
5. Country name
6. City type (e.g., national capital, provincial capital)

In [2]:
# Load the city reference table (tab-separated, no header row) and name its six columns.
cities = pd.read_csv(
    'data/dataset_TIST2015/dataset_TIST2015_Cities.txt',
    sep='\t',
    names=['city', 'lat', 'long', 'country_code', 'country_name', 'type'],
)
# Keep only the rows whose country code is 'US'.
cities_us = cities.loc[cities['country_code'] == 'US']
cities_us.shape

(60, 6)

In [3]:
# Build one [lat, long] pair per US city, in the same row order as cities_us
coordinates = cities_us[['lat', 'long']].values.tolist()
coordinates[:2]

[[29.956383000000002, -90.098694], [29.771828999999997, -95.40711]]

In [4]:
from sklearn.neighbors import NearestNeighbors

# Fit a 1-nearest-neighbour index over the US city-centre coordinates.
# NOTE(review): the estimator is built with its default metric and fed raw
# [lat, long] degrees, so "nearest" is not a great-circle distance - confirm
# this is acceptable for the city assignment.
knn = NearestNeighbors(n_neighbors=1).fit(coordinates)
knn

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=1, p=2, radius=1.0)

In [5]:
def get_city_name(latitude, longitude):
    """Return the name of the US city whose centre is closest to a point.

    The fitted ``knn`` index is queried with the single point
    ``[latitude, longitude]``; the resulting position is used to look up the
    ``city`` column of ``cities_us``.

    Returns a one-element pandas Series (not a bare string); callers take
    ``.iloc[0]`` to get the name itself.
    """
    query_point = [[latitude, longitude]]
    # kneighbors returns one row of neighbour positions per query point;
    # only one point is queried, so take the first row.
    nearest_positions = knn.kneighbors(query_point, return_distance=False)[0]
    return cities_us['city'].iloc[nearest_positions]

## Get US venues data set 

In [6]:
# Load the venue (POI) data set: tab-separated, no header row.
df_venues = pd.read_csv(
    'data/dataset_TIST2015/dataset_TIST2015_POIs.txt',
    sep='\t',
    names=['venue_id', 'lat', 'long', 'category', 'country_code'],
)
print('Nb of rows ',df_venues.shape[0])
# Keep only the venues from the US. An explicit copy is taken because a 'city'
# column is added to this frame later on; writing to a filtered view of
# df_venues would trigger pandas' SettingWithCopyWarning.
df_venues_us = df_venues[df_venues.country_code == 'US'].copy()
print('Number of venues in US : ' + str(df_venues_us.shape[0]))
df_venues_us.head()

Nb of rows  3680126
Number of venues in US : 501900


Unnamed: 0,venue_id,lat,long,category,country_code
0,3fd66200f964a52000e71ee3,40.733596,-74.003139,Jazz Club,US
1,3fd66200f964a52000e81ee3,40.758102,-73.975734,Gym,US
2,3fd66200f964a52000ea1ee3,40.732456,-74.003755,Indian Restaurant,US
3,3fd66200f964a52000ec1ee3,42.345907,-71.087001,Indian Restaurant,US
4,3fd66200f964a52000ee1ee3,39.933178,-75.159262,Sandwich Place,US


### Add cities

In [7]:
# Assign every US venue to its nearest US city centre.
# All venues are sent to the index in one batched query instead of one
# kneighbors call per venue, which gives the same neighbours far faster.
venue_coordinates = df_venues_us[['lat', 'long']].values
# kneighbors returns an (n_venues, n_neighbors) array of row positions into the
# fitted city coordinates; with a single neighbour, keep the only column.
nearest_city_positions = knn.kneighbors(venue_coordinates, return_distance=False)[:, 0]
# A dedicated name is used so the `cities` DataFrame is not overwritten by a list.
venue_cities = cities_us['city'].values[nearest_city_positions]
# assign() builds a new frame rather than writing into a filtered slice, so the
# cell is safe to re-run and raises no SettingWithCopyWarning.
df_venues_us = df_venues_us.assign(city=venue_cities)

501900

### Get Chicago venues

In [11]:
# Venues whose nearest city centre is Chicago
df_venues_chicago = df_venues_us.loc[df_venues_us['city'] == 'Chicago']
print(df_venues_chicago.shape)
df_venues_chicago.head()

(21949, 6)


Unnamed: 0,venue_id,lat,long,category,country_code,city
12,3fd66200f964a52002ee1ee3,41.941562,-87.664011,Pub,US,Chicago
67,3fd66200f964a5200cee1ee3,41.939824,-87.663895,Asian Restaurant,US,Chicago
73,3fd66200f964a5200dee1ee3,41.935361,-87.662638,Steakhouse,US,Chicago
91,3fd66200f964a52010f11ee3,41.919532,-87.677331,Bar,US,Chicago
129,3fd66200f964a52016ee1ee3,41.938006,-87.671027,Bar,US,Chicago


### Get New York venues

In [12]:
# Venues whose nearest city centre is New York
df_venues_nyc = df_venues_us.loc[df_venues_us['city'] == 'New York']
print(df_venues_nyc.shape)
df_venues_nyc.head()

(33680, 6)


Unnamed: 0,venue_id,lat,long,category,country_code,city
1,3fd66200f964a52000e81ee3,40.758102,-73.975734,Gym,US,New York
7,3fd66200f964a52001e81ee3,40.756353,-73.967676,Bar,US,New York
16,3fd66200f964a52003e81ee3,40.756119,-73.972532,Hotel Bar,US,New York
21,3fd66200f964a52004e81ee3,40.756048,-73.967762,Pub,US,New York
26,3fd66200f964a52005e81ee3,40.756161,-73.967686,Pub,US,New York


### Get Boston venues

In [13]:
# Venues whose nearest city centre is Boston
df_venues_boston = df_venues_us.loc[df_venues_us['city'] == 'Boston']
print(df_venues_boston.shape)
df_venues_boston.head()

(14445, 6)


Unnamed: 0,venue_id,lat,long,category,country_code,city
3,3fd66200f964a52000ec1ee3,42.345907,-71.087001,Indian Restaurant,US,Boston
17,3fd66200f964a52003ec1ee3,42.346127,-71.080363,French Restaurant,US,Boston
48,3fd66200f964a52008ec1ee3,42.348212,-71.085207,Mexican Restaurant,US,Boston
60,3fd66200f964a5200aec1ee3,42.251665,-71.037348,French Restaurant,US,Boston
63,3fd66200f964a5200bec1ee3,42.348751,-71.083938,Middle Eastern Restaurant,US,Boston


## Create User Profile Du, which is a set of 4-tuples (i.e. u, v, lv, cv)

In [14]:
# Load the check-in log (tab-separated, no header row): one row per check-in.
df_checkins = pd.read_csv(
    './data/dataset_TIST2015/dataset_TIST2015_Checkins.txt',
    sep='\t',
    names=['user_id', 'venue_id', 'utc', 'timezone'],
)
print(df_checkins.shape)
df_checkins.head()

(33263633, 4)


Unnamed: 0,user_id,venue_id,utc,timezone
0,50756,4f5e3a72e4b053fd6a4313f6,Tue Apr 03 18:00:06 +0000 2012,240
1,190571,4b4b87b5f964a5204a9f26e3,Tue Apr 03 18:00:07 +0000 2012,180
2,221021,4a85b1b3f964a520eefe1fe3,Tue Apr 03 18:00:08 +0000 2012,-240
3,66981,4b4606f2f964a520751426e3,Tue Apr 03 18:00:08 +0000 2012,-300
4,21010,4c2b4e8a9a559c74832f0de2,Tue Apr 03 18:00:09 +0000 2012,240


### User profiles Chicago

In [18]:
# Chicago user profiles: join each check-in with the Chicago venue it refers to.
print('User profiles Chicago : ' )
user_profiles_chicago = pd.merge(
    df_checkins[['user_id', 'venue_id']],
    df_venues_chicago[['venue_id', 'category', 'city']],
    on='venue_id',
    how='inner',
)
print('Nb of ui (rows) in user_profile : ', len(user_profiles_chicago))
# Repeated check-ins of a user at the same venue yield identical rows; keep one of each.
user_profiles_chicago = user_profiles_chicago.drop_duplicates()
print('After droping duplicates...')
print('Nb of ui (rows) in user_profile : ', len(user_profiles_chicago))
print('\nNb of unique users : ', np.unique(user_profiles_chicago['user_id']).size)
user_profiles_chicago.head()

User profiles Chicago : 
Nb of ui (rows) in user_profile :  184873
After droping duplicates...
Nb of ui (rows) in user_profile :  95464

Nb of unique users :  6885


Unnamed: 0,user_id,venue_id,category,city
0,163570,4b2277b1f964a5203f4724e3,Conference Room,Chicago
2,15134,4b2277b1f964a5203f4724e3,Conference Room,Chicago
4,44228,4a95f126f964a520952520e3,Train Station,Chicago
5,114729,4a95f126f964a520952520e3,Train Station,Chicago
7,64372,4a95f126f964a520952520e3,Train Station,Chicago


### User profiles New York

In [19]:
# New York user profiles: join each check-in with the New York venue it refers to.
print('User profiles New York : ' )
user_profiles_nyc = pd.merge(
    df_checkins[['user_id', 'venue_id']],
    df_venues_nyc[['venue_id', 'category', 'city']],
    on='venue_id',
    how='inner',
)
print('Nb of ui (rows) in user_profile : ', len(user_profiles_nyc))
# Repeated check-ins of a user at the same venue yield identical rows; keep one of each.
user_profiles_nyc = user_profiles_nyc.drop_duplicates()
print('After droping duplicates...')
print('Nb of ui (rows) in user_profile : ', len(user_profiles_nyc))
print('\nNb of unique users : ', np.unique(user_profiles_nyc['user_id']).size)
user_profiles_nyc.head()

User profiles New York : 
Nb of ui (rows) in user_profile :  279827
After droping duplicates...
Nb of ui (rows) in user_profile :  147281

Nb of unique users :  14516


Unnamed: 0,user_id,venue_id,category,city
0,49932,4b1d4b75f964a520560e24e3,Department Store,New York
1,46376,4b1d4b75f964a520560e24e3,Department Store,New York
2,20494,4b1d4b75f964a520560e24e3,Department Store,New York
3,134066,4b1d4b75f964a520560e24e3,Department Store,New York
4,97899,4b1d4b75f964a520560e24e3,Department Store,New York


### User profiles Boston

In [20]:
# Boston user profiles: join each check-in with the Boston venue it refers to.
print('User profiles Boston : ' )
user_profiles_boston = pd.merge(
    df_checkins[['user_id', 'venue_id']],
    df_venues_boston[['venue_id', 'category', 'city']],
    on='venue_id',
    how='inner',
)
print('Nb of ui (rows) in user_profile : ', len(user_profiles_boston))
# Repeated check-ins of a user at the same venue yield identical rows; keep one of each.
user_profiles_boston = user_profiles_boston.drop_duplicates()
print('After droping duplicates...')
print('Nb of ui (rows) in user_profile : ', len(user_profiles_boston))
print('\nNb of unique users : ', np.unique(user_profiles_boston['user_id']).size)
user_profiles_boston.head()

User profiles Boston : 
Nb of ui (rows) in user_profile :  104228
After droping duplicates...
Nb of ui (rows) in user_profile :  55315

Nb of unique users :  4595


Unnamed: 0,user_id,venue_id,category,city
0,180962,4b3be5b9f964a520e37d25e3,Bar,Boston
1,38722,4b3be5b9f964a520e37d25e3,Bar,Boston
5,93711,4b3be5b9f964a520e37d25e3,Bar,Boston
6,68294,4b3be5b9f964a520e37d25e3,Bar,Boston
7,101835,4b3be5b9f964a520e37d25e3,Bar,Boston


### User profiles : get a sample

In [26]:

nb_of_users = 4500
# Dedicated, seeded generator: the drawn users (and every figure derived from
# them further down) are the same on each Restart & Run All.
sample_rng = rd.Random(42)

# Select a sample of random users from each city; the same number of users is
# requested for every city.
# np.unique returns the ids sorted, so the population order - and therefore the
# seeded draw - is deterministic.
boston_user_ids = list(np.unique(user_profiles_boston['user_id']))
chicago_user_ids = list(np.unique(user_profiles_chicago['user_id']))
nyc_user_ids = list(np.unique(user_profiles_nyc['user_id']))

# random.sample raises ValueError when asked for more items than the population
# holds, so the request is capped at the number of users available in the city.
sample_boston = sample_rng.sample(boston_user_ids, min(nb_of_users, len(boston_user_ids)))
sample_chi = sample_rng.sample(chicago_user_ids, min(nb_of_users, len(chicago_user_ids)))
sample_nyc = sample_rng.sample(nyc_user_ids, min(nb_of_users, len(nyc_user_ids)))

print('sample_boston : ', len(sample_boston))
print('sample_chi : ', len(sample_chi))
print('sample_nyc : ', len(sample_nyc))

sample_boston :  4500
sample_chi :  4500
sample_nyc :  4500


In [27]:
# Gather all three dataframe together
# Stack the sampled users' profiles of the three cities (Boston, Chicago, New York,
# in that order) into a single frame with a fresh 0..n-1 index.
df_user_profiles = pd.concat(
    [
        city_profiles[city_profiles['user_id'].isin(sampled_ids)]
        for city_profiles, sampled_ids in (
            (user_profiles_boston, sample_boston),
            (user_profiles_chicago, sample_chi),
            (user_profiles_nyc, sample_nyc),
        )
    ],
    ignore_index=True,
)
print(df_user_profiles.shape)
df_user_profiles.head()

(161468, 4)


Unnamed: 0,user_id,venue_id,category,city
0,180962,4b3be5b9f964a520e37d25e3,Bar,Boston
1,38722,4b3be5b9f964a520e37d25e3,Bar,Boston
2,93711,4b3be5b9f964a520e37d25e3,Bar,Boston
3,68294,4b3be5b9f964a520e37d25e3,Bar,Boston
4,101835,4b3be5b9f964a520e37d25e3,Bar,Boston


In [50]:
# Persist the combined user profiles as CSV, without writing the row index.
df_user_profiles.to_csv(path_or_buf='data/df_user_profiles_us.csv', index=False)

## Data analysis and wrangling of the user profile dataset

### Number of venues per user

In [45]:
# Count the venue rows of every (city, user) pair once, then average those
# counts over the users of each city.
venues_per_user = df_user_profiles.groupby(['city', 'user_id'])['venue_id'].count()
# reindex keeps an entry (NaN) for a city that has no rows at all.
mean_venues_by_city = venues_per_user.groupby(level='city').mean().reindex(
    ['Chicago', 'New York', 'Boston'])

chi_mean_per_user = mean_venues_by_city['Chicago']
nyc_mean_per_user = mean_venues_by_city['New York']
bos_mean_per_user = mean_venues_by_city['Boston']


print("Number of venues per user in Chicago : ", chi_mean_per_user)
print("Number of venues per user in Boston : ", bos_mean_per_user)
print("Number of venues per user in New York : ", nyc_mean_per_user)


Number of venues per user in Chicago :  13.627333333333333
Number of venues per user in Boston :  12.062
Number of venues per user in New York :  10.192444444444444


### Travelling users

In [48]:
# keep a list of user id that have been visited both Nyc and Chicago
list_nyc_chi = np.unique(list(df_user_profiles[df_user_profiles.city == 'Chicago'].merge(
    df_user_profiles[df_user_profiles.city == 'New York'][['user_id']], on='user_id',how='inner')['user_id']))

# keep a list of user id that have been visited both Nyc and Chicago
list_nyc_bos = np.unique(list(df_user_profiles[df_user_profiles.city == 'New York'].merge(
    df_user_profiles[df_user_profiles.city == 'Boston'][['user_id']], on='user_id',how='inner')['user_id']))

# keep a list of user id that have been visited both boston and Chicago
list_chi_bos = np.unique(list(df_user_profiles[df_user_profiles.city == 'Boston'].merge(
    df_user_profiles[df_user_profiles.city == 'Chicago'][['user_id']], on='user_id',how='inner')['user_id']))

print(len(list_nyc_chi), 'users traveled in both Chicago and New York City')
print(len(list_nyc_bos), 'users traveled in both Boston and New York City')
print(len(list_chi_bos), 'users traveled in both Boston and Chicago')

415 users traveled in both Chicago and New York City
716 users traveled in both Boston and New York City
586 users traveled in both Boston and Chicago
