In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Path
DATA_FOLDER = './Data/'
NYC = DATA_FOLDER+'dataset_TSMC2014_NYC.txt'
TKY = DATA_FOLDER+'dataset_TSMC2014_TKY.txt'

# Loading data
nyc = pd.read_csv(NYC, header = None, sep='\t', engine='python',
                  names=['user_ID', 'venue_ID', 'venue_category_ID','venue_category_name',
                         'latitude', 'longitude', 'offset', 'UTC_time'])
tky = pd.read_csv(TKY, header = None, sep='\t', engine='python',
                  names=['user_ID', 'venue_ID', 'venue_category_ID','venue_category_name',
                         'latitude', 'longitude', 'offset', 'UTC_time'])

In [3]:
display(nyc.head(3))
display(tky.head(3))

Unnamed: 0,user_ID,venue_ID,venue_category_ID,venue_category_name,latitude,longitude,offset,UTC_time
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.71981,-74.002581,-240,Tue Apr 03 18:00:09 +0000 2012
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.6068,-74.04417,-240,Tue Apr 03 18:00:25 +0000 2012
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.88307,-240,Tue Apr 03 18:02:24 +0000 2012


Unnamed: 0,user_ID,venue_ID,venue_category_ID,venue_category_name,latitude,longitude,offset,UTC_time
0,1541,4f0fd5a8e4b03856eeb6c8cb,4bf58dd8d48988d10c951735,Cosmetics Shop,35.705101,139.61959,540,Tue Apr 03 18:17:18 +0000 2012
1,868,4b7b884ff964a5207d662fe3,4bf58dd8d48988d1d1941735,Ramen / Noodle House,35.715581,139.800317,540,Tue Apr 03 18:22:04 +0000 2012
2,114,4c16fdda96040f477cc473a5,4d954b0ea243a5684a65b473,Convenience Store,35.714542,139.480065,540,Tue Apr 03 19:12:07 +0000 2012


### Converting longitude and latitude into km values

Similar to the Friendship and Mobility paper, we are estimating the home location of a user based on their check-in locations. This is accomplished by dividing our two cities into a grid, of size N by N km. N is the length of one unit in the grid and can be considered to be a design parameter. 

This is done by using the following conversions:

Latitude 1°: 110.574 km  
Longtiude: 1°: 111.320*cos(latitude) km

In [8]:
# The grid cells are discrete so we are rounding to the closest cell

def lat2km(lat, N):
    # Convert a latitude value to a 25 km grid
    km = np.round((110.574*lat)/N)
    return km
    
def long2km(long, lat, N):
    # Convert a longitude value to a 25 km grid 
    km = np.round((110.320 * long * np.cos(np.deg2rad(lat)))/N)
    return km
                  

In [30]:
N = 0.5
nyc['x_grid'] = long2km(nyc.longitude, nyc.latitude, N)
nyc['y_grid'] = lat2km(nyc.latitude, N)

tky['x_grid'] = long2km(tky.longitude, tky.latitude, N)
tky['y_grid'] = lat2km(tky.latitude, N)

In [31]:
# # The home locations can be obtained by counting the most common x and y cells for each user
# # This is done by storing the mode of each user's check-in cell's x and y coordinates.
nyc_homes = pd.DataFrame()
nyc_homes['x_cord'] = nyc.groupby(['user_ID'])['x_grid'].apply(pd.Series.mode)
nyc_homes['y_cord'] = nyc.groupby(['user_ID'])['y_grid'].apply(pd.Series.mode)
nyc_homes = nyc_homes.reset_index().drop(['level_1'], axis=1) # get rid of multiindex

tky_homes = pd.DataFrame()
tky_homes['x_cord'] = tky.groupby(['user_ID'])['x_grid'].apply(pd.Series.mode)
tky_homes['y_cord'] = tky.groupby(['user_ID'])['y_grid'].apply(pd.Series.mode)
tky_homes = tky_homes.reset_index().drop(['level_1'], axis=1)

##### __Motivation for getting estimated home longitude and latitude__
The dataframes for the homes have the x and y coordinates for the cell within the grid with the most check-ins. We estimate the home position by using the average longitude and latitude for check-ins within this cell for each user. 

In [32]:
# Combine the dataframes by merging on multiple columns; this keeps only the checkins in the 'home' cell. 
nyc_homes = pd.merge(nyc_homes, nyc, how = 'left', left_on=['user_ID', 'x_cord', 'y_cord'], right_on=['user_ID', 'x_grid', 'y_grid'])

# Obtain the mean longtitude and latitude for the users' estimated home location
nyc_homes = nyc_homes.groupby('user_ID')[['longitude', 'latitude']].mean()

tky_homes = pd.merge(tky_homes, tky, how = 'left', left_on=['user_ID', 'x_cord', 'y_cord'], right_on= ['user_ID', 'x_grid', 'y_grid'])
tky_homes = tky_homes.groupby('user_ID')[['longitude', 'latitude']].mean()

In [33]:
display(nyc_homes.head(3))
display(tky_homes.head(3))

Unnamed: 0_level_0,longitude,latitude
user_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-74.004165,40.719994
2,-73.913707,40.646648
3,-73.988551,40.745732


Unnamed: 0_level_0,longitude,latitude
user_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,139.773418,35.698775
2,139.698197,35.658062
3,139.600839,35.668477


In [None]:
# Calculate distance between two pairs of long/lat coordinates. 
# Solution from: https://stackoverflow.com/questions/29545704/fast-haversine-approximation-python-pandas/29546836#29546836

def haversine_np(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)

    All args must be of equal length.    

    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km
