In [21]:
import pickle
import numpy as np
import pandas as pd
import pygeohash as pgh
import matplotlib.pyplot as plt
import reverse_geocoder as rg
from datetime import datetime

# Detecting Seasonal Patterns in User Movement

TODO

## Loading and cleaning the data

TODO

In [5]:
# The datasets have no headers, so we have to name them for clarity
checkin_header = ['user', 'checkin_time', 'latitude', 'longitude', 'location_id']
edges_header = ['user1', 'user2']

# Load the data by specifying the correct compression algorithm, separator and column names
checkin_brightkite_orig = pd.read_csv('data/loc-brightkite_totalCheckins.txt.gz', compression = 'gzip', sep = '\t', names = checkin_header)
edges_brightkite_orig = pd.read_csv('data/loc-brightkite_edges.txt.gz', compression = 'gzip', sep = '\t', names = edges_header)
checkin_gowalla_orig = pd.read_csv('data/loc-gowalla_totalCheckins.txt.gz', compression = 'gzip', sep = '\t', names = checkin_header)
edges_gowalla_orig = pd.read_csv('data/loc-gowalla_edges.txt.gz', compression = 'gzip', sep = '\t', names = edges_header)

In [6]:
# Copy the dataframe in order not to have to reload the original one in case of a mistake
checkin_gowalla = checkin_gowalla_orig.copy()
to_drop = []
# Select indexes to drop depending on conditions mentionned above 
to_drop.append(checkin_gowalla_orig[(checkin_gowalla_orig['latitude'] < -90.0) | 
                               (checkin_gowalla_orig['latitude'] > 90.0)].index)
to_drop.append(checkin_gowalla_orig[(checkin_gowalla_orig['longitude'] < -180.0) | 
                               (checkin_gowalla_orig['longitude'] > 180.0)].index)
to_drop.append(checkin_gowalla_orig[(checkin_gowalla_orig['latitude'] == 0) & 
                               (checkin_gowalla_orig['longitude'] == 0)].index)
# Drop the invalid rows in place from the copied dataframe
for item in to_drop:
    checkin_gowalla.drop(item, inplace=True)

In [7]:
# Repeat the process on the other data set
checkin_brightkite = checkin_brightkite_orig.copy()
to_drop = []
to_drop.append(checkin_brightkite_orig[(checkin_brightkite_orig['latitude'] < -90.0) | 
                                  (checkin_brightkite_orig['latitude'] > 90.0)].index)
to_drop.append(checkin_brightkite_orig[(checkin_brightkite_orig['longitude'] < -180.0) | 
                                  (checkin_brightkite_orig['longitude'] > 180.0)].index)
to_drop.append(checkin_brightkite_orig[(checkin_brightkite_orig['latitude'] == 0) & 
                                  (checkin_brightkite_orig['longitude'] == 0)].index)
for item in to_drop:
    checkin_brightkite.drop(item, inplace = True)
# Drop rows with NaN values
checkin_brightkite.dropna(inplace = True)

## Adding country information to check-ins

TODO

In [8]:
def coordinates_to_country(df):
    """
    Maps the coordinates of a dataframe to the country
    code using reverse geocoding
    """
    coordinates_tuples = list(df.apply(lambda x: (x['latitude'], x['longitude']), axis = 1))
    geocodes = rg.search(coordinates_tuples)
    return [gc['cc'] for gc in geocodes]

In [9]:
# Append country codes to each check-in for both datasets
checkin_brightkite['cc'] = coordinates_to_country(checkin_brightkite)
checkin_gowalla['cc'] = coordinates_to_country(checkin_gowalla)

Loading formatted geocoded file...


## Determining the location of user homes

TODO

In [10]:
# Compute the geohash for each checkin entry: that will determine the cell in which each checkin belongs
checkin_brightkite['geohash'] = checkin_brightkite.apply(lambda row: pgh.encode(row.latitude, row.longitude, precision=4) , axis = 1)
checkin_gowalla['geohash'] = checkin_gowalla.apply(lambda row: pgh.encode(row.latitude, row.longitude, precision=4) , axis = 1)

In [11]:
brightkite_users = set(checkin_brightkite.user) # All the users that made at least 1 checkin
brightkite_per_user = checkin_brightkite.groupby(['user']) # Group the checkins by user

# Group the checkins by user and by geohash to be able to retrieve the most common geohash per user
brightkite_checkin_cells = checkin_brightkite.groupby(['user', 'geohash']).size() 

brightkite_homes = {} # To associate each user ID to its inferred home location
for user in brightkite_users:
    geohash = brightkite_checkin_cells[user].idxmax() # Get the most common geohash for the user
    group = brightkite_per_user.get_group(user) # Retrieve all the user's checkins
    
    # Keep all checkins that happened in the most common geohash and compute average position
    brightkite_homes[user] = group[group['geohash'] == geohash][['latitude', 'longitude']].mean()

# Create a dataframe containing the home location for each user 
brightkite_homes = pd.DataFrame.from_dict(brightkite_homes, orient = 'index')

# Add country code information for home locations 
brightkite_homes['cc'] = coordinates_to_country(brightkite_homes)

In [15]:
checkin_gowalla = checkin_gowalla.head(100000)

In [16]:
gowalla_users = set(checkin_gowalla.user) 
gowalla_per_user = checkin_gowalla.groupby(['user']) 

gowalla_checkin_cells = checkin_gowalla.groupby(['user', 'geohash']).size()

gowalla_homes = {} 
for user in gowalla_users:
    geohash = gowalla_checkin_cells[user].idxmax() 
    group = gowalla_per_user.get_group(user) 
    gowalla_homes[user] = group[group['geohash'] == geohash][['latitude', 'longitude']].mean()

gowalla_homes = pd.DataFrame.from_dict(gowalla_homes, orient = 'index')
gowalla_homes['cc'] = coordinates_to_country(gowalla_homes)

### Pickle for later use

In [None]:
pickle.dump(checkin_brightkite, open('pickles/checkin_brightkite', 'wb'))
pickle.dump(checkin_gowalla, open('pickles/checkin_gowalla', 'wb'))

In [17]:
# To reload for later
# pickle.load(open('pickles/checkin_brightkite', 'rb'))
# pickle.load(open('pickles/checkin_gowalla', 'rb'))

checkin_brightkite

Unnamed: 0,user,checkin_time,latitude,longitude,location_id,cc,geohash
0,0,2010-10-17T01:48:53Z,39.747652,-104.992510,88c46bf20db295831bd2d1718ad7e6f5,US,9xj6
1,0,2010-10-16T06:02:04Z,39.891383,-105.070814,7a0f88982aa015062b95e3b4843f9ca2,US,9xj6
2,0,2010-10-16T03:48:54Z,39.891077,-105.068532,dd7cd3d264c2d063832db506fba8bf79,US,9xj6
3,0,2010-10-14T18:25:51Z,39.750469,-104.999073,9848afcc62e500a01cf6fbf24b797732f8963683,US,9xj6
4,0,2010-10-14T00:21:47Z,39.752713,-104.996337,2ef143e12038c870038df53e0478cefc,US,9xj6
...,...,...,...,...,...,...,...
4747282,58222,2009-01-23T02:30:34Z,33.833333,35.833333,9f6b83bca22411dd85460384f67fcdb0,LB,sy12
4747283,58224,2009-01-03T15:06:54Z,33.833333,35.833333,9f6b83bca22411dd85460384f67fcdb0,LB,sy12
4747284,58225,2009-01-20T13:58:14Z,33.833333,35.833333,9f6b83bca22411dd85460384f67fcdb0,LB,sy12
4747285,58226,2009-01-20T13:30:09Z,33.833333,35.833333,9f6b83bca22411dd85460384f67fcdb0,LB,sy12


In [None]:
checkin_brightkite['month'] = checkin_brightkite.apply(lambda row: dt.strptime(row['checkin_time'],
                                                                               '%Y-%m-%dT%H:%M:%SZ').month, axis=1)

In [22]:
## function that calculates the season of a given datetime object  

Y = 2000 # dummy leap year to allow input X-02-29 (leap day)
seasons = [('winter', (datetime(Y,  1,  1),  datetime(Y,  3, 20))),
           ('spring', (datetime(Y,  3, 21),  datetime(Y,  6, 20))),
           ('summer', (datetime(Y,  6, 21),  datetime(Y,  9, 22))),
           ('autumn', (datetime(Y,  9, 23),  datetime(Y, 12, 20))),
           ('winter', (datetime(Y, 12, 21),  datetime(Y, 12, 31)))]

def get_season(row):
    
    date_time = dt.strptime(row['checkin_time'], '%Y-%m-%dT%H:%M:%SZ')
    date_time = date_time.replace(year=Y, hour=0, minute=0, second=0)
    
    return next(season for season, (start, end) in seasons
                if start <= date_time <= end)

In [23]:
checkin_brightkite['season'] = checkin_brightkite.apply(lambda row: get_season(row), axis=1)

KeyboardInterrupt: 

In [None]:
checkin_brightkite

In [24]:
checkin_gowalla['month'] = checkin_gowalla.apply(lambda row: dt.strptime(row['checkin_time'],
                                                                               '%Y-%m-%dT%H:%M:%SZ').month, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [25]:
checkin_gowalla['season'] = checkin_gowalla.apply(lambda row: get_season(row), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [26]:
checkin_gowalla

Unnamed: 0,user,checkin_time,latitude,longitude,location_id,cc,geohash,month,season
0,0,2010-10-19T23:55:27Z,30.235909,-97.795140,22847,US,9v6k,10,autumn
1,0,2010-10-18T22:17:43Z,30.269103,-97.749395,420315,US,9v6k,10,autumn
2,0,2010-10-17T23:42:03Z,30.255731,-97.763386,316637,US,9v6k,10,autumn
3,0,2010-10-17T19:26:05Z,30.263418,-97.757597,16516,US,9v6k,10,autumn
4,0,2010-10-16T18:50:42Z,30.274292,-97.740523,5535878,US,9v6k,10,autumn
...,...,...,...,...,...,...,...,...,...
100001,340,2010-03-25T08:18:28Z,55.940199,-3.169638,768833,GB,gcvw,3,spring
100002,340,2010-03-25T07:35:23Z,55.943212,-3.183050,788696,GB,gcvw,3,spring
100003,340,2010-03-25T07:05:36Z,55.952546,-3.194323,783591,GB,gcvw,3,spring
100004,340,2010-03-24T19:33:01Z,55.949257,-3.182669,426909,GB,gcvw,3,spring


In [27]:
# Might need this
# month_norm_brightkite = {1:62,2:56,3:73,4:90,5:93,6:90,7:93,8:93,9:90,10:80,11:60,12:62}
# month_norm_gowalla = {1:31,2:53,3:62,4:60,5:62,6:60,7:62,8:62,9:60,10:54,11:30,12:31}

### Identify travelers

In [38]:
%%time
# Group by user
checkin_per_user = checkin_gowalla.groupby('user')
# Identify a user when the user has checked in to multple countries
traveler_check = lambda group: True if len(set(group['cc'])) > 1 else False

gowalla_homes['traveler'] = checkin_per_user.apply(traveler_check)

CPU times: user 37.5 ms, sys: 9 µs, total: 37.5 ms
Wall time: 37.4 ms
