In [32]:
import pickle
import numpy as np
import pandas as pd
import pygeohash as pgh
import matplotlib.pyplot as plt
import reverse_geocoder as rg
from datetime import datetime as dt

# Detecting Seasonal Patterns in User Movement

TODO

## Loading and cleaning the data

TODO

In [5]:
# Neither dataset ships with a header row, so the column names are supplied here
checkin_header = ['user', 'checkin_time', 'latitude', 'longitude', 'location_id']
edges_header = ['user1', 'user2']

# Every file is a gzip-compressed, tab-separated table: share the reader options
read_options = {'compression': 'gzip', 'sep': '\t'}

# Brightkite: check-ins and friendship edges
checkin_brightkite_orig = pd.read_csv('data/loc-brightkite_totalCheckins.txt.gz',
                                      names=checkin_header, **read_options)
edges_brightkite_orig = pd.read_csv('data/loc-brightkite_edges.txt.gz',
                                    names=edges_header, **read_options)

# Gowalla: check-ins and friendship edges
checkin_gowalla_orig = pd.read_csv('data/loc-gowalla_totalCheckins.txt.gz',
                                   names=checkin_header, **read_options)
edges_gowalla_orig = pd.read_csv('data/loc-gowalla_edges.txt.gz',
                                 names=edges_header, **read_options)

In [6]:
# Build the cleaned frame as a filtered copy, so the original one never has to be
# reloaded in case of a mistake.
# A check-in is discarded when its latitude lies outside [-90, 90], its longitude lies
# outside [-180, 180], or its position is exactly (0, 0).
gowalla_invalid_position = (
    (checkin_gowalla_orig['latitude'] < -90.0)
    | (checkin_gowalla_orig['latitude'] > 90.0)
    | (checkin_gowalla_orig['longitude'] < -180.0)
    | (checkin_gowalla_orig['longitude'] > 180.0)
    | ((checkin_gowalla_orig['latitude'] == 0) & (checkin_gowalla_orig['longitude'] == 0))
)
# A single combined mask is used instead of successive drop() calls: a row failing
# two checks at once (e.g. latitude AND longitude out of range) would make the second
# drop() raise KeyError because its label has already been removed.
checkin_gowalla = checkin_gowalla_orig[~gowalla_invalid_position].copy()

In [7]:
# Repeat the process on the other data set.
# A check-in is discarded when its latitude lies outside [-90, 90], its longitude lies
# outside [-180, 180], or its position is exactly (0, 0).
brightkite_invalid_position = (
    (checkin_brightkite_orig['latitude'] < -90.0)
    | (checkin_brightkite_orig['latitude'] > 90.0)
    | (checkin_brightkite_orig['longitude'] < -180.0)
    | (checkin_brightkite_orig['longitude'] > 180.0)
    | ((checkin_brightkite_orig['latitude'] == 0) & (checkin_brightkite_orig['longitude'] == 0))
)
# A single combined mask is used instead of successive drop() calls: a row failing
# two checks at once would make the later drop() raise KeyError because its label
# has already been removed.
# Rows with NaN values are dropped as well; the trailing copy() keeps the result
# independent from the original frame.
checkin_brightkite = checkin_brightkite_orig[~brightkite_invalid_position].dropna().copy()

## Adding country information to check-ins

TODO

In [8]:
def coordinates_to_country(df):
    """
    Maps the coordinates of a dataframe to the country
    code using reverse geocoding.

    Parameters
    ----------
    df : DataFrame with a 'latitude' and a 'longitude' column.

    Returns
    -------
    list with one entry per row of `df` (in row order): the 'cc' field of the
    reverse-geocoding result for that row. An empty frame yields an empty list.
    """
    # Nothing to geocode: skip the lookup entirely
    if len(df) == 0:
        return []

    # Pair the two columns directly instead of a row-wise apply, which builds
    # one Series per row and is very slow on large frames
    coordinates_tuples = list(zip(df['latitude'], df['longitude']))
    geocodes = rg.search(coordinates_tuples)
    return [gc['cc'] for gc in geocodes]

In [9]:
# Tag every check-in of both datasets with the country code of its position
for checkin_frame in (checkin_brightkite, checkin_gowalla):
    checkin_frame['cc'] = coordinates_to_country(checkin_frame)

Loading formatted geocoded file...


## Determining the location of user homes

TODO

In [10]:
# Compute the geohash (precision 4) for each checkin entry: that will determine the cell
# in which each checkin belongs.
# The two coordinate columns are iterated directly: a row-wise DataFrame.apply builds a
# Series for every check-in, which is far slower on frames with millions of rows.
checkin_brightkite['geohash'] = [
    pgh.encode(latitude, longitude, precision=4)
    for latitude, longitude in zip(checkin_brightkite['latitude'], checkin_brightkite['longitude'])
]
checkin_gowalla['geohash'] = [
    pgh.encode(latitude, longitude, precision=4)
    for latitude, longitude in zip(checkin_gowalla['latitude'], checkin_gowalla['longitude'])
]

KeyboardInterrupt: 

In [None]:
brightkite_users = set(checkin_brightkite.user) # All the users that made at least 1 checkin

# Group the checkins by user. The key is the column label itself rather than a
# one-element list, so that get_group() below accepts a plain user ID: recent pandas
# versions deprecate scalar lookups on a grouper given as a length-1 list.
brightkite_per_user = checkin_brightkite.groupby('user')

# Number of checkins per (user, geohash) pair, to retrieve the most common geohash per user
brightkite_checkin_cells = checkin_brightkite.groupby(['user', 'geohash']).size()

# Inferred home position (mean latitude/longitude) keyed by user ID
brightkite_home_positions = {}
for user in brightkite_users:
    # Most common geohash for the user (on a tie, idxmax keeps the first one in grouped order)
    geohash = brightkite_checkin_cells[user].idxmax()
    group = brightkite_per_user.get_group(user) # Retrieve all the user's checkins

    # Keep all checkins that happened in the most common geohash and compute average position
    in_home_cell = group['geohash'] == geohash
    brightkite_home_positions[user] = group.loc[in_home_cell, ['latitude', 'longitude']].mean()

# Create a dataframe containing the home location for each user (one row per user ID)
brightkite_homes = pd.DataFrame.from_dict(brightkite_home_positions, orient = 'index')

# Add country code information for home locations
brightkite_homes['cc'] = coordinates_to_country(brightkite_homes)

In [None]:
gowalla_users = set(checkin_gowalla.user) # All the users that made at least 1 checkin

# Group the checkins by user. The key is the column label itself rather than a
# one-element list, so that get_group() below accepts a plain user ID: recent pandas
# versions deprecate scalar lookups on a grouper given as a length-1 list.
gowalla_per_user = checkin_gowalla.groupby('user')

# Number of checkins per (user, geohash) pair, to retrieve the most common geohash per user
gowalla_checkin_cells = checkin_gowalla.groupby(['user', 'geohash']).size()

# Inferred home position (mean latitude/longitude) keyed by user ID
gowalla_home_positions = {}
for user in gowalla_users:
    # Most common geohash for the user (on a tie, idxmax keeps the first one in grouped order)
    geohash = gowalla_checkin_cells[user].idxmax()
    group = gowalla_per_user.get_group(user) # Retrieve all the user's checkins

    # Keep all checkins that happened in the most common geohash and compute average position
    in_home_cell = group['geohash'] == geohash
    gowalla_home_positions[user] = group.loc[in_home_cell, ['latitude', 'longitude']].mean()

# Create a dataframe containing the home location for each user (one row per user ID)
gowalla_homes = pd.DataFrame.from_dict(gowalla_home_positions, orient = 'index')

# Add country code information for home locations
gowalla_homes['cc'] = coordinates_to_country(gowalla_homes)

### Pickle for later use

In [None]:
# Persist both check-in frames. The context managers make sure each file is flushed
# and closed once written, instead of leaving the handle open.
with open('pickles/checkin_brightkite', 'wb') as pickle_file:
    pickle.dump(checkin_brightkite, pickle_file)
with open('pickles/checkin_gowalla', 'wb') as pickle_file:
    pickle.dump(checkin_gowalla, pickle_file)

In [25]:
# To reload the pickled frames in a later session, uncomment the lines below.
# NOTE(review): as written, the loaded objects are not bound to any name and would be
# discarded; assign them when restoring (e.g. `checkin_brightkite = pickle.load(...)`).
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
# pickle.load(open('pickles/checkin_brightkite', 'rb'))
# pickle.load(open('pickles/checkin_gowalla', 'rb'))

# Display the Brightkite check-ins
checkin_brightkite

Unnamed: 0,user,checkin_time,latitude,longitude,location_id,cc
0,0,2010-10-17T01:48:53Z,39.747652,-104.992510,88c46bf20db295831bd2d1718ad7e6f5,US
1,0,2010-10-16T06:02:04Z,39.891383,-105.070814,7a0f88982aa015062b95e3b4843f9ca2,US
2,0,2010-10-16T03:48:54Z,39.891077,-105.068532,dd7cd3d264c2d063832db506fba8bf79,US
3,0,2010-10-14T18:25:51Z,39.750469,-104.999073,9848afcc62e500a01cf6fbf24b797732f8963683,US
4,0,2010-10-14T00:21:47Z,39.752713,-104.996337,2ef143e12038c870038df53e0478cefc,US
...,...,...,...,...,...,...
4747282,58222,2009-01-23T02:30:34Z,33.833333,35.833333,9f6b83bca22411dd85460384f67fcdb0,LB
4747283,58224,2009-01-03T15:06:54Z,33.833333,35.833333,9f6b83bca22411dd85460384f67fcdb0,LB
4747284,58225,2009-01-20T13:58:14Z,33.833333,35.833333,9f6b83bca22411dd85460384f67fcdb0,LB
4747285,58226,2009-01-20T13:30:09Z,33.833333,35.833333,9f6b83bca22411dd85460384f67fcdb0,LB


In [38]:
# Calendar month (1-12) of every check-in. The whole timestamp column is parsed in one
# vectorised call rather than one strptime call per row through a row-wise apply.
checkin_brightkite['month'] = pd.to_datetime(checkin_brightkite['checkin_time'],
                                             format='%Y-%m-%dT%H:%M:%SZ').dt.month

In [84]:
## function that calculates the season of a given check-in timestamp

Y = 2000 # dummy leap year to allow input X-02-29 (leap day)

# (season name, (first day, last day)), both bounds inclusive, expressed in the dummy year.
# Winter is listed twice because it wraps around the turn of the year.
# The boundaries are fixed calendar dates; the position of the check-in is not consulted.
# `dt` is the datetime class (imported as `from datetime import datetime as dt`); the bare
# name `datetime` is not defined in this notebook.
seasons = [('winter', (dt(Y,  1,  1),  dt(Y,  3, 20))),
           ('spring', (dt(Y,  3, 21),  dt(Y,  6, 20))),
           ('summer', (dt(Y,  6, 21),  dt(Y,  9, 22))),
           ('autumn', (dt(Y,  9, 23),  dt(Y, 12, 20))),
           ('winter', (dt(Y, 12, 21),  dt(Y, 12, 31)))]

def get_season(row):
    """
    Returns the season name of a check-in.

    Parameters
    ----------
    row : mapping or Series whose 'checkin_time' entry is a timestamp string
          formatted as '%Y-%m-%dT%H:%M:%SZ'.

    Returns
    -------
    One of 'winter', 'spring', 'summer', 'autumn'.

    Raises
    ------
    ValueError if the timestamp does not match the expected format.
    """
    date_time = dt.strptime(row['checkin_time'], '%Y-%m-%dT%H:%M:%SZ')
    # Project the date onto the dummy year at midnight so that only month and day
    # take part in the comparison with the season bounds
    date_time = date_time.replace(year=Y, hour=0, minute=0, second=0)

    return next(season for season, (start, end) in seasons
                if start <= date_time <= end)

In [85]:
# Label every Brightkite check-in with its season (get_season is applied row by row)
checkin_brightkite['season'] = checkin_brightkite.apply(get_season, axis=1)

In [86]:
# Display the Brightkite check-ins
checkin_brightkite

Unnamed: 0,user,checkin_time,latitude,longitude,location_id,cc,month,season
0,0,2010-10-17T01:48:53Z,39.747652,-104.992510,88c46bf20db295831bd2d1718ad7e6f5,US,10,autumn
1,0,2010-10-16T06:02:04Z,39.891383,-105.070814,7a0f88982aa015062b95e3b4843f9ca2,US,10,autumn
2,0,2010-10-16T03:48:54Z,39.891077,-105.068532,dd7cd3d264c2d063832db506fba8bf79,US,10,autumn
3,0,2010-10-14T18:25:51Z,39.750469,-104.999073,9848afcc62e500a01cf6fbf24b797732f8963683,US,10,autumn
4,0,2010-10-14T00:21:47Z,39.752713,-104.996337,2ef143e12038c870038df53e0478cefc,US,10,autumn
...,...,...,...,...,...,...,...,...
4747282,58222,2009-01-23T02:30:34Z,33.833333,35.833333,9f6b83bca22411dd85460384f67fcdb0,LB,1,winter
4747283,58224,2009-01-03T15:06:54Z,33.833333,35.833333,9f6b83bca22411dd85460384f67fcdb0,LB,1,winter
4747284,58225,2009-01-20T13:58:14Z,33.833333,35.833333,9f6b83bca22411dd85460384f67fcdb0,LB,1,winter
4747285,58226,2009-01-20T13:30:09Z,33.833333,35.833333,9f6b83bca22411dd85460384f67fcdb0,LB,1,winter


In [151]:
# Calendar month (1-12) of every check-in. The whole timestamp column is parsed in one
# vectorised call rather than one strptime call per row through a row-wise apply.
checkin_gowalla['month'] = pd.to_datetime(checkin_gowalla['checkin_time'],
                                          format='%Y-%m-%dT%H:%M:%SZ').dt.month

In [152]:
# Label every Gowalla check-in with its season (get_season is applied row by row)
checkin_gowalla['season'] = checkin_gowalla.apply(get_season, axis=1)

In [153]:
# Display the Gowalla check-ins
checkin_gowalla

Unnamed: 0,user,checkin_time,latitude,longitude,location_id,cc,month,season
0,0,2010-10-19T23:55:27Z,30.235909,-97.795140,22847,US,10,autumn
1,0,2010-10-18T22:17:43Z,30.269103,-97.749395,420315,US,10,autumn
2,0,2010-10-17T23:42:03Z,30.255731,-97.763386,316637,US,10,autumn
3,0,2010-10-17T19:26:05Z,30.263418,-97.757597,16516,US,10,autumn
4,0,2010-10-16T18:50:42Z,30.274292,-97.740523,5535878,US,10,autumn
...,...,...,...,...,...,...,...,...
6442887,196578,2010-06-11T13:32:26Z,51.742988,-0.488065,906885,GB,6,spring
6442888,196578,2010-06-11T13:26:45Z,51.746492,-0.490780,965121,GB,6,spring
6442889,196578,2010-06-11T13:26:34Z,51.741916,-0.496729,1174322,GB,6,spring
6442890,196585,2010-10-08T21:01:49Z,50.105516,8.571525,471724,DE,10,autumn


In [165]:
# Number of days accounted for in each dataset, per calendar month, for normalizing.
# Each list runs from January to December; the dictionaries below are keyed by
# month number (1 = January ... 12 = December).

# FOR BRIGHTKITE
_brightkite_days_per_month = [
    62, 56, 73,   # January, February, March
    90, 93, 90,   # April, May, June
    93, 93, 90,   # July, August, September
    80, 60, 62,   # October, November, December
]

# FOR GOWALLA
_gowalla_days_per_month = [
    31, 53, 62,   # January, February, March
    60, 62, 60,   # April, May, June
    62, 62, 60,   # July, August, September
    54, 30, 31,   # October, November, December
]

month_norm_brightkite = dict(enumerate(_brightkite_days_per_month, start=1))

month_norm_gowalla = dict(enumerate(_gowalla_days_per_month, start=1))


In [169]:
# these plots don't account for platform growth and may not prove too useful in the end

def plot_monthly_checkins(df,norms):
    """
    Draws a bar chart of the normalized number of check-ins per month.

    Parameters
    ----------
    df : DataFrame with a 'month' and a 'user' column.
    norms : mapping from month value to the divisor applied to that month's
            check-in count.

    Returns
    -------
    DataFrame of per-month non-null counts for every column of `df`
    (the result of `df.groupby('month').count()`).

    Raises
    ------
    KeyError if a month present in `df` has no entry in `norms`.
    """
    grouped_by_month = df.groupby('month')
    month_counts = grouped_by_month.count()

    # Use the months actually present in the data as bar positions: hard-coding 1..12
    # breaks (length mismatch) or misplaces bars as soon as one month has no check-in
    months = list(month_counts.index)
    month_freq = [month_counts.loc[month, 'user'] / norms[month] for month in months]

    plt.bar(months, month_freq)
    plt.xlabel('Month')
    plt.ylabel('Normalized number of check-ins')

    return month_counts
    

def plot_seasonal_checkins(df,norms):
    
    grouped_by_season = df.groupby('season')
    season_freq = []
    
    for i, group in grouped_by_season:
        season_freq.append((group.count()/norms[i])['user'])
        
    seasons = [i+1 for i in range(4)]
    plt.bar(seasons, season_freq)