In [1]:
import pickle
import numpy as np
import pandas as pd
import pygeohash as pgh
import matplotlib.pyplot as plt
import reverse_geocoder as rg

# Detecting Seasonal Patterns in User Movement

TODO

## Loading and cleaning the data

TODO

In [2]:
# Neither kind of file ships with a header row, so the column names are supplied here
checkin_header = ['user', 'checkin_time', 'latitude', 'longitude', 'location_id']
edges_header = ['user1', 'user2']


def _read_gzipped_tsv(path, column_names):
    """Read one headerless, gzip-compressed, tab-separated file into a dataframe."""
    return pd.read_csv(path, compression = 'gzip', sep = '\t', names = column_names)


# Check-ins and edge lists of both datasets, kept untouched as the "_orig" frames
checkin_brightkite_orig = _read_gzipped_tsv('data/loc-brightkite_totalCheckins.txt.gz', checkin_header)
edges_brightkite_orig = _read_gzipped_tsv('data/loc-brightkite_edges.txt.gz', edges_header)
checkin_gowalla_orig = _read_gzipped_tsv('data/loc-gowalla_totalCheckins.txt.gz', checkin_header)
edges_gowalla_orig = _read_gzipped_tsv('data/loc-gowalla_edges.txt.gz', edges_header)

In [3]:
# Flag every invalid check-in with a single boolean mask and filter once.
# A check-in is invalid when its latitude is outside [-90, 90], its longitude is
# outside [-180, 180], or both coordinates are exactly 0.
# Dropping the three index sets one after another raises a KeyError as soon as a
# row matches more than one condition, because its label has already been removed.
# Explicit comparisons are used on purpose: they evaluate to False for NaN, so rows
# with missing coordinates are kept here exactly as before.
gowalla_lat = checkin_gowalla_orig['latitude']
gowalla_lon = checkin_gowalla_orig['longitude']
gowalla_invalid = (
    (gowalla_lat < -90.0) | (gowalla_lat > 90.0)
    | (gowalla_lon < -180.0) | (gowalla_lon > 180.0)
    | ((gowalla_lat == 0) & (gowalla_lon == 0))
)

# Work on an independent copy so the original frame never has to be reloaded
checkin_gowalla = checkin_gowalla_orig[~gowalla_invalid].copy()

In [4]:
# Same cleaning as for the other data set, done with a single boolean mask.
# A check-in is invalid when its latitude is outside [-90, 90], its longitude is
# outside [-180, 180], or both coordinates are exactly 0.
# Dropping the three index sets one after another raises a KeyError as soon as a
# row matches more than one condition, because its label has already been removed.
brightkite_lat = checkin_brightkite_orig['latitude']
brightkite_lon = checkin_brightkite_orig['longitude']
brightkite_invalid = (
    (brightkite_lat < -90.0) | (brightkite_lat > 90.0)
    | (brightkite_lon < -180.0) | (brightkite_lon > 180.0)
    | ((brightkite_lat == 0) & (brightkite_lon == 0))
)

# Remove the invalid rows, then the rows with NaN values; the final copy makes the
# result independent of the original frame
checkin_brightkite = checkin_brightkite_orig[~brightkite_invalid].dropna().copy()

## Adding country information to check-ins

TODO

In [5]:
def coordinates_to_country(df):
    """
    Maps the coordinates of a dataframe to the country
    code using reverse geocoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'latitude' and a 'longitude' column.

    Returns
    -------
    list
        The 'cc' field of every result returned by the reverse geocoder,
        in the order the geocoder returns them. Empty when `df` has no rows.
    """
    # Nothing to geocode: answer directly instead of querying the geocoder
    if len(df) == 0:
        return []

    # Pair the two columns directly; a row-wise apply would build one Series
    # per row, which is needlessly slow on millions of check-ins
    coordinates_tuples = list(zip(df['latitude'], df['longitude']))
    geocodes = rg.search(coordinates_tuples)
    return [gc['cc'] for gc in geocodes]

In [6]:
# Tag every check-in of both datasets with the country code of its coordinates
for checkins in (checkin_brightkite, checkin_gowalla):
    checkins['cc'] = coordinates_to_country(checkins)

Loading formatted geocoded file...


## Determining the location of user homes

TODO

In [7]:
# Compute the geohash for each checkin entry: that will determine the cell in which each checkin belongs.
# The coordinate columns are iterated directly; a row-wise apply would build one
# Series per check-in, which is needlessly slow on millions of rows.
GEOHASH_PRECISION = 4  # Number of geohash characters, i.e. the size of a cell

checkin_brightkite['geohash'] = [
    pgh.encode(lat, lon, precision=GEOHASH_PRECISION)
    for lat, lon in zip(checkin_brightkite['latitude'], checkin_brightkite['longitude'])
]
checkin_gowalla['geohash'] = [
    pgh.encode(lat, lon, precision=GEOHASH_PRECISION)
    for lat, lon in zip(checkin_gowalla['latitude'], checkin_gowalla['longitude'])
]

In [None]:
# These two names are no longer needed to compute the homes; they are kept
# unchanged in case cells outside this section rely on them.
brightkite_users = set(checkin_brightkite.user) # All the users that made at least 1 checkin
brightkite_per_user = checkin_brightkite.groupby(['user']) # Group the checkins by user

# Number of checkins per (user, geohash) cell
brightkite_checkin_cells = checkin_brightkite.groupby(['user', 'geohash']).size()

# Most common cell of each user, as a (user, geohash) label.
# idxmax keeps the first maximum, so ties are resolved like in a per-user idxmax.
brightkite_home_cells = brightkite_checkin_cells.groupby(level = 'user').idxmax()

# Average position of the checkins inside every (user, geohash) cell
brightkite_cell_centers = checkin_brightkite.groupby(['user', 'geohash'])[['latitude', 'longitude']].mean()

# Keep only the home cell of each user. This replaces a Python loop that filtered
# the checkins of one user at a time, which is very slow with this many users.
# The result is indexed by user ID (unnamed index), one row per user, with the
# columns 'latitude' and 'longitude'.
brightkite_homes = (
    brightkite_cell_centers
    .loc[brightkite_home_cells.tolist()]
    .reset_index(level = 'geohash', drop = True)
    .rename_axis(None)
)

# Add country code information for home locations
brightkite_homes['cc'] = coordinates_to_country(brightkite_homes)

In [None]:
# These two names are no longer needed to compute the homes; they are kept
# unchanged in case cells outside this section rely on them.
gowalla_users = set(checkin_gowalla.user) # All the users that made at least 1 checkin
gowalla_per_user = checkin_gowalla.groupby(['user']) # Group the checkins by user

# Number of checkins per (user, geohash) cell
gowalla_checkin_cells = checkin_gowalla.groupby(['user', 'geohash']).size()

# Most common cell of each user, as a (user, geohash) label.
# idxmax keeps the first maximum, so ties are resolved like in a per-user idxmax.
gowalla_home_cells = gowalla_checkin_cells.groupby(level = 'user').idxmax()

# Average position of the checkins inside every (user, geohash) cell
gowalla_cell_centers = checkin_gowalla.groupby(['user', 'geohash'])[['latitude', 'longitude']].mean()

# Keep only the home cell of each user. This replaces a Python loop that filtered
# the checkins of one user at a time, which is very slow with this many users.
# The result is indexed by user ID (unnamed index), one row per user, with the
# columns 'latitude' and 'longitude'.
gowalla_homes = (
    gowalla_cell_centers
    .loc[gowalla_home_cells.tolist()]
    .reset_index(level = 'geohash', drop = True)
    .rename_axis(None)
)

# Add country code information for home locations
gowalla_homes['cc'] = coordinates_to_country(gowalla_homes)

### Pickle for later use

In [8]:
# Persist both check-in frames for later use.
# The context managers guarantee that each file is flushed and closed, even if
# pickling fails half-way; a bare open(...) would leave the handle open.
with open('pickles/checkin_brightkite', 'wb') as pickle_file:
    pickle.dump(checkin_brightkite, pickle_file)
with open('pickles/checkin_gowalla', 'wb') as pickle_file:
    pickle.dump(checkin_gowalla, pickle_file)

In [None]:
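# To reload later, assign the loaded objects back to their variables
# (only unpickle files that you created yourself):
# with open('pickles/checkin_brightkite', 'rb') as f: checkin_brightkite = pickle.load(f)
# with open('pickles/checkin_gowalla', 'rb') as f: checkin_gowalla = pickle.load(f)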