In [76]:
import pickle
import numpy as np
import pandas as pd
import pygeohash as pgh
import matplotlib.pyplot as plt
import reverse_geocoder as rg
from datetime import datetime

In [106]:
from tqdm import tqdm_notebook
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=4, progress_bar=True, use_memory_fs=False)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


# Detecting Seasonal Patterns in User Movement

TODO

## Loading and cleaning the data

TODO

In [90]:
# The datasets have no headers, so we have to name them for clarity
checkin_header = ['user', 'checkin_time', 'latitude', 'longitude', 'location_id']
edges_header = ['user1', 'user2']

# Load the data by specifying the correct compression algorithm, separator and column names
checkin_brightkite_orig = pd.read_csv('data/loc-brightkite_totalCheckins.txt.gz', compression = 'gzip', sep = '\t', names = checkin_header)
edges_brightkite_orig = pd.read_csv('data/loc-brightkite_edges.txt.gz', compression = 'gzip', sep = '\t', names = edges_header)
checkin_gowalla_orig = pd.read_csv('data/loc-gowalla_totalCheckins.txt.gz', compression = 'gzip', sep = '\t', names = checkin_header)
edges_gowalla_orig = pd.read_csv('data/loc-gowalla_edges.txt.gz', compression = 'gzip', sep = '\t', names = edges_header)

In [91]:
# Copy the dataframe in order not to have to reload the original one in case of a mistake
checkin_gowalla = checkin_gowalla_orig.copy()
to_drop = []
# Select indexes to drop depending on conditions mentionned above 
to_drop.append(checkin_gowalla_orig[(checkin_gowalla_orig['latitude'] < -90.0) | 
                               (checkin_gowalla_orig['latitude'] > 90.0)].index)
to_drop.append(checkin_gowalla_orig[(checkin_gowalla_orig['longitude'] < -180.0) | 
                               (checkin_gowalla_orig['longitude'] > 180.0)].index)
to_drop.append(checkin_gowalla_orig[(checkin_gowalla_orig['latitude'] == 0) & 
                               (checkin_gowalla_orig['longitude'] == 0)].index)
# Drop the invalid rows in place from the copied dataframe
for item in to_drop:
    checkin_gowalla.drop(item, inplace=True)

In [92]:
# Repeat the process on the other data set
checkin_brightkite = checkin_brightkite_orig.copy()
to_drop = []
to_drop.append(checkin_brightkite_orig[(checkin_brightkite_orig['latitude'] < -90.0) | 
                                  (checkin_brightkite_orig['latitude'] > 90.0)].index)
to_drop.append(checkin_brightkite_orig[(checkin_brightkite_orig['longitude'] < -180.0) | 
                                  (checkin_brightkite_orig['longitude'] > 180.0)].index)
to_drop.append(checkin_brightkite_orig[(checkin_brightkite_orig['latitude'] == 0) & 
                                  (checkin_brightkite_orig['longitude'] == 0)].index)
for item in to_drop:
    checkin_brightkite.drop(item, inplace = True)
# Drop rows with NaN values
checkin_brightkite.dropna(inplace = True)

## Adding country information to check-ins

TODO

In [108]:
def extract_coordinates(row):
    return (row['latitude'], row['longitude'])

def coordinates_to_country(df):
    """
    Maps the coordinates of a dataframe to the country
    code using reverse geocoding
    """
    coordinates_tuples = list(df.parallel_apply(extract_coordinates, axis = 1))
    geocodes = rg.search(coordinates_tuples)
    return [gc['cc'] for gc in geocodes]

In [109]:
# Append country codes to each check-in for both datasets
checkin_brightkite['cc'] = coordinates_to_country(checkin_brightkite)
checkin_gowalla['cc'] = coordinates_to_country(checkin_gowalla)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1122759), Label(value='0 / 1122759…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1610682), Label(value='0 / 1610682…

In [111]:
checkin_brightkite.head()

Unnamed: 0,user,checkin_time,latitude,longitude,location_id,cc,geohash
0,0,2010-10-17T01:48:53Z,39.747652,-104.99251,88c46bf20db295831bd2d1718ad7e6f5,US,9xj6
1,0,2010-10-16T06:02:04Z,39.891383,-105.070814,7a0f88982aa015062b95e3b4843f9ca2,US,9xj6
2,0,2010-10-16T03:48:54Z,39.891077,-105.068532,dd7cd3d264c2d063832db506fba8bf79,US,9xj6
3,0,2010-10-14T18:25:51Z,39.750469,-104.999073,9848afcc62e500a01cf6fbf24b797732f8963683,US,9xj6
4,0,2010-10-14T00:21:47Z,39.752713,-104.996337,2ef143e12038c870038df53e0478cefc,US,9xj6


## Determining the location of user homes

TODO

In [112]:
def geohash_encode(row):
    """
    Computes and returns the geohash with 
    precision 4 for a given row 
    """
    return pgh.encode(row.latitude, row.longitude, precision=4)

def find_home(checkins, user_id):
    """
    Given a user id, finds the most common world cell and 
    computes average location to estimate home location
    """
    group = checkins[checkins['user']  == user_id] # Retrieve all the user's checkins
    geohash = group.groupby('geohash').size().idxmax() # Get the most common geohash for the user
    
    # Keep all checkins that happened in the most common geohash and compute average position
    return group[group['geohash'] == geohash][['latitude', 'longitude']].mean()

def find_home_brightkite(user_id):
    return find_home(checkin_brightkite, user_id)

def find_home_gowalla(user_id):
    return find_home(checkin_gowalla, user_id)

In [113]:
# Compute the geohash for each checkin entry: that will determine the cell in which each checkin belongs
checkin_brightkite['geohash'] = checkin_brightkite.parallel_apply(geohash_encode , axis = 1)
checkin_gowalla['geohash'] = checkin_gowalla.parallel_apply(geohash_encode , axis = 1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1122759), Label(value='0 / 1122759…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1610682), Label(value='0 / 1610682…

In [114]:
brightkite_users = set(checkin_brightkite.user) # All the users that made at least 1 checkin

brightkite_homes = pd.DataFrame(index = brightkite_users) # create empty dataframe with user ids
brightkite_homes = brightkite_homes.index.to_series().parallel_apply(find_home_brightkite) # apply function to all users

brightkite_homes.head()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12672), Label(value='0 / 12672')))…

Unnamed: 0,latitude,longitude
0,39.783678,-104.992871
1,37.643427,-122.416294
2,39.777215,-105.003989
3,37.765367,-122.42454
4,60.174029,24.937171


In [115]:
gowalla_users = set(checkin_gowalla.user) # All the users that made at least 1 checkin

gowalla_homes = pd.DataFrame(index = gowalla_users) # create empty dataframe with user ids
gowalla_homes = gowalla_homes.index.to_series().parallel_apply(find_home_gowalla) # apply function to all users

gowalla_homes.head()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=26768), Label(value='0 / 26768')))…

Unnamed: 0,latitude,longitude
0,30.264497,-97.753396
1,45.202484,-0.746831
2,34.046406,-118.341572
4,37.776338,-122.436777
5,30.287875,-97.722523


### Pickle for later use

In [None]:
pickle.dump(checkin_brightkite, open('pickles/checkin_brightkite', 'wb'))
pickle.dump(checkin_gowalla, open('pickles/checkin_gowalla', 'wb'))

In [116]:
# To reload for later
# pickle.load(open('pickles/checkin_brightkite', 'rb'))
# pickle.load(open('pickles/checkin_gowalla', 'rb'))

### Months & Seasons

In [118]:
def get_month(row):
    return datetime.strptime(row['checkin_time'], '%Y-%m-%dT%H:%M:%SZ').month

In [119]:
checkin_brightkite['month'] = checkin_brightkite.parallel_apply(get_month, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1122759), Label(value='0 / 1122759…

In [121]:
checkin_gowalla['month'] = checkin_gowalla.parallel_apply(get_month, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1610682), Label(value='0 / 1610682…

In [120]:
## function that calculates the season of a given datetime object  

Y = 2000 # dummy leap year to allow input X-02-29 (leap day)
seasons = [('winter', (datetime(Y,  1,  1),  datetime(Y,  3, 20))),
           ('spring', (datetime(Y,  3, 21),  datetime(Y,  6, 20))),
           ('summer', (datetime(Y,  6, 21),  datetime(Y,  9, 22))),
           ('autumn', (datetime(Y,  9, 23),  datetime(Y, 12, 20))),
           ('winter', (datetime(Y, 12, 21),  datetime(Y, 12, 31)))]

def get_season(row):
    
    date_time = datetime.strptime(row['checkin_time'], '%Y-%m-%dT%H:%M:%SZ')
    date_time = date_time.replace(year=Y, hour=0, minute=0, second=0)
    
    return next(season for season, (start, end) in seasons
                if start <= date_time <= end)

In [123]:
checkin_brightkite['season'] = checkin_brightkite.parallel_apply(get_season, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1122759), Label(value='0 / 1122759…

In [124]:
checkin_gowalla['season'] = checkin_gowalla.parallel_apply(lambda row: get_season(row), axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1610682), Label(value='0 / 1610682…

In [125]:
# Might need this
# month_norm_brightkite = {1:62,2:56,3:73,4:90,5:93,6:90,7:93,8:93,9:90,10:80,11:60,12:62}
# month_norm_gowalla = {1:31,2:53,3:62,4:60,5:62,6:60,7:62,8:62,9:60,10:54,11:30,12:31}

### Identify travelers

In [None]:
# A user is a traveller if he has ever checked in to more than one country
def traveler_check(group):
    return True if len(set(group['cc'])) > 1 else False
# traveler_check = lambda group: 

# Find travellers in both datasets
gowalla_homes['traveler'] = checkin_gowalla.groupby('user').parallel_apply(traveler_check)
brightkite_homes['traveler'] = checkin_brightkite.groupby('user').parallel_apply(traveler_check)

Process ForkPoolWorker-174:
Process ForkPoolWorker-175:
Process ForkPoolWorker-173:
Traceback (most recent call last):


In [None]:
print()