# Milestone P4 : Creative Extension

## Libraries

In [1]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import datetime
import math
import matplotlib.image as mpimg
%matplotlib inline

## The data

In [2]:
# Folder that holds every input file read by this notebook.
DATA_FOLDER = 'data/'

# Paths of the individual input files.
GOWALLA_EDGES_DATASET = DATA_FOLDER + "loc-gowalla_edges.txt.gz"
GOWALLA_CHECKINS_DATASET = DATA_FOLDER + "loc-gowalla_totalCheckins.txt.gz"
BRIGHTKITE_EDGES_DATASET = DATA_FOLDER + "loc-brightkite_edges.txt.gz"
BRIGHTKITE_CHECKINS_DATASET = DATA_FOLDER + "loc-brightkite_totalCheckins.txt.gz"
POPULATION_DATASET = DATA_FOLDER + "pop_per_country.csv"
BRIGHTKITE_HOME_LOCATIONS = DATA_FOLDER + "home_loc_B_country.csv"
GOWALLA_HOME_LOCATIONS = DATA_FOLDER + "home_loc_G_country.csv"

# Column names given to the header-less edge and check-in files.
_EDGE_COLUMNS = ['userA', 'userB']
_CHECKIN_COLUMNS = ['user', 'checkin_time', 'latitude', 'longitude', 'location_id']


def _read_gzipped_tsv(path, columns):
    '''reads a gzip-compressed, tab-separated file without header row into a DataFrame'''
    return pd.read_csv(path, compression='gzip', delimiter='\t', names=columns, header=None)


# Friendship edges and check-ins of both social networks.
gowalla_edges = _read_gzipped_tsv(GOWALLA_EDGES_DATASET, _EDGE_COLUMNS)
gowalla_checkins = _read_gzipped_tsv(GOWALLA_CHECKINS_DATASET, _CHECKIN_COLUMNS)
brightkite_edges = _read_gzipped_tsv(BRIGHTKITE_EDGES_DATASET, _EDGE_COLUMNS)
brightkite_checkins = _read_gzipped_tsv(BRIGHTKITE_CHECKINS_DATASET, _CHECKIN_COLUMNS)

# Population per country: the file's own header row is replaced by our column names.
population = pd.read_csv(POPULATION_DATASET, index_col=0, names=['country', 'population'], header=0)

# Home location (lat, lon, country) of every user, indexed by user id.
brightkite_home_locations = pd.read_csv(BRIGHTKITE_HOME_LOCATIONS, sep=';', index_col='user')
gowalla_home_locations = pd.read_csv(GOWALLA_HOME_LOCATIONS, sep=';', index_col='user')

In [3]:
# Rename two countries in the population table to the longer spelling used for
# them in the rest of the notebook.
for _old_name, _new_name in (
    ('united states', 'united states of america'),
    ('netherlands', 'the netherlands'),
):
    population = population.replace(_old_name, _new_name)

In [4]:
# Preview of the population table.
population.head(n=3)

Unnamed: 0,country,population
0,afghanistan,31056997
1,albania,3581655
2,algeria,32930091


In [5]:
# The ten most populated countries.
_population_by_size = population.sort_values(by='population', ascending=False)
_population_by_size.head(10)

Unnamed: 0,country,population
24,china,1313973713
51,india,1095351995
127,united states of america,298444215
52,indonesia,245452739
15,brazil,188078227
93,pakistan,165803560
9,bangladesh,147365352
101,russia,142893540
91,nigeria,131859731
59,japan,127463611


In [6]:
# Preview of the Brightkite home locations.
brightkite_home_locations.head(n=3)

Unnamed: 0_level_0,lat,lon,country
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,39.747913,-104.968944,United States of America
1,37.599899,-122.372723,United States of America
2,39.738874,-104.954143,United States of America


In [7]:
# Preview of the Gowalla home locations.
gowalla_home_locations.head(n=3)

Unnamed: 0_level_0,lat,lon,country
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,30.259167,-97.750223,United States of America
1,50.381005,3.43071,France
2,34.052243,-118.327641,United States of America


## Step 1 : defining the geographic boundaries of our study

Before starting the study, we have to determine which countries are relevant to work with. To estimate the quantity of data available in each country, we compute the number of homes (i.e. users, with homes defined as in the replication) located in each country.

The goal is to work only with countries for which the amount of data seems sufficient to draw robust conclusions.

In [8]:
# Normalise the country names of both datasets: lower-case them and remove
# leading whitespace.
for _home_locations in (brightkite_home_locations, gowalla_home_locations):
    _home_locations['country'] = _home_locations['country'].str.lower().str.lstrip()

In [9]:
def _homes_per_country(home_locations):
    '''counts the homes located in each country, largest count first

    Returns a DataFrame with a fresh integer index and the columns `country`
    and `count` (number of non-null `lat` values for that country).
    '''
    return (
        home_locations
        .groupby(['country'])
        .count()
        .sort_values(by=['lat'], ascending=False)
        .drop(columns=['lon'])
        .rename(columns={'lat': 'count'})
        .reset_index()
    )


brightkite_homes_per_country = _homes_per_country(brightkite_home_locations)
gowalla_homes_per_country = _homes_per_country(gowalla_home_locations)

gowalla_homes_per_country.head(10)

Unnamed: 0,country,count
0,united states of america,51328
1,sweden,18034
2,united kingdom,5433
3,germany,5214
4,norway,3650
5,canada,2221
6,saudi arabia,1917
7,thailand,1655
8,belgium,1569
9,australia,1310


In [19]:
# NOTE(review): the saved output of this cell (execution count 19) already shows
# `population` and `count_normalized`, columns that are only added by the merge
# and normalization cells further down, so the cell was run out of order.
# On a fresh "Restart & Run All" it displays only `country` and `count`.
brightkite_homes_per_country.head(n=10)

Unnamed: 0,country,count,population,count_normalized
0,united states of america,30734,298444215,0.000103
1,united kingdom,3243,60609153,5.4e-05
2,japan,2860,127463611,2.2e-05
3,australia,1453,20264082,7.2e-05
4,canada,1415,33098932,4.3e-05
5,germany,1345,82422299,1.6e-05
6,sweden,835,9016596,9.3e-05
7,the netherlands,791,16491461,4.8e-05
8,italy,631,58133509,1.1e-05
9,norway,591,4610820,0.000128


In [10]:
# Attach each country's population to the per-country home counts. The inner
# join keeps only the countries whose name appears in both tables.
# The guards make the cell safe to re-run: merging a second time would duplicate
# the column as `population_x` / `population_y`, and the later lookup of
# `population` would then fail.
if 'population' not in brightkite_homes_per_country.columns:
    brightkite_homes_per_country = brightkite_homes_per_country.merge(population, how='inner', on=['country'])
if 'population' not in gowalla_homes_per_country.columns:
    gowalla_homes_per_country = gowalla_homes_per_country.merge(population, how='inner', on=['country'])

In [11]:
# Brightkite home counts with the population column attached.
brightkite_homes_per_country.head(n=3)

Unnamed: 0,country,count,population
0,united states of america,30734,298444215
1,united kingdom,3243,60609153
2,japan,2860,127463611


In [12]:
# Gowalla home counts with the population column attached.
gowalla_homes_per_country.head(n=3)

Unnamed: 0,country,count,population
0,united states of america,51328,298444215
1,sweden,18034,9016596
2,united kingdom,5433,60609153


The number of homes per country is normalized by the population of the country.

In [13]:
# Homes per inhabitant: number of homes divided by the country's population.
for _homes in (brightkite_homes_per_country, gowalla_homes_per_country):
    _homes['count_normalized'] = _homes['count'].div(_homes['population'])

In [14]:
# Countries with the most Brightkite homes per inhabitant.
_brightkite_by_density = brightkite_homes_per_country.sort_values(by='count_normalized', ascending=False)
_brightkite_by_density.head(10)

Unnamed: 0,country,count,population,count_normalized
9,norway,591,4610820,0.000128
0,united states of america,30734,298444215,0.000103
6,sweden,835,9016596,9.3e-05
41,luxembourg,40,474413,8.4e-05
3,australia,1453,20264082,7.2e-05
13,finland,367,5231372,7e-05
1,united kingdom,3243,60609153,5.4e-05
7,the netherlands,791,16491461,4.8e-05
4,canada,1415,33098932,4.3e-05
38,estonia,53,1324333,4e-05


In [15]:
# Countries with the most Gowalla homes per inhabitant.
_gowalla_by_density = gowalla_homes_per_country.sort_values(by='count_normalized', ascending=False)
_gowalla_by_density.head(10)

Unnamed: 0,country,count,population,count_normalized
1,sweden,18034,9016596,0.002
4,norway,3650,4610820,0.000792
29,luxembourg,171,474413,0.00036
0,united states of america,51328,298444215,0.000172
8,belgium,1569,10379067,0.000151
11,switzerland,989,7523934,0.000131
17,denmark,606,5450661,0.000111
22,singapore,445,4492150,9.9e-05
2,united kingdom,5433,60609153,9e-05
10,the netherlands,1258,16491461,7.6e-05


We only keep the countries whose `count_normalized` value is greater than or equal to a given rate.

In [16]:
# Minimum number of homes per inhabitant a country needs to be kept (1e-5 == 0.00001).
rate = 1e-5

# For each dataset, keep the countries that reach the threshold.
_brightkite_dense_enough = brightkite_homes_per_country['count_normalized'].ge(rate)
_gowalla_dense_enough = gowalla_homes_per_country['count_normalized'].ge(rate)
brightkite_countries = brightkite_homes_per_country.loc[_brightkite_dense_enough]
gowalla_countries = gowalla_homes_per_country.loc[_gowalla_dense_enough]

print('With this rate,', len(brightkite_countries), 'countries are taken in the study for the Brightkite dataset.')
print('With this rate,', len(gowalla_countries), 'countries are taken in the study for the Gowalla dataset.')

With this rate, 28 countries are taken in the study for the Brightkite dataset.
With this rate, 38 countries are taken in the study for the Gowalla dataset.


In [17]:
# Countries that are ok both for the Gowalla and Brightkite datasets.
# Columns present in both inputs get the `_x` (Brightkite) and `_y` (Gowalla) suffixes.
countries = brightkite_countries.merge(gowalla_countries, how='inner', on=['country'])
print('With this rate,', len(countries), 'countries are taken in the study for both datasets.')

With this rate, 25 countries are taken in the study for both datasets.


In [18]:
# Preview of the countries kept for both datasets.
countries.head(n=5)

Unnamed: 0,country,count_x,population_x,count_normalized_x,count_y,population_y,count_normalized_y
0,united states of america,30734,298444215,0.000103,51328,298444215,0.000172
1,united kingdom,3243,60609153,5.4e-05,5433,60609153,9e-05
2,australia,1453,20264082,7.2e-05,1310,20264082,6.5e-05
3,canada,1415,33098932,4.3e-05,2221,33098932,6.7e-05
4,germany,1345,82422299,1.6e-05,5214,82422299,6.3e-05


# Distance from home

In [22]:
# Attach every user's home location (`lat`, `lon`, `country`) to their check-ins.
# The left join keeps the check-ins of users without a home-location entry.
# The guards make the cell safe to re-run: a second merge would add the home
# columns again (e.g. `country_x` / `country_y`) and break the later
# per-country grouping.
if 'country' not in gowalla_checkins.columns:
    gowalla_checkins = pd.merge(gowalla_checkins, gowalla_home_locations, how='left', on=['user'])
if 'country' not in brightkite_checkins.columns:
    brightkite_checkins = pd.merge(brightkite_checkins, brightkite_home_locations, how='left', on=['user'])

In [24]:
# Make explicit that `lat` / `lon` are the coordinates of the user's home, as
# opposed to the check-in's own `latitude` / `longitude`.
_HOME_COORDINATE_NAMES = {'lat': 'home_lat', 'lon': 'home_lon'}
for _checkins in (gowalla_checkins, brightkite_checkins):
    _checkins.rename(columns=_HOME_COORDINATE_NAMES, inplace=True)

In [29]:
import haversine as hvrs

In [30]:
def distance_from_home(row):
    '''returns the haversine distance between a check-in and the user's home

    `row` must give the check-in position under 'latitude' / 'longitude' and
    the home position under 'home_lat' / 'home_lon'. The result is whatever
    `hvrs.haversine` returns for the two (lat, lon) pairs (presumably
    kilometres, the unit used for the threshold later on; confirm against the
    installed haversine version).
    '''
    checkin_position = (row['latitude'], row['longitude'])
    home_position = (row['home_lat'], row['home_lon'])
    return hvrs.haversine(checkin_position, home_position)

In [31]:
# Row-wise computation (one Python call per check-in), so expect this cell to be slow.
gowalla_checkins['distance_from_home'] = gowalla_checkins.apply(distance_from_home, axis=1)

In [38]:
# Row-wise computation (one Python call per check-in), so expect this cell to be slow.
brightkite_checkins['distance_from_home'] = brightkite_checkins.apply(distance_from_home, axis=1)

In [39]:
# Sanity check: one check-in with its home coordinates and computed distance.
brightkite_checkins.head(n=1)

Unnamed: 0,user,checkin_time,latitude,longitude,location_id,home_lat,home_lon,country,distance_from_home
0,0,2010-10-17T01:48:53Z,39.747652,-104.99251,88c46bf20db295831bd2d1718ad7e6f5,39.747913,-104.968944,united states of america,2.014929


In [40]:
# Check-ins at least this far (in km) from the user's home are kept as "far" check-ins.
distance = 200

_gowalla_is_far = gowalla_checkins['distance_from_home'] >= distance
_brightkite_is_far = brightkite_checkins['distance_from_home'] >= distance
gowalla_far_checkins = gowalla_checkins[_gowalla_is_far]
brightkite_far_checkins = brightkite_checkins[_brightkite_is_far]

# Number of far check-ins in each dataset.
len(gowalla_far_checkins), len(brightkite_far_checkins)

(942501, 591617)

In [47]:
# Number of far check-ins per home country, largest first. The result is a
# one-column DataFrame (`count`) indexed by `country`.
# Only the `user` column is counted: the previous version counted every column
# and then dropped the others with `drop(labels, 1, inplace=True)`, whose
# positional `axis` argument is rejected by pandas >= 2.0.
gowalla_far_checkins_per_country = (
    gowalla_far_checkins
    .groupby('country')['user']
    .count()
    .sort_values(ascending=False)
    .to_frame('count')
)
gowalla_far_checkins_per_country.head(10)

Unnamed: 0_level_0,count
country,Unnamed: 1_level_1
united states of america,638386
sweden,103685
germany,39042
united kingdom,29177
norway,23818
canada,15616
saudi arabia,12494
australia,7209
france,6164
belgium,4676


In [48]:
# Number of far check-ins per home country, largest first. The result is a
# one-column DataFrame (`count`) indexed by `country`.
# Only the `user` column is counted: the previous version counted every column
# and then dropped the others with `drop(labels, 1, inplace=True)`, whose
# positional `axis` argument is rejected by pandas >= 2.0.
brightkite_far_checkins_per_country = (
    brightkite_far_checkins
    .groupby('country')['user']
    .count()
    .sort_values(ascending=False)
    .to_frame('count')
)
brightkite_far_checkins_per_country.head(10)

Unnamed: 0_level_0,count
country,Unnamed: 1_level_1
united states of america,399685
japan,65018
united kingdom,17390
germany,11122
australia,10638
canada,9384
sweden,8657
france,7535
spain,7043
italy,5559
