# Milestone P4 : Creative Extension

## Libraries

In [1]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import datetime
import math
import matplotlib.image as mpimg
%matplotlib inline

## The data

In [2]:
# Every input file lives under DATA_FOLDER.
DATA_FOLDER = 'data/'

# Edge lists and check-in logs (gzip-compressed, tab-separated).
GOWALLA_EDGES_DATASET = DATA_FOLDER + "loc-gowalla_edges.txt.gz"
GOWALLA_CHECKINS_DATASET = DATA_FOLDER + "loc-gowalla_totalCheckins.txt.gz"
BRIGHTKITE_EDGES_DATASET = DATA_FOLDER + "loc-brightkite_edges.txt.gz"
BRIGHTKITE_CHECKINS_DATASET = DATA_FOLDER + "loc-brightkite_totalCheckins.txt.gz"
# Population per country and home location per user (CSV).
POPULATION_DATASET = DATA_FOLDER + "pop_per_country.csv"
BRIGHTKITE_HOME_LOCATIONS = DATA_FOLDER + "home_loc_B_country.csv"
GOWALLA_HOME_LOCATIONS = DATA_FOLDER + "home_loc_G_country.csv"

# Reader options shared by the four gzip files; they are read without a header
# row, so the column names are supplied explicitly.
_GZIP_TSV_OPTIONS = dict(compression='gzip', delimiter='\t', header=None)
_EDGE_COLUMNS = ['userA', 'userB']
_CHECKIN_COLUMNS = ['user', 'checkin_time', 'latitude', 'longitude', 'location_id']

gowalla_edges = pd.read_csv(GOWALLA_EDGES_DATASET, names=_EDGE_COLUMNS, **_GZIP_TSV_OPTIONS)
gowalla_checkins = pd.read_csv(GOWALLA_CHECKINS_DATASET, names=_CHECKIN_COLUMNS, **_GZIP_TSV_OPTIONS)
brightkite_edges = pd.read_csv(BRIGHTKITE_EDGES_DATASET, names=_EDGE_COLUMNS, **_GZIP_TSV_OPTIONS)
brightkite_checkins = pd.read_csv(BRIGHTKITE_CHECKINS_DATASET, names=_CHECKIN_COLUMNS, **_GZIP_TSV_OPTIONS)

# The population file's own header row is replaced by the names given here,
# and its first column becomes the index.
population = pd.read_csv(POPULATION_DATASET, index_col=0, names=['country', 'population'], header=0)

# Home-location files are ';'-separated and indexed by user id.
brightkite_home_locations = pd.read_csv(BRIGHTKITE_HOME_LOCATIONS, sep=';', index_col='user')
gowalla_home_locations = pd.read_csv(GOWALLA_HOME_LOCATIONS, sep=';', index_col='user')

In [3]:
# Rename two countries so that they match the labels used in the home-location
# files (e.g. "United States of America", lower-cased further down).
population = population.replace({
    'united states': 'united states of america',
    'netherlands': 'the netherlands',
})

In [4]:
# Preview the first three rows of the population table.
population.iloc[:3]

Unnamed: 0,country,population
0,afghanistan,31056997
1,albania,3581655
2,algeria,32930091


In [5]:
# Ten most populous countries.
(
    population
    .sort_values(by='population', ascending=False)
    .iloc[:10]
)

Unnamed: 0,country,population
24,china,1313973713
51,india,1095351995
127,united states of america,298444215
52,indonesia,245452739
15,brazil,188078227
93,pakistan,165803560
9,bangladesh,147365352
101,russia,142893540
91,nigeria,131859731
59,japan,127463611


In [6]:
# Preview the first three Brightkite home locations.
brightkite_home_locations.iloc[:3]

Unnamed: 0_level_0,lat,lon,country
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,39.747913,-104.968944,United States of America
1,37.599899,-122.372723,United States of America
2,39.738874,-104.954143,United States of America


In [7]:
# Preview the first three Gowalla home locations.
gowalla_home_locations.iloc[:3]

Unnamed: 0_level_0,lat,lon,country
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,30.259167,-97.750223,United States of America
1,50.381005,3.43071,France
2,34.052243,-118.327641,United States of America


## Step 1 : defining the geographic boundaries of our study

Before starting the study, we have to determine which countries it would be relevant to work with. To estimate the quantity of data available in each country, we compute the number of homes (i.e. users), as defined in the replication, that are located in each country. 

The point is to work only on countries in which the amount of data seems sufficient to draw robust conclusions. 

In [8]:
# Lower-case the country labels and remove leading whitespace so that they can
# be matched against the lower-case country names of the population table.
brightkite_home_locations['country'] = brightkite_home_locations['country'].str.lower().str.lstrip()
gowalla_home_locations['country'] = gowalla_home_locations['country'].str.lower().str.lstrip()

In [9]:
# Number of homes per country, largest first. count() tallies the non-null
# values of each column; the 'lat' tally is kept as 'count' and the 'lon'
# tally is discarded.
brightkite_homes_per_country = (
    brightkite_home_locations
    .groupby(['country'])
    .count()
    .sort_values(by=['lat'], ascending=False)
    .drop(columns=['lon'])
    .rename(columns={'lat': 'count'})
    .reset_index()
)

gowalla_homes_per_country = (
    gowalla_home_locations
    .groupby(['country'])
    .count()
    .sort_values(by=['lat'], ascending=False)
    .drop(columns=['lon'])
    .rename(columns={'lat': 'count'})
    .reset_index()
)

gowalla_homes_per_country.head(10)

Unnamed: 0,country,count
0,undefined,60323
1,united states of america,26419
2,sweden,5958
3,united kingdom,2317
4,germany,1930
5,norway,1074
6,canada,1046
7,saudi arabia,720
8,thailand,543
9,australia,534


In [10]:
# Attach each country's population to its home count. The inner join keeps only
# countries found in both frames, so labels with no match in `population`
# (such as Gowalla's 'undefined') are dropped.
#
# Selecting ['country', 'count'] first makes this cell safe to re-run: without
# it, a second execution would merge an already-merged frame and produce
# 'population_x' / 'population_y' columns instead of 'population'.
brightkite_homes_per_country = brightkite_homes_per_country[['country', 'count']].merge(population, how = 'inner', on = ['country'])
gowalla_homes_per_country = gowalla_homes_per_country[['country', 'count']].merge(population, how = 'inner', on = ['country'])

In [11]:
# First three Brightkite rows after the population merge.
brightkite_homes_per_country.iloc[:3]

Unnamed: 0,country,count,population
0,united states of america,28006,298444215
1,united kingdom,2815,60609153
2,japan,2678,127463611


In [12]:
# First three Gowalla rows after the population merge.
gowalla_homes_per_country.iloc[:3]

Unnamed: 0,country,count,population
0,united states of america,26419,298444215
1,sweden,5958,9016596
2,united kingdom,2317,60609153


The number of homes per country is normalized by the population of the country. 

In [13]:
# Homes per inhabitant: each country's home count divided by its population.
brightkite_homes_per_country['count_normalized'] = (
    brightkite_homes_per_country['count'].div(brightkite_homes_per_country['population'])
)
gowalla_homes_per_country['count_normalized'] = (
    gowalla_homes_per_country['count'].div(gowalla_homes_per_country['population'])
)

In [14]:
# Ten Brightkite countries with the most homes per inhabitant.
(
    brightkite_homes_per_country
    .sort_values(by='count_normalized', ascending=False)
    .iloc[:10]
)

Unnamed: 0,country,count,population,count_normalized
10,norway,447,4610820,9.7e-05
0,united states of america,28006,298444215,9.4e-05
41,luxembourg,39,474413,8.2e-05
7,sweden,667,9016596,7.4e-05
3,australia,1253,20264082,6.2e-05
13,finland,289,5231372,5.5e-05
1,united kingdom,2815,60609153,4.6e-05
6,the netherlands,705,16491461,4.3e-05
4,canada,1250,33098932,3.8e-05
22,singapore,157,4492150,3.5e-05


In [15]:
# Ten Gowalla countries with the most homes per inhabitant.
(
    gowalla_homes_per_country
    .sort_values(by='count_normalized', ascending=False)
    .iloc[:10]
)

Unnamed: 0,country,count,population,count_normalized
1,sweden,5958,9016596,0.000661
4,norway,1074,4610820,0.000233
35,luxembourg,53,474413,0.000112
0,united states of america,26419,298444215,8.9e-05
10,belgium,511,10379067,4.9e-05
14,switzerland,325,7523934,4.3e-05
16,denmark,210,5450661,3.9e-05
2,united kingdom,2317,60609153,3.8e-05
22,singapore,164,4492150,3.7e-05
9,the netherlands,532,16491461,3.2e-05


We will only work with countries whose `count_normalized` value is greater than or equal to a given rate. 

In [19]:
# Minimum number of homes per inhabitant for a country to be kept (inclusive).
rate = 0.00001

# Keep the rows whose normalized home count reaches the threshold.
brightkite_countries = brightkite_homes_per_country.loc[brightkite_homes_per_country['count_normalized'].ge(rate)]
gowalla_countries = gowalla_homes_per_country.loc[gowalla_homes_per_country['count_normalized'].ge(rate)]

print('With this rate,', brightkite_countries.shape[0], 'countries are taken in the study for the Brightkite dataset.')
print('With this rate,', gowalla_countries.shape[0], 'countries are taken in the study for the Gowalla dataset.')

With this rate, 27 countries are taken in the study for the Brightkite dataset.
With this rate, 27 countries are taken in the study for the Gowalla dataset.


In [22]:
# Countries that pass the threshold in both datasets. Columns shared by the two
# frames get the default suffixes: '_x' for Brightkite (left) and '_y' for
# Gowalla (right).
countries = brightkite_countries.merge(gowalla_countries, how='inner', on=['country'])
print('With this rate,', countries.shape[0], 'countries are taken in the study for both datasets.')

With this rate, 23 countries are taken in the study for both datasets.


In [23]:
# First five countries retained for both datasets.
countries.iloc[:5]

Unnamed: 0,country,count_x,population_x,count_normalized_x,count_y,population_y,count_normalized_y
0,united states of america,28006,298444215,9.4e-05,26419,298444215,8.9e-05
1,united kingdom,2815,60609153,4.6e-05,2317,60609153,3.8e-05
2,australia,1253,20264082,6.2e-05,534,20264082,2.6e-05
3,canada,1250,33098932,3.8e-05,1046,33098932,3.2e-05
4,germany,1195,82422299,1.4e-05,1930,82422299,2.3e-05
