# Milestone P4 : Creative Extension

## Libraries

In [1]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import datetime
import math
import matplotlib.image as mpimg
%matplotlib inline

## The data

In [2]:
# All input files are read relative to this folder.
DATA_FOLDER = 'data/'

# Paths of the input datasets.
GOWALLA_CHECKINS_DATASET = DATA_FOLDER+"G_loc_country.csv"
BRIGHTKITE_CHECKINS_DATASET = DATA_FOLDER+"B_loc_country.csv"
POPULATION_DATASET = DATA_FOLDER+"pop_per_country.csv"
BRIGHTKITE_HOME_LOCATIONS = DATA_FOLDER+"new_home_loc_B_country.csv"
GOWALLA_HOME_LOCATIONS = DATA_FOLDER+"new_home_loc_G_country.csv"
COUNTRY_CODES = DATA_FOLDER+"countrycodes.csv"
LANGUAGE_COUNTRIES = DATA_FOLDER+"language_list_by_country.xlsx"
LANGUAGE_COUNTRIES_BIS = DATA_FOLDER+"languages.xlsx"
BRIGHTKITE_TRIPS = DATA_FOLDER+"country_brightkite_trips.csv"
GOWALLA_TRIPS = DATA_FOLDER+"country_gowalla_trips.csv"
COUNTRIES_GDP = DATA_FOLDER+"countries_of_the_world.csv"

# NOTE(review): these file names match the CSVs written later in this notebook
# with to_csv (into the working directory, not DATA_FOLDER) -- confirm the
# files are moved into DATA_FOLDER before this cell is re-run.
BRIGHTKITE_TRIPS_FULL = DATA_FOLDER+"brightkite_trips_full_infos.csv"
GOWALLA_TRIPS_FULL = DATA_FOLDER+"gowalla_trips_full_infos.csv"

# Check-ins: only columns 1-5 are read, column 0 is skipped
# (presumably a saved index column -- TODO confirm).
gowalla_checkins = pd.read_csv(GOWALLA_CHECKINS_DATASET, delimiter = ',', usecols = [1,2,3,4,5])
brightkite_checkins = pd.read_csv(BRIGHTKITE_CHECKINS_DATASET, delimiter = ',', usecols = [1,2,3,4,5])
# Population: the file's own header row is replaced by the given names and
# the first column is used as the index.
population = pd.read_csv(POPULATION_DATASET, index_col = 0, names = ['country', 'population'], header = 0)
brightkite_home_locations = pd.read_csv(BRIGHTKITE_HOME_LOCATIONS, sep =',')
gowalla_home_locations = pd.read_csv(GOWALLA_HOME_LOCATIONS, sep =',')
country_codes = pd.read_csv(COUNTRY_CODES, sep =',')
# Language tables: the columns are renamed to 'country' / 'language'.
languages = pd.read_excel(LANGUAGE_COUNTRIES, names = ['country', 'language'])
languages2 = pd.read_excel(LANGUAGE_COUNTRIES_BIS, names = ['country', 'language'])
brightkite_trips = pd.read_csv(BRIGHTKITE_TRIPS)
gowalla_trips = pd.read_csv(GOWALLA_TRIPS)
brightkite_trips_full = pd.read_csv(BRIGHTKITE_TRIPS_FULL)
gowalla_trips_full = pd.read_csv(GOWALLA_TRIPS_FULL)
# Only columns 0 and 8 of the file are kept and renamed to 'country' / 'gdp'.
countries_gdp = pd.read_csv(COUNTRIES_GDP, delimiter = ',', usecols = [0, 8], names = ['country', 'gdp'], header = 0)

## Cleaning the data

In [3]:
population.sort_values(by = ['population'], ascending = False).head(10)

Unnamed: 0,country,population
24,china,1313973713
51,india,1095351995
127,united states,298444215
52,indonesia,245452739
15,brazil,188078227
93,pakistan,165803560
9,bangladesh,147365352
101,russia,142893540
91,nigeria,131859731
59,japan,127463611


In [4]:
brightkite_home_locations.head(3)

Unnamed: 0,user,lat,lon,country
0,0,39.747913,-104.968944,US
1,1,37.599899,-122.372723,US
2,2,39.738874,-104.954143,US


In [5]:
gowalla_home_locations.head(3)

Unnamed: 0,user,lat,lon,country
0,0,30.259167,-97.750223,US
1,1,50.381005,3.43071,FR
2,2,34.052243,-118.327641,US


In [6]:
country_codes.head()

Unnamed: 0,Name,Code
0,Afghanistan,AF
1,Åland Islands,AX
2,Albania,AL
3,Algeria,DZ
4,American Samoa,AS


In [7]:
# Replace the two-letter country code of each home location by the full
# country name found in `country_codes` (left join: unknown codes give NaN).
gowalla_home_locations = (
    gowalla_home_locations
    .merge(country_codes, left_on='country', right_on='Code', how='left')
    .drop(columns=['country', 'Code'])
    .rename(columns={'Name': 'country'})
)

brightkite_home_locations = (
    brightkite_home_locations
    .merge(country_codes, left_on='country', right_on='Code', how='left')
    .drop(columns=['country', 'Code'])
    .rename(columns={'Name': 'country'})
)

In [8]:
# Same code -> name substitution as for the home locations, applied to the
# check-in tables (left join: unknown codes give NaN).
gowalla_checkins = (
    gowalla_checkins
    .merge(country_codes, left_on='country', right_on='Code', how='left')
    .drop(columns=['country', 'Code'])
    .rename(columns={'Name': 'country'})
)

brightkite_checkins = (
    brightkite_checkins
    .merge(country_codes, left_on='country', right_on='Code', how='left')
    .drop(columns=['country', 'Code'])
    .rename(columns={'Name': 'country'})
)

In [9]:
brightkite_home_locations.head()

Unnamed: 0,user,lat,lon,country
0,0,39.747913,-104.968944,United States
1,1,37.599899,-122.372723,United States
2,2,39.738874,-104.954143,United States
3,3,37.780875,-122.406171,United States
4,4,60.174186,24.937477,Finland


In [10]:
gowalla_checkins['time'] = pd.to_datetime(gowalla_checkins['time'])
brightkite_checkins['time'] = pd.to_datetime(brightkite_checkins['time'])

In [11]:
brightkite_checkins.head()

Unnamed: 0,user,time,lat,lon,country
0,0,2010-10-17 01:48:53+00:00,39.747652,-104.99251,United States
1,0,2010-10-16 06:02:04+00:00,39.891383,-105.070814,United States
2,0,2010-10-16 03:48:54+00:00,39.891077,-105.068532,United States
3,0,2010-10-14 18:25:51+00:00,39.750469,-104.999073,United States
4,0,2010-10-14 00:21:47+00:00,39.752713,-104.996337,United States


## Step 1 : defining the geographic boundaries of our study

Before starting the study, we have to determine which countries it is relevant to work with. To estimate the quantity of data available in each country, we compute the number of homes (i.e. users), as defined in the replication, that are located in each country. 

The point is to work only on countries in which the amount of data seems to be enough to have robust conclusions. 

In [12]:
brightkite_home_locations['country'] = brightkite_home_locations['country'].str.lower()
brightkite_checkins['country'] = brightkite_checkins['country'].str.lower()

gowalla_home_locations['country'] = gowalla_home_locations['country'].str.lower()
gowalla_checkins['country'] = gowalla_checkins['country'].str.lower()

In [13]:
# Number of homes per country, most represented countries first.
# The non-null 'lat' count is kept as the per-country count.
brightkite_homes_per_country = (
    brightkite_home_locations.groupby(['country']).count()
    .sort_values(by=['lat'], ascending=False)
    .drop(columns=['lon', 'user'])
    .rename(columns={'lat': 'count'})
    .reset_index()
)

gowalla_homes_per_country = (
    gowalla_home_locations.groupby(['country']).count()
    .sort_values(by=['lat'], ascending=False)
    .drop(columns=['lon', 'user'])
    .rename(columns={'lat': 'count'})
    .reset_index()
)

gowalla_homes_per_country.head(10)

Unnamed: 0,country,count
0,united states,50639
1,sweden,18029
2,united kingdom,5435
3,germany,5213
4,norway,3662
5,canada,2210
6,saudi arabia,1917
7,thailand,1660
8,belgium,1569
9,australia,1311


In [14]:
brightkite_homes_per_country.head(10)

Unnamed: 0,country,count
0,united states,30669
1,united kingdom,3243
2,japan,2860
3,australia,1452
4,canada,1409
5,germany,1344
6,sweden,833
7,netherlands,789
8,italy,614
9,norway,593


In [15]:
brightkite_homes_per_country = brightkite_homes_per_country.merge(population, how = 'inner', on = ['country'])
gowalla_homes_per_country = gowalla_homes_per_country.merge(population, how = 'inner', on = ['country'])

In [16]:
brightkite_homes_per_country.head(3)

Unnamed: 0,country,count,population
0,united states,30669,298444215
1,united kingdom,3243,60609153
2,japan,2860,127463611


In [17]:
gowalla_homes_per_country.head(3)

Unnamed: 0,country,count,population
0,united states,50639,298444215
1,sweden,18029,9016596
2,united kingdom,5435,60609153


The number of homes per country is normalized by the population of the country. 

In [18]:
brightkite_homes_per_country['count_normalized'] = brightkite_homes_per_country['count']/brightkite_homes_per_country['population']
gowalla_homes_per_country['count_normalized'] = gowalla_homes_per_country['count']/gowalla_homes_per_country['population']

In [19]:
brightkite_homes_per_country.sort_values(by = 'count_normalized', ascending = False).head(10)

Unnamed: 0,country,count,population,count_normalized
9,norway,593,4610820,0.000129
0,united states,30669,298444215,0.000103
6,sweden,833,9016596,9.2e-05
40,luxembourg,40,474413,8.4e-05
3,australia,1452,20264082,7.2e-05
12,finland,367,5231372,7e-05
1,united kingdom,3243,60609153,5.4e-05
7,netherlands,789,16491461,4.8e-05
4,canada,1409,33098932,4.3e-05
36,estonia,53,1324333,4e-05


In [20]:
gowalla_homes_per_country.sort_values(by = 'count_normalized', ascending = False).head(10)

Unnamed: 0,country,count,population,count_normalized
1,sweden,18029,9016596,0.002
4,norway,3662,4610820,0.000794
29,luxembourg,172,474413,0.000363
0,united states,50639,298444215,0.00017
8,belgium,1569,10379067,0.000151
11,switzerland,994,7523934,0.000132
17,denmark,607,5450661,0.000111
2,united kingdom,5435,60609153,9e-05
21,singapore,392,4492150,8.7e-05
10,netherlands,1259,16491461,7.6e-05


We will only work with countries whose count_normalized value is greater than or equal to a given threshold rate. 

In [21]:
rate = 0.00001
brightkite_countries = brightkite_homes_per_country[brightkite_homes_per_country.count_normalized >= rate]
gowalla_countries = gowalla_homes_per_country[gowalla_homes_per_country.count_normalized >= rate]
print('With this rate,', len(brightkite_countries), 'countries are taken in the study for the Brightkite dataset.')
print('With this rate,', len(gowalla_countries), 'countries are taken in the study for the Gowalla dataset.')

With this rate, 28 countries are taken in the study for the Brightkite dataset.
With this rate, 38 countries are taken in the study for the Gowalla dataset.


In [22]:
countries = pd.merge(brightkite_countries, gowalla_countries, how = 'inner', on = ['country'], suffixes=('_brightkite', '_gowalla')) # countries that are ok both for the Gowalla and Brightkite datasets
print('With this rate,', len(countries), 'countries are taken in the study for both datasets.')

With this rate, 25 countries are taken in the study for both datasets.


In [23]:
countries.head(5)

Unnamed: 0,country,count_brightkite,population_brightkite,count_normalized_brightkite,count_gowalla,population_gowalla,count_normalized_gowalla
0,united states,30669,298444215,0.000103,50639,298444215,0.00017
1,united kingdom,3243,60609153,5.4e-05,5435,60609153,9e-05
2,australia,1452,20264082,7.2e-05,1311,20264082,6.5e-05
3,canada,1409,33098932,4.3e-05,2210,33098932,6.7e-05
4,germany,1344,82422299,1.6e-05,5213,82422299,6.3e-05


## Distance from home

In [24]:
gowalla_home_locations.rename(columns={'lat':'home_lat','lon':'home_lon'}, inplace = True)
brightkite_home_locations.rename(columns={'lat':'home_lat','lon':'home_lon'}, inplace = True)

In [25]:
gowalla_checkins = pd.merge(gowalla_checkins, gowalla_home_locations, how = 'left', on = ['user'], suffixes = ('_checkin', '_home'))
brightkite_checkins = pd.merge(brightkite_checkins, brightkite_home_locations, how = 'left', on = ['user'], suffixes = ('_checkin', '_home'))

In [26]:
import haversine as hvrs

In [27]:
def distance_from_home(row):
    '''Return the haversine distance between a check-in and the user's home.

    `row` must provide 'lat'/'lon' (check-in position) and
    'home_lat'/'home_lon' (home position).
    NOTE(review): the result is in whatever unit hvrs.haversine returns by
    default; the rest of the notebook treats it as kilometres -- confirm.
    '''
    checkin_point = (row['lat'], row['lon'])
    home_point = (row['home_lat'], row['home_lon'])
    return hvrs.haversine(checkin_point, home_point)

In [28]:
# Row-wise distance between every check-in and its user's home.
gowalla_checkins['distance_from_home'] = gowalla_checkins.apply(distance_from_home, axis=1)
brightkite_checkins['distance_from_home'] = brightkite_checkins.apply(distance_from_home, axis=1)

In [29]:
gowalla_checkins.head(3)

Unnamed: 0,user,time,lat,lon,country_checkin,home_lat,home_lon,country_home,distance_from_home
0,0,2010-10-19 23:55:27+00:00,30.235909,-97.79514,united states,30.259167,-97.750223,united states,5.030254
1,0,2010-10-18 22:17:43+00:00,30.269103,-97.749395,united states,30.259167,-97.750223,united states,1.107683
2,0,2010-10-17 23:42:03+00:00,30.255731,-97.763386,united states,30.259167,-97.750223,united states,1.320713


In [30]:
distance = 200 #km
gowalla_far_checkins = gowalla_checkins[gowalla_checkins.distance_from_home >= distance]
brightkite_far_checkins = brightkite_checkins[brightkite_checkins.distance_from_home >= distance]
len(gowalla_far_checkins), len(brightkite_far_checkins)

(942618, 511709)

In [31]:
gowalla_far_checkins.head(3)

Unnamed: 0,user,time,lat,lon,country_checkin,home_lat,home_lon,country_home,distance_from_home
10,0,2010-10-12 00:21:28+00:00,40.643885,-73.782806,united states,30.259167,-97.750223,united states,2447.558851
11,0,2010-10-11 20:21:20+00:00,40.741374,-73.988105,united states,30.259167,-97.750223,united states,2435.235803
12,0,2010-10-11 20:20:42+00:00,40.741388,-73.989455,united states,30.259167,-97.750223,united states,2435.130291


In [32]:
# Number of far check-ins per home country, largest first.
# The non-null 'user' count is kept as the single 'count' column; selecting
# the column by name replaces the former `drop(labels, 1, inplace=True)`,
# whose positional `axis` argument is rejected by recent pandas versions.
gowalla_far_checkins_per_country = (
    gowalla_far_checkins.groupby(['country_home']).count()
    .sort_values(by='user', ascending=False)
    .rename(columns={'user': 'count'})[['count']]
)
gowalla_far_checkins_per_country.head(10)

Unnamed: 0_level_0,count
country_home,Unnamed: 1_level_1
united states,634893
sweden,103646
germany,39033
united kingdom,29169
norway,23863
canada,15475
saudi arabia,12489
australia,7229
france,6170
belgium,4696


In [33]:
# Number of far check-ins per home country, largest first.
# The non-null 'user' count is kept as the single 'count' column; selecting
# the column by name replaces the former `drop(labels, 1, inplace=True)`,
# whose positional `axis` argument is rejected by recent pandas versions.
brightkite_far_checkins_per_country = (
    brightkite_far_checkins.groupby(['country_home']).count()
    .sort_values(by='user', ascending=False)
    .rename(columns={'user': 'count'})[['count']]
)
brightkite_far_checkins_per_country.head(10)

Unnamed: 0_level_0,count
country_home,Unnamed: 1_level_1
united states,358789
japan,41834
united kingdom,15637
australia,10371
germany,10175
sweden,8575
canada,7780
france,6999
spain,4893
norway,4362


# Step 2 : countries where users travelled 

# Languages

In [34]:
languages2['country'] = languages2['country'].str.strip()
languages2['country'] = languages2['country'].str.lower()
languages2['language'] = languages2['language'].str.lower()

In [35]:
languages2.head(20)

Unnamed: 0,country,language
0,afghanistan,dari (official) 77% (dari functions as the lin...
1,albania,albanian 98.8% (official - derived from tosk d...
2,algeria,"arabic (official), french (lingua franca), ber..."
3,andorra,"catalan (official), french, castilian, portuguese"
4,angola,"portuguese 71.2% (official), umbundu 23%, kiko..."
5,antigua and barbuda,"english (official), antiguan creole"
6,argentina,"spanish (official), italian, english, german, ..."
7,armenia,"armenian (official) 97.9%, kurdish (spoken by ..."
8,australia,"english 72.7%, mandarin 2.5%, arabic 1.4%, can..."
9,austria,"german (official nationwide) 88.6%, turkish 2...."


In [36]:
# Reduce each language description to its first word: cut at the first
# space, then at the first comma (handles entries such as "english, ...").
# NOTE(review): the outputs further down show mappings that look wrong or
# inconsistent (e.g. bahamas -> azerbaijani, czech republic -> greek,
# spain -> castilian while argentina -> spanish) -- check the source sheet
# and consider normalising language names before comparing them.
first_token = languages2['language'].str.partition(' ')[0]
languages2['language'] = first_token.str.partition(',')[0]

In [37]:
languages2.head(20)

Unnamed: 0,country,language
0,afghanistan,dari
1,albania,albanian
2,algeria,arabic
3,andorra,catalan
4,angola,portuguese
5,antigua and barbuda,english
6,argentina,spanish
7,armenia,armenian
8,australia,english
9,austria,german


In [38]:
languages2[languages2.country == 'belgium']

Unnamed: 0,country,language
16,belgium,dutch


In [39]:
languages2[languages2.language.str.contains('french') == True]

Unnamed: 0,country,language
18,benin,french
26,burkina faso,french
33,central african republic,french
34,chad,french
39,"congo, democratic republic of the",french
40,"congo, republic of",french
42,côte d'ivoire,french
48,djibouti,french
62,france,french
63,gabon,french


In [40]:
# Attach the language of the home country, then the language of the visited
# country (left joins: countries missing from `languages2` give NaN).
gowalla_trips = (
    gowalla_trips
    .merge(languages2, how='left', left_on='country_home', right_on='country')
    .drop(columns='country')
    .rename(columns={'language': 'language_home'})
    .merge(languages2, how='left', left_on='country_checkin', right_on='country')
    .drop(columns='country')
    .rename(columns={'language': 'language_checkin'})
)

gowalla_trips.head()

Unnamed: 0,country_home,country_checkin,user,language_home,language_checkin
0,afghanistan,belgium,1,dari,dutch
1,afghanistan,united arab emirates,1,dari,arabic
2,afghanistan,united kingdom,1,dari,english
3,albania,hungary,1,albanian,hungarian
4,albania,spain,1,albanian,castilian


In [41]:
# Attach the language of the home country, then the language of the visited
# country (left joins: countries missing from `languages2` give NaN).
brightkite_trips = (
    brightkite_trips
    .merge(languages2, how='left', left_on='country_home', right_on='country')
    .drop(columns='country')
    .rename(columns={'language': 'language_home'})
    .merge(languages2, how='left', left_on='country_checkin', right_on='country')
    .drop(columns='country')
    .rename(columns={'language': 'language_checkin'})
)

brightkite_trips.head()

Unnamed: 0,country_home,country_checkin,user,language_home,language_checkin
0,algeria,france,1,arabic,french
1,angola,portugal,1,portuguese,portuguese
2,argentina,benin,2,spanish,french
3,argentina,brazil,1,spanish,portuguese
4,argentina,czech republic,3,spanish,greek


In [42]:
def same_language(row):
    '''Return 1 if the home and check-in languages of `row` are equal, else 0.

    Missing languages (NaN) never compare equal, so they yield 0.
    '''
    return 1 if row['language_home'] == row['language_checkin'] else 0

In [43]:
# Flag country pairs that share the same (first-listed) language.
gowalla_trips['same_language'] = gowalla_trips.apply(same_language, axis=1)
brightkite_trips['same_language'] = brightkite_trips.apply(same_language, axis=1)

In [44]:
gowalla_trips.head(20)

Unnamed: 0,country_home,country_checkin,user,language_home,language_checkin,same_language
0,afghanistan,belgium,1,dari,dutch,0
1,afghanistan,united arab emirates,1,dari,arabic,0
2,afghanistan,united kingdom,1,dari,english,0
3,albania,hungary,1,albanian,hungarian,0
4,albania,spain,1,albanian,castilian,0
5,argentina,australia,1,spanish,english,0
6,argentina,chile,2,spanish,spanish,1
7,argentina,uruguay,1,spanish,spanish,1
8,australia,argentina,1,english,spanish,0
9,australia,bahamas,1,english,azerbaijani,0


In [45]:
brightkite_trips.head()

Unnamed: 0,country_home,country_checkin,user,language_home,language_checkin,same_language
0,algeria,france,1,arabic,french,0
1,angola,portugal,1,portuguese,portuguese,1
2,argentina,benin,2,spanish,french,0
3,argentina,brazil,1,spanish,portuguese,0
4,argentina,czech republic,3,spanish,greek,0


The distance between two countries is defined as the mean of the distances travelled by users between the two countries (i.e. the distance between the check-in in the foreign country and the user's home location).   

In [46]:
gowalla_far_checkins[(gowalla_far_checkins.country_home == 'united states') & (gowalla_far_checkins.country_checkin == 'brazil')].distance_from_home.mean()

9149.11447292096

In [47]:
def gowalla_distance_between_countries(row):
    '''Return the distance between two countries as defined previously (Gowalla dataset).

    The value is the mean `distance_from_home` over the rows of
    `gowalla_far_checkins` whose home country is row['country_home'] and whose
    check-in country is row['country_checkin'] (NaN when no row matches).
    '''
    from_home = gowalla_far_checkins.country_home == row['country_home']
    to_checkin = gowalla_far_checkins.country_checkin == row['country_checkin']
    matching = gowalla_far_checkins[from_home & to_checkin]
    return matching.distance_from_home.mean()

In [48]:
def brightkite_distance_between_countries(row):
    '''Return the distance between two countries as defined previously (Brightkite dataset).

    The value is the mean `distance_from_home` over the rows of
    `brightkite_far_checkins` whose home country is row['country_home'] and
    whose check-in country is row['country_checkin'] (NaN when no row matches).
    '''
    from_home = brightkite_far_checkins.country_home == row['country_home']
    to_checkin = brightkite_far_checkins.country_checkin == row['country_checkin']
    matching = brightkite_far_checkins[from_home & to_checkin]
    return matching.distance_from_home.mean()

In [50]:
# Mean travelled distance for every (home country, visited country) pair.
gowalla_trips['distance'] = gowalla_trips.apply(gowalla_distance_between_countries, axis=1)

In [51]:
# Mean travelled distance for every (home country, visited country) pair.
brightkite_trips['distance'] = brightkite_trips.apply(brightkite_distance_between_countries, axis=1)

In [52]:
gowalla_trips.sample(5)

Unnamed: 0,country_home,country_checkin,user,language_home,language_checkin,same_language,distance
2095,united kingdom,brazil,6,english,portuguese,0,9365.456214
935,italy,australia,1,italian,english,0,16317.280649
1340,norway,slovenia,2,norwegian,slovene,0,1556.691558
770,holy see (vatican city state),ireland,1,,english,0,1982.839261
1018,japan,switzerland,3,japanese,german,0,9576.93241


In [53]:
# We include the difference in GDP per capita between the departure and arrival countries
# to test, with a regression, whether it is a factor in where people travel.

In [54]:
countries_gdp['country'] = countries_gdp['country'].str.lower()
countries_gdp['country'] = countries_gdp['country'].str.strip()
countries_gdp.head(3) # gdp per capita

Unnamed: 0,country,gdp
0,afghanistan,700.0
1,albania,4500.0
2,algeria,6000.0


In [55]:
gowalla_trips = pd.merge(gowalla_trips, countries_gdp, how = 'left', left_on = 'country_home', right_on = 'country')
gowalla_trips = pd.merge(gowalla_trips, countries_gdp, how = 'left', left_on = 'country_checkin', right_on = 'country', suffixes=('','_arrival'))
gowalla_trips['gdp_difference'] = gowalla_trips['gdp'] - gowalla_trips['gdp_arrival']
gowalla_trips.drop(columns = ['country', 'gdp', 'country_arrival', 'gdp_arrival'], inplace = True)

brightkite_trips = pd.merge(brightkite_trips, countries_gdp, how = 'left', left_on = 'country_home', right_on = 'country')
brightkite_trips = pd.merge(brightkite_trips, countries_gdp, how = 'left', left_on = 'country_checkin', right_on = 'country', suffixes=('','_arrival'))
brightkite_trips['gdp_difference'] = brightkite_trips['gdp'] - brightkite_trips['gdp_arrival']
brightkite_trips.drop(columns = ['country', 'gdp', 'country_arrival', 'gdp_arrival'], inplace = True)

In [56]:
brightkite_trips.sample(3)

Unnamed: 0,country_home,country_checkin,user,language_home,language_checkin,same_language,distance,gdp_difference
1835,switzerland,liberia,1,german,english,0,4724.597369,31700.0
1054,japan,norway,8,japanese,norwegian,0,8405.990317,-9600.0
1863,syrian arab republic,austria,1,,german,0,2461.901479,


In [57]:
gowalla_trips.sample(3)

Unnamed: 0,country_home,country_checkin,user,language_home,language_checkin,same_language,distance,gdp_difference
1075,lao people's democratic republic,malaysia,3,,bahasa,0,1895.336978,
1314,norway,malawi,1,norwegian,english,0,8357.696674,37200.0
1931,switzerland,portugal,6,german,portuguese,0,1680.694553,14700.0


In [59]:
gowalla_trips.to_csv('gowalla_trips_full_infos.csv')
brightkite_trips.to_csv('brightkite_trips_full_infos.csv')

# Linear regression

In [3]:
gowalla_trips_full.head()

Unnamed: 0.1,Unnamed: 0,country_home,country_checkin,user,language_home,language_checkin,same_language,distance,gdp_difference
0,0,afghanistan,belgium,1,dari,dutch,0,5400.722789,-28400.0
1,1,afghanistan,united arab emirates,1,dari,arabic,0,1684.946973,-22500.0
2,2,afghanistan,united kingdom,1,dari,english,0,5707.038313,-27000.0
3,3,albania,hungary,1,albanian,hungarian,0,695.21769,-9400.0
4,4,albania,spain,1,albanian,castilian,0,2143.897129,-17500.0


In [4]:
# Standardise both regressors (zero mean, unit standard deviation).
for feature in ('distance', 'gdp_difference'):
    raw = gowalla_trips_full[feature]
    gowalla_trips_full[feature] = (raw - raw.mean()) / raw.std()

In [10]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
mod = smf.ols(formula='user ~  distance + gdp_difference + C(same_language) \
              + I(distance**2) + I(gdp_difference**2) + I(distance**3) + I(gdp_difference**3) \
              + I(distance**4) + I(gdp_difference**4) + I(distance**5) + I(gdp_difference**5)', data=gowalla_trips_full)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                   user   R-squared:                       0.027
Model:                            OLS   Adj. R-squared:                  0.021
Method:                 Least Squares   F-statistic:                     4.838
Date:                Wed, 16 Dec 2020   Prob (F-statistic):           2.10e-07
Time:                        22:35:07   Log-Likelihood:                -10639.
No. Observations:                1953   AIC:                         2.130e+04
Df Residuals:                    1941   BIC:                         2.137e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 16

In [12]:
# Standardise both regressors (zero mean, unit standard deviation).
for feature in ('distance', 'gdp_difference'):
    raw = brightkite_trips_full[feature]
    brightkite_trips_full[feature] = (raw - raw.mean()) / raw.std()

In [13]:
mod = smf.ols(formula='user ~  distance + gdp_difference + C(same_language) \
              + I(distance**2) + I(gdp_difference**2) + I(distance**3) + I(gdp_difference**3) \
              + I(distance**4) + I(gdp_difference**4) + I(distance**5) + I(gdp_difference**5)', data=brightkite_trips_full)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                   user   R-squared:                       0.028
Model:                            OLS   Adj. R-squared:                  0.023
Method:                 Least Squares   F-statistic:                     5.022
Date:                Wed, 16 Dec 2020   Prob (F-statistic):           9.24e-08
Time:                        22:36:10   Log-Likelihood:                -9412.0
No. Observations:                1916   AIC:                         1.885e+04
Df Residuals:                    1904   BIC:                         1.891e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 11

# Comparison United-States vs Europe

- Average distance 
- Part of the population going out of the country to travel 
- Number of visited countries per traveller  

# Visualization 

In [None]:
import folium
def make_map_with_markers(data, factor):
    """Build a folium world map with one crimson circle per row of `data`.

    Inputs:
    - data: dataframe with one row per marker and (at least) the columns:
        - 'latitude': latitude of the centre of the circle
        - 'longitude': longitude of the centre of the circle
        - 'count_normalized': value the circle radius is proportional to
    - factor: multiplier applied to 'count_normalized' to obtain the radius
              handed to folium.Circle (metres, per the folium documentation).

    Returns the folium.Map; the caller decides whether to display it or to
    save it (e.g. `m.save('mymap.html')`).
    """
    m = folium.Map(location=[20, 0], zoom_start=2)

    # Iterate over the three needed columns in lockstep instead of indexing
    # the frame row by row.
    markers = zip(data['latitude'], data['longitude'], data['count_normalized'])
    for latitude, longitude, value in markers:
        folium.Circle(
            location=[latitude, longitude],
            radius=float(value * factor),
            color='crimson',
            fill=True,
            fill_color='crimson').add_to(m)

    return m

In [None]:
CENTROIDS = DATA_FOLDER+"centroids.xlsx"
centroids= pd.read_excel(CENTROIDS)

In [None]:
centroids=centroids.drop('country',axis=1)
centroids['name']=centroids.name.str.lower()
centroids.head()

In [None]:
b_countries_graph=brightkite_countries.merge(centroids, left_on='country', right_on='name').drop('name',axis=1)
b_countries_graph.sort_values('count_normalized',ascending=False).head(10)

In [None]:
m=make_map_with_markers(b_countries_graph, 3*10**9)
m

In [None]:
g_countries_graph=gowalla_countries.merge(centroids, left_on='country', right_on='name').drop('name',axis=1)
g_countries_graph.sort_values('count_normalized',ascending=False).head(10)

In [None]:
m=make_map_with_markers(g_countries_graph,4*10**8)
m

In [None]:
# To visualize all users' homes in the world: every home gets the same
# weight (1.0) so that all circles share the same radius.
brightkite_home_locations['count_normalized'] = 1.0
# make_map_with_markers expects 'latitude' / 'longitude'. Depending on which
# cells ran before, the coordinates are still called 'lat'/'lon' or were
# already renamed to 'home_lat'/'home_lon'; rename ignores absent labels, so
# both cases are handled.
brightkite_home_locations.rename(
    columns={
        'lat': 'latitude',
        'lon': 'longitude',
        'home_lat': 'latitude',
        'home_lon': 'longitude',
    },
    inplace=True,
)

In [None]:
m=make_map_with_markers(brightkite_home_locations,40000)
m.save('mymap.html')

## Step 3 :  total trips between countries

In [None]:
def not_home_country(row):
    '''Return 1 if the check-in country of `row` differs from the user's home country, else 0.'''
    return 1 if row['country_checkin'] != row['country_home'] else 0

In [None]:
# Flag far check-ins made outside the user's home country.
# The *_far_checkins frames are boolean-mask selections of the check-in
# tables, so adding a column in place triggers pandas'
# SettingWithCopyWarning; `assign` builds a new frame instead.
brightkite_far_checkins = brightkite_far_checkins.assign(
    not_home_country=brightkite_far_checkins.apply(not_home_country, axis=1)
)
gowalla_far_checkins = gowalla_far_checkins.assign(
    not_home_country=gowalla_far_checkins.apply(not_home_country, axis=1)
)

In [None]:
# Keep the far check-ins flagged both as a change of arrival country and as
# being outside the home country.
# NOTE(review): the 'change_arrival_country' column is not created anywhere in
# the visible part of this notebook -- confirm where it is computed.
# NOTE: this rebinds `brightkite_trips`, which the data-loading cell reads
# from BRIGHTKITE_TRIPS.
brightkite_trips = brightkite_far_checkins[(brightkite_far_checkins.change_arrival_country == 1) & (brightkite_far_checkins.not_home_country == 1)]
# One row per (country_home, country_checkin) pair, stored in the index; the
# 'user' column holds the number of matching rows with a non-null user
# (a count of check-in rows, not of distinct users).
country_brightkite_trips = brightkite_trips[['country_home', 'country_checkin', 'user']].groupby(['country_home', 'country_checkin']).count()
# Written to the working directory, with the two index levels as columns.
country_brightkite_trips.to_csv('country_brightkite_trips.csv')

In [None]:
# Keep the far check-ins flagged both as a change of arrival country and as
# being outside the home country.
# NOTE(review): the 'change_arrival_country' column is not created anywhere in
# the visible part of this notebook -- confirm where it is computed.
# NOTE: this rebinds `gowalla_trips`, which the data-loading cell reads
# from GOWALLA_TRIPS.
gowalla_trips = gowalla_far_checkins[(gowalla_far_checkins.change_arrival_country == 1) & (gowalla_far_checkins.not_home_country == 1)]
# One row per (country_home, country_checkin) pair, stored in the index; the
# 'user' column holds the number of matching rows with a non-null user
# (a count of check-in rows, not of distinct users).
country_gowalla_trips = gowalla_trips[['country_home', 'country_checkin', 'user']].groupby(['country_home', 'country_checkin']).count()
# Written to the working directory, with the two index levels as columns.
country_gowalla_trips.to_csv('country_gowalla_trips.csv')

In [None]:
selected_countries = list(countries['country'])
def selected_country(row):
    '''Return 1 if the home country of `row` belongs to `selected_countries`, else 0.'''
    return 1 if row['country_home'] in selected_countries else 0

In [None]:
# Restrict the country-to-country trip tables to well-represented home
# countries.
# reset_index() (without drop=True) turns the (country_home, country_checkin)
# index produced by the groupby into regular columns: selected_country reads
# row['country_home'], and the graph cells read the two countries from
# columns 0 and 1.
testG = country_gowalla_trips.reset_index()
testG['selected_country'] = testG.apply(selected_country, axis=1)
testG = testG[testG['selected_country'] == 1]

testB = country_brightkite_trips.reset_index()
testB['selected_country'] = testB.apply(selected_country, axis=1)
# The Brightkite table is filtered here (the original filtered testG twice).
testB = testB[testB['selected_country'] == 1]

In [None]:
import networkx as nx

In [None]:
# Directed graph of Gowalla trips: one weighted edge per row of testG,
# taken from columns 0 (source), 1 (target) and 2 (weight). Only the part of
# each country name before the first comma is used as node label.
G = nx.DiGraph()
edgesG = [
    (testG.iat[row, 0].split(',')[0], testG.iat[row, 1].split(',')[0], testG.iat[row, 2])
    for row in range(len(testG))
]
G.add_weighted_edges_from(edgesG)

In [None]:
# Directed graph of Brightkite trips: one weighted edge per row of testB,
# taken from columns 0 (source), 1 (target) and 2 (weight). Only the part of
# each country name before the first comma is used as node label.
B = nx.DiGraph()
weights = []  # not used below; kept so the cell defines the same names
edgesB = [
    (testB.iat[row, 0].split(',')[0], testB.iat[row, 1].split(',')[0], testB.iat[row, 2])
    for row in range(len(testB))
]
B.add_weighted_edges_from(edgesB)

In [None]:
nx.write_weighted_edgelist(G, 'GowallaTrips.edgelist', delimiter = ',')

In [None]:
nx.write_weighted_edgelist(B, 'BrightkiteTrips.edgelist', delimiter = ',')

In [None]:
H = nx.read_weighted_edgelist('GowallaTrips.edgelist', delimiter = ',', create_using=nx.DiGraph())
H.edges(data = True)

In [None]:
edge_width = [H[u][v]['weight'] for u, v in H.edges()] 
nx.draw_networkx(H,  width = edge_width)

In [None]:
# Selecting the biggest flows: country pairs with more than 200 trips.
bigG = country_gowalla_trips[country_gowalla_trips['user'] > 200]
G2 = nx.DiGraph()
# country_gowalla_trips is indexed by (country_home, country_checkin), so the
# two countries come from the index and the weight from the 'user' column.
# The edges are collected in edgesG2 (the original appended to edgesG, which
# left G2 empty). Only the part of each name before the first comma is kept,
# as in the other graph cells.
edgesG2 = [
    (home.split(',')[0], checkin.split(',')[0], weight)
    for (home, checkin), weight in bigG['user'].items()
]
G2.add_weighted_edges_from(edgesG2)
# NOTE(review): raw counts (> 200) are used directly as line widths, which
# will draw very thick edges -- consider rescaling before plotting.
edge_width = [G2[u][v]['weight'] for u, v in G2.edges()]
nx.draw_networkx(G2, width=edge_width)