# Milestone P4 : Creative Extension

## Libraries

In [1]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import datetime
import math
import matplotlib.image as mpimg
%matplotlib inline

## The data

In [65]:
DATA_FOLDER = 'data/'

GOWALLA_CHECKINS_DATASET = DATA_FOLDER+"G_loc_country.csv"
BRIGHTKITE_CHECKINS_DATASET = DATA_FOLDER+"B_loc_country.csv"
POPULATION_DATASET = DATA_FOLDER+"pop_per_country.csv"
BRIGHTKITE_HOME_LOCATIONS = DATA_FOLDER+"new_home_loc_B_country.csv"
GOWALLA_HOME_LOCATIONS = DATA_FOLDER+"new_home_loc_G_country.csv"
COUNTRY_CODES = DATA_FOLDER+"countrycodes.csv"
LANGUAGE_COUNTRIES = DATA_FOLDER+"language_list_by_country.xlsx"
LANGUAGE_COUNTRIES_BIS = DATA_FOLDER+"languages.xlsx"
BRIGHTKITE_TRIPS = DATA_FOLDER+"country_brightkite_trips.csv"
GOWALLA_TRIPS = DATA_FOLDER+"country_gowalla_trips.csv"

# Two-letter country codes must be read as plain text. With pandas' default
# NA handling a code such as "NA" (Namibia) is parsed as a missing value, and
# because `merge` pairs NaN keys with each other, every row without a country
# would later be matched to that NaN entry of the code table.
# Only empty fields are treated as missing in the files that carry a code.
# NOTE(review): assumes missing values in these CSV files are stored as empty
# fields (the pandas `to_csv` default) - confirm against the raw files.
COUNTRY_CODE_NA_HANDLING = {'keep_default_na': False, 'na_values': ['']}

# Check-ins: only columns 1-5 of the files are loaded.
gowalla_checkins = pd.read_csv(GOWALLA_CHECKINS_DATASET, delimiter = ',', usecols = [1,2,3,4,5], **COUNTRY_CODE_NA_HANDLING)
brightkite_checkins = pd.read_csv(BRIGHTKITE_CHECKINS_DATASET, delimiter = ',', usecols = [1,2,3,4,5], **COUNTRY_CODE_NA_HANDLING)
# Population per country; the file's own header row is replaced by `names`.
population = pd.read_csv(POPULATION_DATASET, index_col = 0, names = ['country', 'population'], header = 0)
# Home location of each user, with the country code of that location.
brightkite_home_locations = pd.read_csv(BRIGHTKITE_HOME_LOCATIONS, sep =',', **COUNTRY_CODE_NA_HANDLING)
gowalla_home_locations = pd.read_csv(GOWALLA_HOME_LOCATIONS, sep =',', **COUNTRY_CODE_NA_HANDLING)
# Lookup table between country names and country codes.
country_codes = pd.read_csv(COUNTRY_CODES, sep =',', **COUNTRY_CODE_NA_HANDLING)
# Language description per country (two alternative sources).
languages = pd.read_excel(LANGUAGE_COUNTRIES, names = ['country', 'language'])
languages2 = pd.read_excel(LANGUAGE_COUNTRIES_BIS, names = ['country', 'language'])
# Trip tables (home country / visited country / user count columns are used later on).
brightkite_trips = pd.read_csv(BRIGHTKITE_TRIPS)
gowalla_trips = pd.read_csv(GOWALLA_TRIPS)

## Cleaning the data

In [3]:
population.sort_values(by = ['population'], ascending = False).head(10)

Unnamed: 0,country,population
24,china,1313973713
51,india,1095351995
127,united states,298444215
52,indonesia,245452739
15,brazil,188078227
93,pakistan,165803560
9,bangladesh,147365352
101,russia,142893540
91,nigeria,131859731
59,japan,127463611


In [4]:
brightkite_home_locations.head(3)

Unnamed: 0,user,lat,lon,country
0,0,39.747913,-104.968944,US
1,1,37.599899,-122.372723,US
2,2,39.738874,-104.954143,US


In [5]:
gowalla_home_locations.head(3)

Unnamed: 0,user,lat,lon,country
0,0,30.259167,-97.750223,US
1,1,50.381005,3.43071,FR
2,2,34.052243,-118.327641,US


In [6]:
country_codes.head()

Unnamed: 0,Name,Code
0,Afghanistan,AF
1,Åland Islands,AX
2,Albania,AL
3,Algeria,DZ
4,American Samoa,AS


In [7]:
# Swap the country code of every home location for the full country name:
# left-join on the code, discard both code columns, keep the name as 'country'.
gowalla_home_locations = (
    gowalla_home_locations
    .merge(country_codes, how='left', left_on='country', right_on='Code')
    .drop(columns=['country', 'Code'])
    .rename(columns={'Name': 'country'})
)

brightkite_home_locations = (
    brightkite_home_locations
    .merge(country_codes, how='left', left_on='country', right_on='Code')
    .drop(columns=['country', 'Code'])
    .rename(columns={'Name': 'country'})
)

In [8]:
# Same code -> name substitution as for the home locations, applied to the check-ins.
gowalla_checkins = (
    gowalla_checkins
    .merge(country_codes, how='left', left_on='country', right_on='Code')
    .drop(columns=['country', 'Code'])
    .rename(columns={'Name': 'country'})
)

brightkite_checkins = (
    brightkite_checkins
    .merge(country_codes, how='left', left_on='country', right_on='Code')
    .drop(columns=['country', 'Code'])
    .rename(columns={'Name': 'country'})
)

In [9]:
brightkite_home_locations.head()

Unnamed: 0,user,lat,lon,country
0,0,39.747913,-104.968944,United States
1,1,37.599899,-122.372723,United States
2,2,39.738874,-104.954143,United States
3,3,37.780875,-122.406171,United States
4,4,60.174186,24.937477,Finland


In [10]:
# Parse the check-in timestamps into pandas datetimes, for both datasets.
for _checkins in (gowalla_checkins, brightkite_checkins):
    _checkins['time'] = pd.to_datetime(_checkins['time'])

In [11]:
brightkite_checkins.head()

Unnamed: 0,user,time,lat,lon,country
0,0,2010-10-17 01:48:53+00:00,39.747652,-104.99251,United States
1,0,2010-10-16 06:02:04+00:00,39.891383,-105.070814,United States
2,0,2010-10-16 03:48:54+00:00,39.891077,-105.068532,United States
3,0,2010-10-14 18:25:51+00:00,39.750469,-104.999073,United States
4,0,2010-10-14 00:21:47+00:00,39.752713,-104.996337,United States


## Step 1 : defining the geographic boundaries of our study

Before starting the study, we have to determine which countries are relevant to work with. To estimate the quantity of data available in each country, we compute the number of homes (i.e. users), as defined in the replication, that are located in each country. 

The point is to work only on countries in which the amount of data seems to be enough to have robust conclusions. 

In [12]:
# Lower-case the country names everywhere so that they can be joined with the
# other tables, which use lower-case names.
for _frame in (brightkite_home_locations, brightkite_checkins,
               gowalla_home_locations, gowalla_checkins):
    _frame['country'] = _frame['country'].str.lower()

In [13]:
# Number of homes (non-null 'lat' values) per country, largest first.
# The other counted columns are dropped and the counter is exposed as 'count'.
brightkite_homes_per_country = (
    brightkite_home_locations
    .groupby(['country'])
    .count()
    .sort_values(by=['lat'], ascending=False)
    .drop(columns=['lon', 'user'])
    .rename(columns={'lat': 'count'})
    .reset_index()
)

gowalla_homes_per_country = (
    gowalla_home_locations
    .groupby(['country'])
    .count()
    .sort_values(by=['lat'], ascending=False)
    .drop(columns=['lon', 'user'])
    .rename(columns={'lat': 'count'})
    .reset_index()
)

gowalla_homes_per_country.head(10)

Unnamed: 0,country,count
0,united states,50639
1,sweden,18029
2,united kingdom,5435
3,germany,5213
4,norway,3662
5,canada,2210
6,saudi arabia,1917
7,thailand,1660
8,belgium,1569
9,australia,1311


In [14]:
brightkite_homes_per_country.head(10)

Unnamed: 0,country,count
0,united states,30669
1,united kingdom,3243
2,japan,2860
3,australia,1452
4,canada,1409
5,germany,1344
6,sweden,833
7,netherlands,789
8,italy,614
9,norway,593


In [15]:
# Attach the population of each country; countries absent from the population
# table are discarded by the inner join.
brightkite_homes_per_country = pd.merge(brightkite_homes_per_country, population, on=['country'], how='inner')
gowalla_homes_per_country = pd.merge(gowalla_homes_per_country, population, on=['country'], how='inner')

In [16]:
brightkite_homes_per_country.head(3)

Unnamed: 0,country,count,population
0,united states,30669,298444215
1,united kingdom,3243,60609153
2,japan,2860,127463611


In [17]:
gowalla_homes_per_country.head(3)

Unnamed: 0,country,count,population
0,united states,50639,298444215
1,sweden,18029,9016596
2,united kingdom,5435,60609153


The number of homes per country is normalized by the population of the country. 

In [18]:
# Homes per inhabitant: home count divided by the country's population.
brightkite_homes_per_country['count_normalized'] = brightkite_homes_per_country['count'].div(brightkite_homes_per_country['population'])
gowalla_homes_per_country['count_normalized'] = gowalla_homes_per_country['count'].div(gowalla_homes_per_country['population'])

In [19]:
brightkite_homes_per_country.sort_values(by = 'count_normalized', ascending = False).head(10)

Unnamed: 0,country,count,population,count_normalized
9,norway,593,4610820,0.000129
0,united states,30669,298444215,0.000103
6,sweden,833,9016596,9.2e-05
40,luxembourg,40,474413,8.4e-05
3,australia,1452,20264082,7.2e-05
12,finland,367,5231372,7e-05
1,united kingdom,3243,60609153,5.4e-05
7,netherlands,789,16491461,4.8e-05
4,canada,1409,33098932,4.3e-05
36,estonia,53,1324333,4e-05


In [20]:
gowalla_homes_per_country.sort_values(by = 'count_normalized', ascending = False).head(10)

Unnamed: 0,country,count,population,count_normalized
1,sweden,18029,9016596,0.002
4,norway,3662,4610820,0.000794
29,luxembourg,172,474413,0.000363
0,united states,50639,298444215,0.00017
8,belgium,1569,10379067,0.000151
11,switzerland,994,7523934,0.000132
17,denmark,607,5450661,0.000111
2,united kingdom,5435,60609153,9e-05
21,singapore,392,4492150,8.7e-05
10,netherlands,1259,16491461,7.6e-05


We will only work with countries whose count_normalized value is at least a given rate. 

In [21]:
# Minimum number of homes per inhabitant for a country to be kept.
rate = 0.00001
brightkite_countries = brightkite_homes_per_country.loc[lambda homes: homes['count_normalized'] >= rate]
gowalla_countries = gowalla_homes_per_country.loc[lambda homes: homes['count_normalized'] >= rate]
print('With this rate,', len(brightkite_countries), 'countries are taken in the study for the Brightkite dataset.')
print('With this rate,', len(gowalla_countries), 'countries are taken in the study for the Gowalla dataset.')

With this rate, 28 countries are taken in the study for the Brightkite dataset.
With this rate, 38 countries are taken in the study for the Gowalla dataset.


In [22]:
# Countries that pass the rate threshold in both the Brightkite and the Gowalla datasets
countries = brightkite_countries.merge(
    gowalla_countries,
    on=['country'],
    how='inner',
    suffixes=('_brightkite', '_gowalla'),
)
print('With this rate,', len(countries), 'countries are taken in the study for both datasets.')

With this rate, 25 countries are taken in the study for both datasets.


In [23]:
countries.head(5)

Unnamed: 0,country,count_brightkite,population_brightkite,count_normalized_brightkite,count_gowalla,population_gowalla,count_normalized_gowalla
0,united states,30669,298444215,0.000103,50639,298444215,0.00017
1,united kingdom,3243,60609153,5.4e-05,5435,60609153,9e-05
2,australia,1452,20264082,7.2e-05,1311,20264082,6.5e-05
3,canada,1409,33098932,4.3e-05,2210,33098932,6.7e-05
4,germany,1344,82422299,1.6e-05,5213,82422299,6.3e-05


## Distance from home

In [24]:
# Prefix the home coordinates so they cannot be confused with the check-in coordinates.
gowalla_home_locations = gowalla_home_locations.rename(columns={'lat': 'home_lat', 'lon': 'home_lon'})
brightkite_home_locations = brightkite_home_locations.rename(columns={'lat': 'home_lat', 'lon': 'home_lon'})

In [25]:
# Attach every user's home location to each of their check-ins. The 'country'
# column exists on both sides and becomes 'country_checkin' / 'country_home'.
gowalla_checkins = gowalla_checkins.merge(
    gowalla_home_locations, on=['user'], how='left', suffixes=('_checkin', '_home'))
brightkite_checkins = brightkite_checkins.merge(
    brightkite_home_locations, on=['user'], how='left', suffixes=('_checkin', '_home'))

In [26]:
import haversine as hvrs

In [27]:
def distance_from_home(row):
    '''Return the haversine distance between a check-in and the user's home.

    `row` must expose 'lat' / 'lon' (check-in position) and
    'home_lat' / 'home_lon' (home position). The result is the value returned
    by `hvrs.haversine` for these two (latitude, longitude) pairs.
    NOTE(review): presumably kilometres (the threshold used later is commented
    as km) - confirm against the haversine package's default unit.
    '''
    checkin_position = (row['lat'], row['lon'])
    home_position = (row['home_lat'], row['home_lon'])
    return hvrs.haversine(checkin_position, home_position)

In [None]:
# Row by row, compute how far each check-in is from its user's home.
gowalla_checkins['distance_from_home'] = gowalla_checkins.apply(distance_from_home, axis=1)
brightkite_checkins['distance_from_home'] = brightkite_checkins.apply(distance_from_home, axis=1)

In [None]:
gowalla_checkins.head(3)

In [None]:
# Minimum distance from home for a check-in to count as "far".
distance = 200 #km
# Explicit copies: columns are added to these frames afterwards, and writing to
# a filtered selection of the original frame triggers SettingWithCopyWarning.
gowalla_far_checkins = gowalla_checkins[gowalla_checkins.distance_from_home >= distance].copy()
brightkite_far_checkins = brightkite_checkins[brightkite_checkins.distance_from_home >= distance].copy()
len(gowalla_far_checkins), len(brightkite_far_checkins)

In [None]:
gowalla_far_checkins.head(3)

In [None]:
# Number of far check-ins per home country (non-null 'user' values), largest first.
gowalla_far_checkins_per_country = gowalla_far_checkins.groupby(['country_home']).count().sort_values(by = 'user', ascending = False)
gowalla_far_checkins_per_country.rename(columns = {'user':'count'}, inplace = True)
# Keep only the counter. `columns=` is used because pandas 2.0 no longer
# accepts the axis of `drop` as a positional argument.
gowalla_far_checkins_per_country.drop(columns = gowalla_far_checkins_per_country.columns.difference(['count']), inplace = True)
gowalla_far_checkins_per_country.head(10)

In [None]:
# Number of far check-ins per home country (non-null 'user' values), largest first.
brightkite_far_checkins_per_country = brightkite_far_checkins.groupby(['country_home']).count().sort_values(by = 'user', ascending = False)
brightkite_far_checkins_per_country.rename(columns = {'user':'count'}, inplace = True)
# Keep only the counter. `columns=` is used because pandas 2.0 no longer
# accepts the axis of `drop` as a positional argument.
brightkite_far_checkins_per_country.drop(columns = brightkite_far_checkins_per_country.columns.difference(['count']), inplace = True)
brightkite_far_checkins_per_country.head(10)

# Step 2 : countries where users travelled 

In [None]:
def _neighbouring_country(far_checkins, periods):
    '''Return the country of the same user's check-in `periods` steps earlier in time.

    The rows are first ordered by 'time' (stable sort) and then shifted within
    each 'user' group, so that:
      * the neighbour is chronological whatever the row order of the frame
        (a plain `shift` follows file order, which may be newest-first);
      * a value never leaks from one user to the next one.
    `periods = 1` gives the previous check-in, `periods = -1` the next one.
    The first / last check-in of a user gets NaN.
    The returned Series keeps the index labels of `far_checkins`.
    '''
    chronological = far_checkins.sort_values('time', kind = 'mergesort')
    return chronological.groupby('user')['country_checkin'].shift(periods)


# `assign` aligns the shifted values on the index, so the row order of the
# frames is left untouched, and it returns new frames instead of writing into
# a filtered selection.
gowalla_far_checkins = gowalla_far_checkins.assign(
    previous_country_checkin = _neighbouring_country(gowalla_far_checkins, 1),
    next_country_checkin = _neighbouring_country(gowalla_far_checkins, -1),
)
brightkite_far_checkins = brightkite_far_checkins.assign(
    previous_country_checkin = _neighbouring_country(brightkite_far_checkins, 1),
    next_country_checkin = _neighbouring_country(brightkite_far_checkins, -1),
)

In [None]:
brightkite_far_checkins[3009:3013]

In [None]:
def change_arrival_country(row):
    '''Arrival flag: return 1 if the country of this check-in differs from the
    country of the previous check-in, and 0 otherwise.

    `row` must expose 'country_checkin' and 'previous_country_checkin'.
    A missing (NaN) value never compares equal, so it yields 1.
    '''
    arrived_elsewhere = row['country_checkin'] != row['previous_country_checkin']
    return 1 if arrived_elsewhere else 0

In [None]:
def change_departure_country(row):
    '''Departure flag: return 1 if the country of this check-in differs from the
    country of the next check-in, and 0 otherwise.

    `row` must expose 'country_checkin' and 'next_country_checkin'.
    A missing (NaN) value never compares equal, so it yields 1.
    '''
    leaves_country = row['country_checkin'] != row['next_country_checkin']
    return 1 if leaves_country else 0

In [None]:
brightkite_far_checkins[3010:3014]

In [None]:
# The filters below rely on a 'change_arrival_country' column, so it is built
# here from the row-wise helper of the same name (1 when the country differs
# from the previous check-in's country). `assign` returns new frames.
brightkite_far_checkins = brightkite_far_checkins.assign(
    change_arrival_country = brightkite_far_checkins.apply(change_arrival_country, axis = 1))
gowalla_far_checkins = gowalla_far_checkins.assign(
    change_arrival_country = gowalla_far_checkins.apply(change_arrival_country, axis = 1))

# Number of arrivals per country of check-in (non-null 'user' values), largest first.
brightkite_arrivals = brightkite_far_checkins[brightkite_far_checkins['change_arrival_country'] == 1].groupby(['country_checkin']).count().sort_values(by = 'user', ascending = False)
brightkite_arrivals.rename(columns = {'user':'count'}, inplace = True)
# Keep only the counter; pandas 2.0 no longer accepts the axis of `drop` positionally.
brightkite_arrivals.drop(columns = brightkite_arrivals.columns.difference(['count']), inplace = True)

gowalla_arrivals = gowalla_far_checkins[gowalla_far_checkins['change_arrival_country'] == 1].groupby(['country_checkin']).count().sort_values(by = 'user', ascending = False)
gowalla_arrivals.rename(columns = {'user':'count'}, inplace = True)
gowalla_arrivals.drop(columns = gowalla_arrivals.columns.difference(['count']), inplace = True)

In [None]:
# The filters below rely on a 'change_departure_country' column, so it is built
# here from the row-wise helper of the same name (1 when the country differs
# from the next check-in's country). `assign` returns new frames.
brightkite_far_checkins = brightkite_far_checkins.assign(
    change_departure_country = brightkite_far_checkins.apply(change_departure_country, axis = 1))
gowalla_far_checkins = gowalla_far_checkins.assign(
    change_departure_country = gowalla_far_checkins.apply(change_departure_country, axis = 1))

# Number of departures per country of check-in (non-null 'user' values), largest first.
brightkite_departures = brightkite_far_checkins[brightkite_far_checkins['change_departure_country'] == 1].groupby(['country_checkin']).count().sort_values(by = 'user', ascending = False)
brightkite_departures.rename(columns = {'user':'count'}, inplace = True)
# Keep only the counter; pandas 2.0 no longer accepts the axis of `drop` positionally.
brightkite_departures.drop(columns = brightkite_departures.columns.difference(['count']), inplace = True)

gowalla_departures = gowalla_far_checkins[gowalla_far_checkins['change_departure_country'] == 1].groupby(['country_checkin']).count().sort_values(by = 'user', ascending = False)
gowalla_departures.rename(columns = {'user':'count'}, inplace = True)
gowalla_departures.drop(columns = gowalla_departures.columns.difference(['count']), inplace = True)

In [None]:
brightkite_arrivals.head(20)

In [None]:
brightkite_departures.head(20)

In [None]:
# Example for user 15: countries of the far check-ins flagged as an arrival whose country is not the user's home country.
list(brightkite_far_checkins[(brightkite_far_checkins.change_arrival_country == 1) & (brightkite_far_checkins.user == 15) & (brightkite_far_checkins.country_checkin != brightkite_far_checkins.country_home)].country_checkin)

In [None]:
def brightkite_visited_countries(row):
    '''Return the list of countries visited by a user (Brightkite dataset).

    `row` must expose 'user'. The list holds the 'country_checkin' value of
    every row of `brightkite_far_checkins` that belongs to this user, is
    flagged as an arrival and lies outside the user's home country, in the
    row order of that frame (a country may appear several times).
    '''
    checkins = brightkite_far_checkins
    is_arrival = checkins['change_arrival_country'] == 1  # there is a change of country
    is_this_user = checkins['user'] == row['user']  # for the given user
    is_abroad = checkins['country_checkin'] != checkins['country_home']  # not the home country
    return list(checkins.loc[is_arrival & is_this_user & is_abroad, 'country_checkin'])

In [None]:
def gowalla_visited_countries(row):
    '''Return the list of countries visited by a user (Gowalla dataset).

    `row` must expose 'user'. The list holds the 'country_checkin' value of
    every row of `gowalla_far_checkins` that belongs to this user, is flagged
    as an arrival and lies outside the user's home country, in the row order
    of that frame (a country may appear several times).
    '''
    checkins = gowalla_far_checkins
    is_arrival = checkins['change_arrival_country'] == 1  # there is a change of country
    is_this_user = checkins['user'] == row['user']  # for the given user
    is_abroad = checkins['country_checkin'] != checkins['country_home']  # not the home country
    return list(checkins.loc[is_arrival & is_this_user & is_abroad, 'country_checkin'])

In [None]:
# Visited countries of every Gowalla user, built in a single pass.
# Filtering the whole check-in table once per user costs (users x rows)
# comparisons; grouping the foreign arrivals by user gives the same lists
# (same content, same order) in one scan.
_gowalla_foreign_arrivals = gowalla_far_checkins[
    (gowalla_far_checkins['change_arrival_country'] == 1)  # there is a change of country
    & (gowalla_far_checkins['country_checkin'] != gowalla_far_checkins['country_home'])  # outside the home country
]
_gowalla_countries_by_user = {
    user: list(visited)
    for user, visited in _gowalla_foreign_arrivals.groupby('user')['country_checkin']
}

gowalla_users_visited_countries = pd.DataFrame({'user':gowalla_far_checkins.user.unique()})
# Users without any foreign arrival get an empty list.
gowalla_users_visited_countries['visited_countries'] = [
    _gowalla_countries_by_user.get(user, []) for user in gowalla_users_visited_countries['user']
]
gowalla_users_visited_countries.head()

In [None]:
# Visited countries of every Brightkite user, built in a single pass.
# Filtering the whole check-in table once per user costs (users x rows)
# comparisons; grouping the foreign arrivals by user gives the same lists
# (same content, same order) in one scan.
_brightkite_foreign_arrivals = brightkite_far_checkins[
    (brightkite_far_checkins['change_arrival_country'] == 1)  # there is a change of country
    & (brightkite_far_checkins['country_checkin'] != brightkite_far_checkins['country_home'])  # outside the home country
]
_brightkite_countries_by_user = {
    user: list(visited)
    for user, visited in _brightkite_foreign_arrivals.groupby('user')['country_checkin']
}

brightkite_users_visited_countries = pd.DataFrame({'user':brightkite_far_checkins.user.unique()})
# Users without any foreign arrival get an empty list.
brightkite_users_visited_countries['visited_countries'] = [
    _brightkite_countries_by_user.get(user, []) for user in brightkite_users_visited_countries['user']
]
brightkite_users_visited_countries.head()

# Languages

In [66]:
# Normalise the text columns: country names are trimmed and lower-cased,
# language descriptions are lower-cased.
languages2['country'] = languages2['country'].str.strip().str.lower()
languages2['language'] = languages2['language'].str.lower()

In [67]:
languages2.head(20)

Unnamed: 0,country,language
0,afghanistan,afghan persian or dari (official) 77% (dari fu...
1,albania,albanian 98.8% (official - derived from tosk d...
2,algeria,"arabic (official), french (lingua franca), ber..."
3,andorra,"catalan (official), french, castilian, portuguese"
4,angola,"portuguese 71.2% (official), umbundu 23%, kiko..."
5,antigua and barbuda,"english (official), antiguan creole"
6,argentina,"spanish (official), italian, english, german, ..."
7,armenia,"armenian (official) 97.9%, kurdish (spoken by ..."
8,australia,"english 72.7%, mandarin 2.5%, arabic 1.4%, can..."
9,austria,"german (official nationwide) 88.6%, turkish 2...."


In [68]:
# Reduce each language description to its first word: cut at the first space,
# then at the first comma of what is left.
languages2['language'] = (
    languages2['language']
    .str.partition(' ')[0]
    .str.partition(',')[0]
)

In [69]:
languages2.head(20)

Unnamed: 0,country,language
0,afghanistan,afghan
1,albania,albanian
2,algeria,arabic
3,andorra,catalan
4,angola,portuguese
5,antigua and barbuda,english
6,argentina,spanish
7,armenia,armenian
8,australia,english
9,austria,german


In [70]:
languages2[languages2.country == 'belgium']

Unnamed: 0,country,language
16,belgium,dutch


In [71]:
languages2[languages2.language.str.contains('french') == True]

Unnamed: 0,country,language
18,benin,french
26,burkina faso,french
33,central african republic,french
34,chad,french
39,"congo, democratic republic of the",french
40,"congo, republic of",french
42,côte d'ivoire,french
48,djibouti,french
62,france,french
63,gabon,french


In [72]:
# Add the language of the home country, then the language of the visited
# country. After each left join the redundant 'country' key is removed and
# 'language' is renamed to the side it describes.
gowalla_trips = (
    gowalla_trips
    .merge(languages2, how='left', left_on='country_home', right_on='country')
    .drop(columns='country')
    .rename(columns={'language': 'language_home'})
    .merge(languages2, how='left', left_on='country_checkin', right_on='country')
    .drop(columns='country')
    .rename(columns={'language': 'language_checkin'})
)

gowalla_trips.head()

Unnamed: 0,country_home,country_checkin,user,language_home,language_checkin
0,afghanistan,belgium,1,afghan,dutch
1,afghanistan,united arab emirates,1,afghan,arabic
2,afghanistan,united kingdom,1,afghan,english
3,albania,hungary,1,albanian,hungarian
4,albania,spain,1,albanian,castilian


In [77]:
# Add the language of the home country, then the language of the visited
# country. After each left join the redundant 'country' key is removed and
# 'language' is renamed to the side it describes.
brightkite_trips = (
    brightkite_trips
    .merge(languages2, how='left', left_on='country_home', right_on='country')
    .drop(columns='country')
    .rename(columns={'language': 'language_home'})
    .merge(languages2, how='left', left_on='country_checkin', right_on='country')
    .drop(columns='country')
    .rename(columns={'language': 'language_checkin'})
)

brightkite_trips.head()

Unnamed: 0,country_home,country_checkin,user,language_home,language_checkin
0,algeria,france,1,arabic,french
1,angola,portugal,1,portuguese,portuguese
2,argentina,benin,2,spanish,french
3,argentina,brazil,1,spanish,portuguese
4,argentina,czech republic,3,spanish,greek


In [73]:
def same_language(row):
    '''Return 1 if the home language equals the language of the visited country, else 0.

    `row` must expose 'language_home' and 'language_checkin'.
    A missing (NaN) language never compares equal, so it yields 0.
    '''
    return 1 if row['language_home'] == row['language_checkin'] else 0

In [78]:
# Flag, trip by trip, whether the home and visited countries share the same language.
gowalla_trips['same_language'] = gowalla_trips.apply(same_language, axis=1)
brightkite_trips['same_language'] = brightkite_trips.apply(same_language, axis=1)

In [76]:
gowalla_trips.head(20)

Unnamed: 0,country_home,country_checkin,user,language_home,language_checkin,same_language
0,afghanistan,belgium,1,afghan,dutch,0
1,afghanistan,united arab emirates,1,afghan,arabic,0
2,afghanistan,united kingdom,1,afghan,english,0
3,albania,hungary,1,albanian,hungarian,0
4,albania,spain,1,albanian,castilian,0
5,argentina,australia,1,spanish,english,0
6,argentina,chile,2,spanish,spanish,1
7,argentina,uruguay,1,spanish,spanish,1
8,australia,argentina,1,english,spanish,0
9,australia,bahamas,1,english,azerbaijani,0


In [79]:
brightkite_trips.head()

Unnamed: 0,country_home,country_checkin,user,language_home,language_checkin,same_language
0,algeria,france,1,arabic,french,0
1,angola,portugal,1,portuguese,portuguese,1
2,argentina,benin,2,spanish,french,0
3,argentina,brazil,1,spanish,portuguese,0
4,argentina,czech republic,3,spanish,greek,0
