In [None]:
!pip install pycountry
!pip install pickle5

Collecting pycountry
[?25l  Downloading https://files.pythonhosted.org/packages/76/73/6f1a412f14f68c273feea29a6ea9b9f1e268177d32e0e69ad6790d306312/pycountry-20.7.3.tar.gz (10.1MB)
[K     |████████████████████████████████| 10.1MB 12.2MB/s 
[?25hBuilding wheels for collected packages: pycountry
  Building wheel for pycountry (setup.py) ... [?25l[?25hdone
  Created wheel for pycountry: filename=pycountry-20.7.3-py2.py3-none-any.whl size=10746864 sha256=8024a2d7e7cfb357d46ec665b7fc604d68ce05853f54389c5c27b5de2bd4c36f
  Stored in directory: /root/.cache/pip/wheels/33/4e/a6/be297e6b83567e537bed9df4a93f8590ec01c1acfbcd405348
Successfully built pycountry
Installing collected packages: pycountry
Successfully installed pycountry-20.7.3
Collecting pickle5
[?25l  Downloading https://files.pythonhosted.org/packages/f7/4c/5c4dd0462c8d3a6bc4af500a6af240763c2ebd1efdc736fc2c946d44b70a/pickle5-0.0.11.tar.gz (132kB)
[K     |████████████████████████████████| 133kB 14.9MB/s 
[?25hBuilding wheels fo

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pycountry
import plotly.graph_objs as go
import plotly.express as px

import pickle5 as pickle
import os

We load the dataframe containing each pair of friends (Gowalla and Brightkite together), along with the country of each of the two users.


In [None]:
# Read the pickled friendship table from the local data directory.
# NOTE(review): unpickling executes arbitrary code; only load files you trust.
with open('./data/friends_countries.pkl', 'rb') as pkl_file:
    friends_countries = pickle.load(pkl_file)

In [None]:
# Preview the first five friendship rows.
friends_countries.head(n=5)

Unnamed: 0,user1,country_1,user2,country_2
0,5_g,US,0_g,US
1,7_g,US,0_g,US
2,19_g,US,0_g,US
3,26_g,US,0_g,US
4,27_g,US,0_g,US


For the users of a given country, we want to see the countries in which most of their friends live.

First we get the number of friendships between each pair of countries:

In [None]:
# Collapse the per-friendship rows into one row per (country_1, country_2) pair.
# After count() both user columns hold the same group size, so one of them is
# dropped and the other becomes the friendship count.
friends_countries = (
    friends_countries
    .groupby(by=['country_1', 'country_2'])
    .count()
    .reset_index()
    .drop(columns=['user1'])
    .sort_values(by=['country_1', 'user2'], ascending=False)
    .rename(columns={'user2': 'number_friends'})
)
friends_countries.head(15)

Unnamed: 0,country_1,country_2,number_friends
3893,ZW,US,3
3892,ZW,CH,1
3891,ZM,US,77
3880,ZM,CA,3
3885,ZM,FR,2
3879,ZM,AU,1
3881,ZM,CN,1
3882,ZM,DE,1
3883,ZM,DK,1
3884,ZM,FI,1


We want to get the probability that a user of a given country has a friend in another country. To do this, we count the total number of friendships originating from each country and divide the number of friendships of each country pair by the total of its first country.

In [None]:
# Total number of friendships originating from each country.
# Only `number_friends` is summed: aggregating the whole frame would also try
# to "sum" the string column `country_2`, which recent pandas versions do by
# concatenating the codes instead of silently dropping the column. That extra
# column would then collide with `country_2` in the merge that follows.
total_friends_country = (
    friends_countries
    .groupby(by=['country_1'])['number_friends']
    .sum()
    .reset_index()
    .rename(columns={'number_friends': 'total_number_friends'})
)
total_friends_country.head()

Unnamed: 0,country_1,total_number_friends
0,AD,1
1,AE,361
2,AF,17
3,AG,27
4,AL,3


We can put all the information together to get a dataframe containing each pair of countries linked by friendships, the number of friendships that link them, and the total number of friendships the first country has.

In [None]:
# Attach each origin country's total friendship count to its country-pair rows.
friends_countries = pd.merge(
    friends_countries,
    total_friends_country,
    how='inner',
    on='country_1',
)
friends_countries.head()

Unnamed: 0,country_1,country_2,number_friends,total_number_friends
0,ZW,US,3,4
1,ZW,CH,1,4
2,ZM,US,77,89
3,ZM,CA,3,89
4,ZM,FR,2,89


We now compute the probability of each friendship for every country, which gives an idea of the degree of affinity a country has with the others.

In [None]:
# Share of country_1's friendships that go to country_2.
friends_countries = friends_countries.assign(
    proba_friends=lambda pairs: pairs['number_friends'].div(pairs['total_number_friends'])
)
friends_countries.head()

Unnamed: 0,country_1,country_2,number_friends,total_number_friends,proba_friends
0,ZW,US,3,4,0.75
1,ZW,CH,1,4,0.25
2,ZM,US,77,89,0.865169
3,ZM,CA,3,89,0.033708
4,ZM,FR,2,89,0.022472


In [None]:
# Keep, for every origin country, the 10 destination countries with the highest
# friendship share. The sort must be descending: an ascending sort followed by
# head(10) would keep the 10 *least* likely countries instead.
# .copy() gives an independent frame so that columns can be added to it later
# without pandas warning about writing to a slice of `friends_countries`.
top_friendships = (
    friends_countries
    .sort_values('proba_friends', ascending=False)
    .groupby('country_1')
    .head(10)
    .copy()
)

In [None]:
# Lookup tables keyed by pycountry's two-letter code (alpha_2).
countries2letters_3letters = {entry.alpha_2: entry.alpha_3 for entry in pycountry.countries}
countries2letters_name = {entry.alpha_2: entry.name for entry in pycountry.countries}


def _lookup(table):
    """Return a function mapping a code to its entry in `table`, or 'Unknown code'."""
    return lambda code: table.get(code, 'Unknown code')


# Human-readable name of the origin country, plus three-letter code and name of
# the destination country.
top_friendships['country_1_name'] = top_friendships['country_1'].apply(_lookup(countries2letters_name))
top_friendships['country_2_3letters'] = top_friendships['country_2'].apply(_lookup(countries2letters_3letters))
top_friendships['country_2_name'] = top_friendships['country_2'].apply(_lookup(countries2letters_name))

# Express the share as a percentage with two decimals.
# NOTE: this overwrites the column, so re-running the cell rescales it again.
top_friendships['proba_friends'] = top_friendships['proba_friends'].mul(100).round(2)

top_friendships.head()

Unnamed: 0,country_1,country_2,number_friends,total_number_friends,proba_friends,country_1_name,country_2_3letters,country_2_name
0,ZW,US,3,4,75.0,Zimbabwe,USA,United States
1,ZW,CH,1,4,25.0,Zimbabwe,CHE,Switzerland
2,ZM,US,77,89,86.52,Zambia,USA,United States
3,ZM,CA,3,89,3.37,Zambia,CAN,Canada
4,ZM,FR,2,89,2.25,Zambia,FRA,France


We check the top friendship countries of people living in Switzerland:

In [None]:
# Destination country names for rows whose origin country is Switzerland.
top_friendships.loc[top_friendships['country_1_name'] == 'Switzerland', 'country_2_name']

1010    USA
1011    GBR
1012    JPN
1013    CAN
1014    SWE
1015    DEU
1016    AUS
1017    ESP
1018    FRA
1019    CHE
Name: country_2_3letters, dtype: object

In [None]:
# Build one choropleth trace per origin country and a dropdown that switches
# between them. Only the trace of the initially selected country starts visible.

# Data
df = top_friendships

# Country whose map is shown when the figure first opens.
first_title = 'Switzerland'


def friendship_title(country, n_countries):
    """Return the figure title for `country`, given how many destinations it lists."""
    return f"<b>Top {n_countries} friends' countries from people living in {country}</b>"


# Origin countries offered in the dropdown, sorted alphabetically. Codes that
# could not be resolved carry the placeholder name 'Unknown code'; they are
# removed by value rather than by a hard-coded position in the sorted array.
countries_from = [
    name for name in np.unique(df.country_1_name.to_numpy()).tolist()
    if name != 'Unknown code'
]

# define traces and buttons at once
traces = []
buttons = []
for country in countries_from:
    rows = df[df.country_1_name == country]  # filter once per country

    traces.append(go.Choropleth(
        locations=rows['country_2_3letters'],  # Spatial coordinates
        colorbar_title="Percentage of friendships",
        z=rows['proba_friends'].astype(float),  # Data to be color-coded
        visible=(country == first_title),
        colorscale='mint',
        text=rows['country_2_name'] +
             "<br>" + rows['proba_friends'].astype(str) + "% of friendships",
        hoverinfo="text",
    ))

    # Selecting a button shows only this country's trace and updates the title.
    buttons.append(dict(label=country,
                        method="update",
                        args=[{"visible": [name == country for name in countries_from]},
                              {"title": friendship_title(country, len(rows))}]))

# Make the dropdown start on the initially visible country. The position is
# looked up instead of hard-coded; a ValueError here means that country is
# absent from the data.
idx_CH = countries_from.index(first_title)
updatemenus = [{"active": idx_CH,
                "buttons": buttons,
               }]

# Show figure
fig = go.Figure(data=traces,
                layout=dict(updatemenus=updatemenus))
fig.update_geos(projection_type="natural earth")

# The dropdown only sets a title when a button is clicked, so set the first
# title explicitly, using the same wording as the buttons.
n_first = len(df[df.country_1_name == first_title])
fig.update_layout(title=friendship_title(first_title, n_first), title_x=0.5)
fig.show()

In [None]:
# Save the interactive map as a standalone HTML file next to the notebook.
html_path = "top10friends.html"
fig.write_html(html_path)