In [10]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import itertools

In [3]:
homes_B = pd.read_pickle("B_homes.pickle")
homes_G = pd.read_pickle("G_homes.pickle")

names_edges = ["from_id", "to_id"]

edges_B = pd.read_table("Brightkite_edges.txt", names=names_edges).dropna()
edges_G = pd.read_table("Gowalla_edges.txt", names=names_edges).dropna()

# since the edges are bidrectional (and we only need one direction) we remove half the 
# values
edges_B[["from_id", "to_id"]] = np.sort(edges_B[["from_id", "to_id"]].values, axis=1)
edges_B = edges_B.drop_duplicates()
edges_G[["from_id", "to_id"]] = np.sort(edges_G[["from_id", "to_id"]].values, axis=1)
edges_G = edges_G.drop_duplicates()

edges_B

Unnamed: 0,from_id,to_id
0,0,1
1,0,2
2,0,3
3,0,4
4,0,5
...,...,...
428142,58219,58224
428144,58220,58225
428145,58220,58226
428151,58225,58226


# Go through the edges dataframe and replace user id's with corresponding country codes

In [4]:
from_country = []
to_country = []
for row in tqdm(edges_B.iterrows()):
    from_id = row[1]["from_id"]
    to_id = row[1]["to_id"]
    if from_id in homes_B.index and to_id in homes_B.index: # TODO leave out if no home addresses
        from_country.append(homes_B.loc[from_id]["country"][0])
        to_country.append(homes_B.loc[to_id]["country"][0])

214078it [01:49, 1947.96it/s]


### Build Dataframe and Count Occurrences

In [5]:
edges_countries = pd.DataFrame(list(zip(from_country, to_country)), 
                                   columns =["from_country", "to_country"]) 
# count 
edges_countries = edges_countries.groupby(["from_country", "to_country"]).size().reset_index(name='Count')
# remove country to oneself
edges_countries = edges_countries[edges_countries['from_country'] != edges_countries['to_country']]
edges_countries

Unnamed: 0,from_country,to_country,Count
1,AE,AT,1
2,AE,AU,3
3,AE,BH,1
4,AE,BR,1
5,AE,CA,1
...,...,...,...
1798,ZA,RU,1
1799,ZA,SD,1
1800,ZA,TH,1
1801,ZA,US,12


In [6]:
edges_countries[edges_countries["from_country"] == "US"].sort_values(["Count"], ascending=False)

Unnamed: 0,from_country,to_country,Count
1646,US,GB,2851
1625,US,CA,2096
1612,US,AU,1423
1669,US,JP,1244
1635,US,DE,676
...,...,...,...
1721,US,SN,1
1666,US,JE,1
1692,US,MZ,1
1725,US,TD,1


# Religions & Languages

In [47]:
countries = pd.read_pickle("countries.pkl")
countries

Unnamed: 0,name,alpha_2,alpha_3,languages,main_religion
0,Afghanistan,AF,AFG,"[Afghan, Pashto, Uzbek]",Islam
1,Albania,AL,ALB,"[Albanian, Greek]",Islam
2,Algeria,DZ,DZA,"[Arabic, French, Berber]",Islam
3,Andorra,AD,AND,"[Catalan, French, Castilian]",Christianity
4,Angola,AO,AGO,"[Portuguese, Umbundu, Kikongo]",Christianity
...,...,...,...,...,...
192,"Korea, Democratic People's Republic of",KP,PRK,[Korean],Buddhism
193,"Korea, Republic of",KR,KOR,"[Korean, English, junior]",Christianity
194,Lao People's Democratic Republic,LA,LAO,"[Lao, French, English]",Buddhism
195,"Palestine, State of",PS,PSE,"[Palauan, English]",Islam


### Religions - Looking at Home Locations

## Brightkite

In [50]:
# check how many likelihood of living in a country for a given religion
unique_religions = list(set(countries["main_religion"]))
count_religions = {religion:0 for religion in unique_religions}
for country in tqdm(list(homes_B["country"])):
    for religion in countries[countries["alpha_2"]==country]["main_religion"]:
        count_religions[religion] += 1
count_religions = {k: v/len(homes_B) for k, v in sorted(count_religions.items(), key=lambda item: item[1])}
count_religions

100%|██████████| 50686/50686 [00:12<00:00, 4164.26it/s]


{'Hinduism': 0.005247997474647831,
 'Islam': 0.01594128556208815,
 'Buddhism': 0.07374817503847216,
 'Christianity': 0.9008996567099397}

## Gowalla

In [51]:
# check how many likelihood of living in a country for a given religion
unique_religions = list(set(countries["main_religion"]))
count_religions = {religion:0 for religion in unique_religions}
for country in tqdm(list(homes_G["country"])):
    for religion in countries[countries["alpha_2"]==country]["main_religion"]:
        count_religions[religion] += 1
count_religions = {k: v/len(homes_G) for k, v in sorted(count_religions.items(), key=lambda item: item[1])}
count_religions

100%|██████████| 107092/107092 [00:25<00:00, 4237.36it/s]


{'Hinduism': 0.0011578829417696933,
 'Islam': 0.03017965861128749,
 'Buddhism': 0.03673477010420947,
 'Christianity': 0.9232715795764389}