In [5]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import itertools

In [6]:
homes_B = pd.read_pickle("data/B_homes.pickle")
homes_G = pd.read_pickle("data/G_homes.pickle")

names_edges = ["from_id", "to_id"]

edges_B = pd.read_table("Brightkite_edges.txt", names=names_edges).dropna()
edges_G = pd.read_table("Gowalla_edges.txt", names=names_edges).dropna()

homes_B

Unnamed: 0_level_0,longitude,latitude,country
Unnamed: 0_level_1,mean,mean,Unnamed: 3_level_1
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,-105.009140,39.756108,US
1,-122.353819,37.581821,US
2,-105.041977,39.771737,US
3,-122.424280,37.765768,US
4,24.937299,60.173985,FI
...,...,...,...
58222,35.833333,33.833333,LB
58224,35.833333,33.833333,LB
58225,35.833333,33.833333,LB
58226,35.833333,33.833333,LB


# Go through the edges dataframe and replace user id's with corresponding country codes

In [7]:
from_country = []
to_country = []
for row in tqdm(edges_B.iterrows()):
    from_id = row[1]["from_id"]
    to_id = row[1]["to_id"]
    if from_id in homes_B.index and to_id in homes_B.index: # TODO leave out if no home addresses
        from_country.append(homes_B.loc[from_id]["country"][0])
        to_country.append(homes_B.loc[to_id]["country"][0])

428156it [06:56, 1028.04it/s]


### Build Dataframe and Count Occurrences

In [8]:
edges_countries = pd.DataFrame(list(zip(from_country, to_country)), 
                                   columns =["from_country", "to_country"]) 
# count 
edges_countries = edges_countries.groupby(["from_country", "to_country"]).size().reset_index(name='Count')
# remove country to oneself
edges_countries = edges_countries[edges_countries['from_country'] != edges_countries['to_country']]
edges_countries

Unnamed: 0,from_country,to_country,Count
1,AE,AT,3
2,AE,AU,3
3,AE,BH,1
4,AE,BR,4
5,AE,CA,2
...,...,...,...
2461,ZA,TH,1
2462,ZA,US,93
2464,ZA,ZM,3
2465,ZM,ZA,3


In [9]:
edges_countries[edges_countries["from_country"] == "US"].sort_values(["Count"], ascending=False)

Unnamed: 0,from_country,to_country,Count
2268,US,GB,4489
2247,US,CA,3221
2291,US,JP,2599
2233,US,AU,2058
2318,US,NL,1065
...,...,...,...
2288,US,JE,1
2338,US,SD,1
2304,US,MD,1
2299,US,LT,1


# Religions & Languages

In [10]:
countries = pd.read_pickle("data/countries.pkl")
countries

Unnamed: 0,name,alpha_2,alpha_3,languages,main_religion
0,Afghanistan,AF,AFG,"[Afghan, Pashto, Uzbek]",Islam
1,Albania,AL,ALB,"[Albanian, Greek]",Islam
2,Algeria,DZ,DZA,"[Arabic, French, Berber]",Islam
3,Andorra,AD,AND,"[Catalan, French, Castilian]",Christianity
4,Angola,AO,AGO,"[Portuguese, Umbundu, Kikongo]",Christianity
...,...,...,...,...,...
192,"Korea, Democratic People's Republic of",KP,PRK,[Korean],Buddhism
193,"Korea, Republic of",KR,KOR,"[Korean, English, junior]",Christianity
194,Lao People's Democratic Republic,LA,LAO,"[Lao, French, English]",Buddhism
195,"Palestine, State of",PS,PSE,"[Palauan, English]",Islam


### Religions - Looking at Home Locations

## Brightkite

In [11]:
# check how many likelihood of living in a country for a given religion
unique_religions = list(set(countries["main_religion"]))
count_religions = {religion:0 for religion in unique_religions}
for country in tqdm(list(homes_B["country"])):
    for religion in countries[countries["alpha_2"]==country]["main_religion"]:
        count_religions[religion] += 1
count_religions = {k: v/len(homes_B) for k, v in sorted(count_religions.items(), key=lambda item: item[1])}
count_religions

100%|██████████| 50686/50686 [00:25<00:00, 2017.71it/s]


{'Hinduism': 0.005247997474647831,
 'Islam': 0.01594128556208815,
 'Buddhism': 0.07374817503847216,
 'Christianity': 0.9008996567099397}

## Gowalla

In [12]:
# check how many likelihood of living in a country for a given religion
unique_religions = list(set(countries["main_religion"]))
count_religions = {religion:0 for religion in unique_religions}
for country in tqdm(list(homes_G["country"])):
    for religion in countries[countries["alpha_2"]==country]["main_religion"]:
        count_religions[religion] += 1
count_religions = {k: v/len(homes_G) for k, v in sorted(count_religions.items(), key=lambda item: item[1])}
count_religions

100%|██████████| 107092/107092 [00:51<00:00, 2072.43it/s]


{'Hinduism': 0.0011578829417696933,
 'Islam': 0.03017965861128749,
 'Buddhism': 0.03673477010420947,
 'Christianity': 0.9232715795764389}

# For each user, check how many of his friends have what religion

We will create a dataframe here, we for each friendship, we will have the information of both friends

In [13]:
# add languages and religion to homes
merged = pd.merge(homes_B, countries[["alpha_2", "languages", "main_religion"]], how="inner", left_on="country", right_on="alpha_2")
merged.columns = ["drop1", "drop2", "country", "drop3", "languages", "main_religion"]
merged = merged.drop(columns=["drop1", "drop2", "drop3"])
# add friend ids to home locations
merged = pd.merge(merged, edges_B, how="inner", left_index=True, right_on="from_id").drop(columns=["to_id"])
merged.columns = ["country", "languages", "main_religion", "friend"]
# add friend home country
merged = pd.merge(merged, homes_B["country"], how="inner", left_on="friend", right_index=True)
merged.columns = ["country", "languages", "main_religion", "friend", "friend_country"]
# add friend languages and religion
merged = pd.merge(merged, countries[["alpha_2", "languages", "main_religion"]], how="inner", left_on="friend_country", right_on="alpha_2")
merged.columns = ["country", "languages", "main_religion", "friend", "friend_country", "drop1", "friend_languages", "friend_main_religion"]
merged = merged.drop(columns=["drop1", "friend"])
merged



Unnamed: 0,country,languages,main_religion,friend_country,friend_languages,friend_main_religion
0,US,"[English, Spanish, Chinese]",Christianity,US,"[English, Spanish, Chinese]",Christianity
1,US,"[English, Spanish, Chinese]",Christianity,US,"[English, Spanish, Chinese]",Christianity
2,US,"[English, Spanish, Chinese]",Christianity,US,"[English, Spanish, Chinese]",Christianity
3,US,"[English, Spanish, Chinese]",Christianity,US,"[English, Spanish, Chinese]",Christianity
4,US,"[English, Spanish, Chinese]",Christianity,US,"[English, Spanish, Chinese]",Christianity
...,...,...,...,...,...,...
395732,IE,"[English, Irish]",Christianity,LT,"[Lithuanian, Russian, Polish]",Christianity
395733,IT,"[Italian, German, French]",Christianity,SC,"[Seychellois, English, French]",Christianity
395734,DK,"[Danish, Faroese, Greenlandic]",Christianity,AL,"[Albanian, Greek]",Islam
395735,DK,"[Danish, Faroese, Greenlandic]",Christianity,AL,"[Albanian, Greek]",Islam


#### percentage of friendships in same country

In [14]:
len(merged[merged["country"] == merged["friend_country"]])/len(merged)

0.5623962379054777

##### percentage of friendships with same religion

In [15]:
len(merged[merged["main_religion"] == merged["friend_main_religion"]])/len(merged)

0.8340791990640248

In [16]:
merged.groupby(['main_religion', 'friend_main_religion']).size().sort_values(ascending=False)

main_religion  friend_main_religion
Christianity   Christianity            329473
               Buddhism                 52212
Buddhism       Christianity              7285
Christianity   Islam                     3054
               Hinduism                  1338
Islam          Christianity              1028
Buddhism       Buddhism                   577
Hinduism       Christianity               415
Islam          Buddhism                   145
Buddhism       Islam                      123
               Hinduism                    39
Islam          Islam                       19
               Hinduism                     8
Hinduism       Islam                        8
               Hinduism                     7
               Buddhism                     6
dtype: int64

What if we dont consider friendships in the same country

In [17]:
merged[merged["country"] != merged["friend_country"]].groupby(['main_religion', 'friend_main_religion']).size().sort_values(ascending=False)

main_religion  friend_main_religion
Christianity   Christianity            107283
               Buddhism                 52212
Buddhism       Christianity              7285
Christianity   Islam                     3054
               Hinduism                  1338
Islam          Christianity              1028
Hinduism       Christianity               415
Buddhism       Buddhism                   213
Islam          Buddhism                   145
Buddhism       Islam                      123
               Hinduism                    39
Islam          Islam                       19
               Hinduism                     8
Hinduism       Islam                        8
               Buddhism                     6
dtype: int64

##### percentage friends that have common language

In [18]:
merged["common_language"] = merged.apply(lambda row: sum(x in row[4] for x in row[1]), axis=1)
merged

Unnamed: 0,country,languages,main_religion,friend_country,friend_languages,friend_main_religion,common_language
0,US,"[English, Spanish, Chinese]",Christianity,US,"[English, Spanish, Chinese]",Christianity,3
1,US,"[English, Spanish, Chinese]",Christianity,US,"[English, Spanish, Chinese]",Christianity,3
2,US,"[English, Spanish, Chinese]",Christianity,US,"[English, Spanish, Chinese]",Christianity,3
3,US,"[English, Spanish, Chinese]",Christianity,US,"[English, Spanish, Chinese]",Christianity,3
4,US,"[English, Spanish, Chinese]",Christianity,US,"[English, Spanish, Chinese]",Christianity,3
...,...,...,...,...,...,...,...
395732,IE,"[English, Irish]",Christianity,LT,"[Lithuanian, Russian, Polish]",Christianity,0
395733,IT,"[Italian, German, French]",Christianity,SC,"[Seychellois, English, French]",Christianity,1
395734,DK,"[Danish, Faroese, Greenlandic]",Christianity,AL,"[Albanian, Greek]",Islam,0
395735,DK,"[Danish, Faroese, Greenlandic]",Christianity,AL,"[Albanian, Greek]",Islam,0


Count amount of common languages, given that it is not the same country

In [19]:
merged[merged["country"] != merged["friend_country"]].value_counts("common_language")

common_language
0    120679
1     49854
2      2631
3        12
dtype: int64

### Plots

In [77]:
G = nx.Graph()

node_list = set(merged["main_religion"].to_list() + merged["friend_main_religion"].to_list())

for r in node_list:
    G.add_node(r)

In [78]:
G.nodes()

NodeView(('Buddhism', 'Hinduism', 'Christianity', 'Islam'))

In [79]:
for i,j in merged.iterrows():
    G.add_edges_from([(j["main_religion"],j["friend_main_religion"])])

In [80]:
pos = nx.spring_layout(G, k=1.5, iterations=50)

In [81]:
for n, p in pos.items():
    G.nodes[n]['pos'] = p

In [82]:
import plotly
import plotly.graph_objects as go

edge_trace = go.Scatter(
    x=[],
    y=[],
    line=dict(width=5.5,color='#888'),
    hoverinfo='none',
    mode='lines')

for edge in G.edges():
    x0, y0 = G.nodes[edge[0]]['pos']
    x1, y1 = G.nodes[edge[1]]['pos']
    edge_trace['x'] += tuple([x0, x1, None])
    edge_trace['y'] += tuple([y0, y1, None])

In [83]:
node_trace = go.Scatter(
    x=[],
    y=[],
    text=[],
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        colorscale='RdBu',
        reversescale=True,
        color=[],
        size=40,
        colorbar=dict(
            thickness=10,
            title='Node Connections',
            xanchor='left',
            titleside='right'
        ),
        line=dict(width=0)))

for node in G.nodes():
    x, y = G.nodes[node]['pos']
    node_trace['x'] += tuple([x])
    node_trace['y'] += tuple([y])

In [None]:
for node, adjacencies in enumerate(G.adjacency()):
    node_trace['marker']['color']+=tuple([len(adjacencies[1])])
    node_info = adjacencies[0] +' # of connections: '+str(len(adjacencies[1]))
    node_trace['text']+=tuple([node_info])


In [None]:
#import plotly.offline.iplot as iplot
fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                title='<br>AT&T network connections',
                titlefont=dict(size=16),
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                annotations=[ dict(
                    text="No. of connections",
                    showarrow=False,
                    xref="paper", yref="paper") ],
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))


fig