In [361]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import itertools
import networkx as nx
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objects as go
import math
from plotly.offline import plot

### load the data

In [340]:
# Home locations of the users of both networks (B and G).
# NOTE: pickle files can execute arbitrary code when loaded; only read trusted files.
home_columns = ["longitude", "latitude", "country"]

homes_B = pd.read_pickle("data/B_homes.pickle")
homes_B.columns = home_columns
homes_G = pd.read_pickle("data/G_homes.pickle")
homes_G.columns = home_columns

# Only the home country is used below, so the coordinates are left out.
homes_B = homes_B[["country"]]
homes_G = homes_G[["country"]]

# Friendship edges: one (from_id, to_id) pair per line; rows with missing values are discarded.
names_edges = ["from_id", "to_id"]
edges_B = pd.read_table("Brightkite_edges.txt", names=names_edges)
edges_B = edges_B.dropna()
edges_G = pd.read_table("Gowalla_edges.txt", names=names_edges)
edges_G = edges_G.dropna()

# Country metadata (the later cells read its alpha_2, languages and main_religion columns).
countries = pd.read_pickle("data/countries.pkl")

homes_B

Unnamed: 0_level_0,country
user_id,Unnamed: 1_level_1
0,US
1,US
2,US
3,US
4,FI
...,...
58222,LB
58224,LB
58225,LB
58226,LB


### Go through the homes and add each user's religion and language

In [341]:
# Per-country attributes that get attached to every user.
country_info = countries[["alpha_2", "languages", "main_religion"]]

# Move the user ids from the index into a regular column first (a merge does not
# keep the index), then look up language and religion via the home country.
# The inner join keeps only users whose country code is present in `countries`.
homes_B = (
    homes_B
    .reset_index()
    .merge(country_info, how="inner", left_on="country", right_on="alpha_2")
    .drop(columns=["alpha_2"])
)
homes_G = (
    homes_G
    .reset_index()
    .merge(country_info, how="inner", left_on="country", right_on="alpha_2")
    .drop(columns=["alpha_2"])
)

homes_B

Unnamed: 0,user_id,country,languages,main_religion
0,0,US,English,Protestant
1,1,US,English,Protestant
2,2,US,English,Protestant
3,3,US,English,Protestant
4,5,US,English,Protestant
...,...,...,...,...
50470,58226,LB,Arabic,Islam
50471,58227,LB,Arabic,Islam
50472,52716,SN,French,Islam
50473,53215,MR,Arabic,Islam


### remove bidirectional edges, i.e. remove double edges

In [342]:
# Put the smaller id of every edge first, so that (a, b) and (b, a) become the
# same row, and then drop the repeated rows. The original row index is kept.
edge_columns = ["from_id", "to_id"]

edges_B = pd.DataFrame(
    np.sort(edges_B[edge_columns].to_numpy(), axis=1),
    columns=edge_columns,
    index=edges_B.index,
).drop_duplicates()

edges_G = pd.DataFrame(
    np.sort(edges_G[edge_columns].to_numpy(), axis=1),
    columns=edge_columns,
    index=edges_G.index,
).drop_duplicates()

edges_B

Unnamed: 0,from_id,to_id
0,0,1
1,0,2
2,0,3
3,0,4
4,0,5
...,...,...
428142,58219,58224
428144,58220,58225
428145,58220,58226
428151,58225,58226


Create a dataframe that has one entry for each edge between friends with their metainformation

In [343]:
# Column names of the final table: the user's attributes followed by the friend's.
friend_table_columns = ["id", "country", "language", "religion",
                        "friend_id", "friend_country", "friend_language", "friend_religion"]


def _friend_table(homes, edges):
    """
    Return one row per edge, holding the attributes of both of its end points.

    homes -- user table with a user_id column plus the per-user attributes
    edges -- edge table with from_id / to_id columns
    Edges with an end point that is missing from `homes` are dropped by the inner joins.
    """
    # attach the friend ids to every user
    with_friend_id = homes.merge(edges, how="inner", left_on="user_id", right_on="from_id")
    with_friend_id = with_friend_id.drop(columns=["from_id"])

    # attach the friend's own attributes
    with_friend_info = with_friend_id.merge(homes, how="inner", left_on="to_id", right_on="user_id")
    with_friend_info = with_friend_info.drop(columns=["to_id"])

    # replace the suffixed merge names by readable ones
    with_friend_info.columns = friend_table_columns
    return with_friend_info


merged_B = _friend_table(homes_B, edges_B)
merged_G = _friend_table(homes_G, edges_G)

merged_B

Unnamed: 0,id,country,language,religion,friend_id,friend_country,friend_language,friend_religion
0,0,US,English,Protestant,1,US,English,Protestant
1,0,US,English,Protestant,2,US,English,Protestant
2,0,US,English,Protestant,3,US,English,Protestant
3,1,US,English,Protestant,3,US,English,Protestant
4,0,US,English,Protestant,4,FI,Finnish,Evangelical
...,...,...,...,...,...,...,...,...
193126,58220,LB,Arabic,Islam,58225,LB,Arabic,Islam
193127,58220,LB,Arabic,Islam,58226,LB,Arabic,Islam
193128,58225,LB,Arabic,Islam,58226,LB,Arabic,Islam
193129,58225,LB,Arabic,Islam,58227,LB,Arabic,Islam


And now, get only international friends, i.e., where the countries are different.

In [344]:
# A friendship is international when the two home countries differ.
crosses_border_B = merged_B["country"].ne(merged_B["friend_country"])
crosses_border_G = merged_G["country"].ne(merged_G["friend_country"])

international_B = merged_B.loc[crosses_border_B]
international_G = merged_G.loc[crosses_border_G]

international_B

Unnamed: 0,id,country,language,religion,friend_id,friend_country,friend_language,friend_religion
4,0,US,English,Protestant,4,FI,Finnish,Evangelical
5,1,US,English,Protestant,4,FI,Finnish,Evangelical
6,3,US,English,Protestant,4,FI,Finnish,Evangelical
9,4,FI,Finnish,Evangelical,5,US,English,Protestant
16,4,FI,Finnish,Evangelical,7,US,English,Protestant
...,...,...,...,...,...,...,...,...
193087,42108,KI,I-Kiribati,Roman Catholic,55105,IT,Italian,Roman Catholic
193088,42847,GN,French,Islam,55313,PT,Portuguese,Roman Catholic
193089,44750,LT,Lithuanian,Roman Catholic,55790,CY,Greek,Orthodox
193107,58152,LB,Arabic,Islam,58205,US,English,Protestant


Append them to one big one

In [345]:
# Stack the international friendships of both networks into one table.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat gives the same result (the row index of each part is preserved).
international = pd.concat([international_B, international_G])

international

Unnamed: 0,id,country,language,religion,friend_id,friend_country,friend_language,friend_religion
4,0,US,English,Protestant,4,FI,Finnish,Evangelical
5,1,US,English,Protestant,4,FI,Finnish,Evangelical
6,3,US,English,Protestant,4,FI,Finnish,Evangelical
9,4,FI,Finnish,Evangelical,5,US,English,Protestant
16,4,FI,Finnish,Evangelical,7,US,English,Protestant
...,...,...,...,...,...,...,...,...
452969,42114,LT,Lithuanian,Roman Catholic,135702,RS,Serbian,Roman Catholic
452970,42114,LT,Lithuanian,Roman Catholic,135703,RS,Serbian,Roman Catholic
452971,42114,LT,Lithuanian,Roman Catholic,135704,BA,Bosnian,Islam
452974,85229,SN,French,Islam,146217,FR,French,Roman Catholic


### Religion Information DataFrame

In [346]:
# Order the two religions of every friendship alphabetically so that (a, b) and
# (b, a) land in the same group -- the direction of a friendship does not matter.
# Rows with a missing religion are left out (a groupby would ignore them as well).
religion_pairs = international[["religion", "friend_religion"]].dropna()
religion_pairs = pd.DataFrame(
    np.sort(religion_pairs.to_numpy(), axis=1),
    columns=["from_religion", "to_religion"],
)

# Count the friendships per unordered pair, most frequent pair first.
# (Replaces the Series.iteritems loops; iteritems was removed in pandas 2.0.)
rel_count_df = (
    religion_pairs
    .groupby(["from_religion", "to_religion"])
    .size()
    .reset_index(name="count")
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
)

rel_count_df

Unnamed: 0,from_religion,to_religion,count
0,Protestant,Roman Catholic,32212
1,Christianity,Protestant,23500
2,Buddhism,Protestant,7215
3,Christianity,Roman Catholic,6322
4,Protestant,Protestant,5674
5,Roman Catholic,Roman Catholic,4690
6,Islam,Protestant,3766
7,Evangelical,Protestant,3339
8,Christianity,Evangelical,2303
9,Buddhism,Roman Catholic,1838


Calculate the relative amount by dividing each count by the number of all possible friendships between the two values.

In [347]:
# Number of users per religion in each network. Computed once up front instead of
# re-filtering the home tables for every row of rel_count_df.
religion_sizes_B = homes_B["main_religion"].value_counts()
religion_sizes_G = homes_G["main_religion"].value_counts()

relative = []

# The (theoretical) maximum number of connections between two religions is the
# product of the number of users belonging to each of them, summed over both networks.
for from_religion, to_religion, count in zip(rel_count_df["from_religion"],
                                             rel_count_df["to_religion"],
                                             rel_count_df["count"]):
    # possible friendships within each network (0 users if a religion is absent from it)
    total_possible_B = int(religion_sizes_B.get(from_religion, 0)) * int(religion_sizes_B.get(to_religion, 0))
    total_possible_G = int(religion_sizes_G.get(from_religion, 0)) * int(religion_sizes_G.get(to_religion, 0))
    # observed friendships relative to the possible ones
    relative.append(count / (total_possible_B + total_possible_G))

rel_count_df["relative_count"] = relative

rel_count_df

Unnamed: 0,from_religion,to_religion,count,relative_count
0,Protestant,Roman Catholic,32212,3.2e-05
1,Christianity,Protestant,23500,1.6e-05
2,Buddhism,Protestant,7215,2.1e-05
3,Christianity,Roman Catholic,6322,1.7e-05
4,Protestant,Protestant,5674,1e-06
5,Roman Catholic,Roman Catholic,4690,1.9e-05
6,Islam,Protestant,3766,1.9e-05
7,Evangelical,Protestant,3339,1.1e-05
8,Christianity,Evangelical,2303,2e-05
9,Buddhism,Roman Catholic,1838,2.2e-05


Now rescale the relative counts linearly so that the largest value maps to 4 and a value of 0 would map to 1 (note: the mapping is linear, not logarithmic).

In [348]:
# current smallest and largest relative count (only the maximum enters the formula below)
mini = rel_count_df.relative_count.min()
maxi = rel_count_df.relative_count.max()

# bounds of the target range
new_mini = 1.
new_maxi = 4.

# linear rescaling of the whole column at once: 0 -> new_mini, maxi -> new_maxi
rel_count_df["scaled_relative_count"] = (
    (new_maxi - new_mini) * rel_count_df["relative_count"] / maxi + new_mini
)

rel_count_df

Unnamed: 0,from_religion,to_religion,count,relative_count,scaled_relative_count
0,Protestant,Roman Catholic,32212,3.2e-05,1.400784
1,Christianity,Protestant,23500,1.6e-05,1.197323
2,Buddhism,Protestant,7215,2.1e-05,1.267096
3,Christianity,Roman Catholic,6322,1.7e-05,1.216297
4,Protestant,Protestant,5674,1e-06,1.0172
5,Roman Catholic,Roman Catholic,4690,1.9e-05,1.239407
6,Islam,Protestant,3766,1.9e-05,1.239451
7,Evangelical,Protestant,3339,1.1e-05,1.14393
8,Christianity,Evangelical,2303,2e-05,1.251489
9,Buddhism,Roman Catholic,1838,2.2e-05,1.280676


Add number of users who belong to this religion

In [349]:
def _users_with_religion(religion):
    """Count the users of both networks whose main_religion equals `religion`."""
    return sum(int((homes["main_religion"] == religion).sum()) for homes in (homes_B, homes_G))


# every religion that appears on either side of a counted pair
unique_religions = set(
    rel_count_df["from_religion"].to_list() + rel_count_df["to_religion"].to_list()
)

# absolute number of users per religion
nr_users_religion_og = {r: _users_with_religion(r) for r in unique_religions}

# the most common religion defines the upper end of the scale
maxi = max(nr_users_religion_og.values())

# same linear rescaling as for the edges (new_mini / new_maxi come from the cell above)
nr_users_religion = {
    religion: (new_maxi - new_mini) * users / maxi + new_mini
    for religion, users in nr_users_religion_og.items()
}

nr_users_religion

{'Anglican': 1.008396668638784,
 'Christianity': 1.9783142948163657,
 'Roman Catholic': 1.7268579620443272,
 'Hinduism': 1.0133117917444137,
 'Evangelical': 1.1960246666363263,
 'Protestant': 4.0,
 'Buddhism': 1.2618668365721568,
 'Orthodox': 1.0273745050744094,
 'Catholic': 1.002628225549538,
 'Islam': 1.1317867382696947}

### Language Information DataFrame

In [350]:
# Order the two languages of every friendship alphabetically so that (a, b) and
# (b, a) land in the same group -- the direction of a friendship does not matter.
# Rows with a missing language are left out (a groupby would ignore them as well).
language_pairs = international[["language", "friend_language"]].dropna()
language_pairs = pd.DataFrame(
    np.sort(language_pairs.to_numpy(), axis=1),
    columns=["from_language", "to_language"],
)

# Count the friendships per unordered pair, most frequent pair first.
# (Replaces the Series.iteritems loops; iteritems was removed in pandas 2.0.)
lang_count_df = (
    language_pairs
    .groupby(["from_language", "to_language"])
    .size()
    .reset_index(name="count")
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
)

lang_count_df

Unnamed: 0,from_language,to_language,count
0,English,English,34503
1,English,German,8853
2,English,Swedish,5881
3,Dutch,English,5034
4,English,Japanese,4277
...,...,...,...
672,Dhivehi,Russian,1
673,Dhivehi,Italian,1
674,Danish,Yue,1
675,Japanese,Serbian,1


###### Calculate the relative amount by dividing each count by the number of all possible friendships between the two values

In [351]:
# Number of users per language in each network. Computed once up front instead of
# re-filtering the home tables for every row of lang_count_df.
language_sizes_B = homes_B["languages"].value_counts()
language_sizes_G = homes_G["languages"].value_counts()

relative = []

# The (theoretical) maximum number of connections between two languages is the
# product of the number of users speaking each of them, summed over both networks.
for from_language, to_language, count in zip(lang_count_df["from_language"],
                                             lang_count_df["to_language"],
                                             lang_count_df["count"]):
    # possible friendships within each network (0 users if a language is absent from it)
    total_possible_B = int(language_sizes_B.get(from_language, 0)) * int(language_sizes_B.get(to_language, 0))
    total_possible_G = int(language_sizes_G.get(from_language, 0)) * int(language_sizes_G.get(to_language, 0))
    # observed friendships relative to the possible ones
    relative.append(count / (total_possible_B + total_possible_G))

lang_count_df["relative_count"] = relative

lang_count_df

Unnamed: 0,from_language,to_language,count,relative_count
0,English,English,34503,0.000007
1,English,German,8853,0.000019
2,English,Swedish,5881,0.000005
3,Dutch,English,5034,0.000024
4,English,Japanese,4277,0.000027
...,...,...,...,...
672,Dhivehi,Russian,1,0.000490
673,Dhivehi,Italian,1,0.000108
674,Danish,Yue,1,0.000004
675,Japanese,Serbian,1,0.000012


Now rescale the relative counts linearly so that the largest value maps to 5 and a value of 0 would map to 1 (note: the mapping is linear, not logarithmic).

In [352]:
# current smallest and largest relative count (only the maximum enters the formula below)
mini = lang_count_df.relative_count.min()
maxi = lang_count_df.relative_count.max()

# bounds of the target range
new_mini = 1.
new_maxi = 5.

# linear rescaling of the whole column at once: 0 -> new_mini, maxi -> new_maxi
lang_count_df["scaled_relative_count"] = (
    (new_maxi - new_mini) * lang_count_df["relative_count"] / maxi + new_mini
)

lang_count_df

Unnamed: 0,from_language,to_language,count,relative_count,scaled_relative_count
0,English,English,34503,0.000007,1.000340
1,English,German,8853,0.000019,1.000921
2,English,Swedish,5881,0.000005,1.000260
3,Dutch,English,5034,0.000024,1.001181
4,English,Japanese,4277,0.000027,1.001335
...,...,...,...,...,...
672,Dhivehi,Russian,1,0.000490,1.024183
673,Dhivehi,Italian,1,0.000108,1.005325
674,Danish,Yue,1,0.000004,1.000174
675,Japanese,Serbian,1,0.000012,1.000589


Add number of users who belong to this language

In [353]:
def _users_with_language(language):
    """Count the users of both networks whose languages entry equals `language`."""
    return sum(int((homes["languages"] == language).sum()) for homes in (homes_B, homes_G))


# every language that appears on either side of a counted pair
unique_languages = set(
    lang_count_df["from_language"].to_list() + lang_count_df["to_language"].to_list()
)

# absolute number of users per language
nr_users_language_og = {l: _users_with_language(l) for l in unique_languages}

# the most common language defines the upper end of the scale
maxi = max(nr_users_language_og.values())

# same linear rescaling as for the edges (new_mini / new_maxi come from the cell above)
nr_users_language = {
    language: (new_maxi - new_mini) * users / maxi + new_mini
    for language, users in nr_users_language_og.items()
}

nr_users_language

{'Vietnamese': 1.004720563183712,
 'Macedonian': 1.0002052418775527,
 'Luxembourgish': 1.0085791104817028,
 'Tajik': 1.0000410483755104,
 'Bosnian': 1.000451532130616,
 'Danish': 1.0332491841635367,
 'Italian': 1.0633786917882724,
 'Bangla': 1.0002462902530633,
 'Ukrainian': 1.003119676538801,
 'Punjabi': 1.0016419350204215,
 'Seychellois': 1.000082096751021,
 'Finnish': 1.0235617675430495,
 'Afghan': 1.0001231451265316,
 'Hindi': 1.015598382694005,
 'Kyrgyz': 1.0001231451265316,
 'English': 5.0,
 'Slovene': 1.0029965314122693,
 'Asante': 1.0016419350204215,
 'Persian': 1.0009441126367424,
 'Serbian': 1.0023808057796113,
 'Kinyarwanda': 1.0000410483755104,
 'Nepali': 1.0002052418775527,
 'Khmer': 1.0002052418775527,
 'Arabic': 1.1073825503355705,
 'Icelandic': 1.000697822383679,
 'Slovak': 1.00303757978778,
 'Thai': 1.0706442542536379,
 'Hungarian': 1.0105494325062085,
 'Creole': 1.0001641935020422,
 'Setswana': 1.000082096751021,
 'Sinhala': 1.0002873386285738,
 'Malay': 1.00685507871

## Generating a network Graph for Religions

In [367]:
def make_edge(x, y, text, width):
    """
    Build the plotly line trace for a single network edge.

    x, y  -- coordinate sequences of the line, handed to go.Scatter unchanged
    text  -- hover text of the edge (wrapped in a one-element list)
    width -- width of the drawn line
    """
    line_style = {'width': width, 'color': 'peru'}
    edge = go.Scatter(
        x=x,
        y=y,
        mode='lines',
        line=line_style,
        hoverinfo='text',
        text=[text],
    )
    return edge

# Undirected graph: one node per religion, one edge per pair of religions
# with at least one international friendship.
religion_graph = nx.Graph()

# nodes, sized by the scaled number of users (+2 so that small nodes stay visible)
for religion, scaled_users in nr_users_religion.items():
    religion_graph.add_node(religion, size=scaled_users + 2)

# edges, weighted by the scaled relative count; the absolute count is stored on the
# edge as well, so the hover text below needs no DataFrame lookup (the previous
# int(<one-row Series>) lookup is deprecated in recent pandas versions)
for from_religion, to_religion, count, scaled in zip(rel_count_df["from_religion"],
                                                     rel_count_df["to_religion"],
                                                     rel_count_df["count"],
                                                     rel_count_df["scaled_relative_count"]):
    religion_graph.add_edge(from_religion,
                            to_religion,
                            weight=float(scaled) + 1,
                            count=int(count))

# node positions; the spring layout starts from random positions, so the seed is
# fixed to get the same figure on every run (the value itself is arbitrary)
pos_ = nx.spring_layout(religion_graph, seed=42)

# one line trace per edge
edge_trace = []
for node_a, node_b, attrs in religion_graph.edges(data=True):
    # skip edges without a positive weight instead of drawing them with
    # coordinates left over from a previous iteration
    if not attrs['weight'] > 0:
        continue
    x0, y0 = pos_[node_a]
    x1, y1 = pos_[node_b]

    # hover text: absolute number of friendships between the two religions
    text = "<b>" + str(attrs['count']) + '</b>'

    trace = make_edge([x0, x1, None], [y0, y1, None], text, width=0.25 * attrs['weight'] ** 1.75)
    edge_trace.append(trace)

# collect position, size and hover text of every node
node_x, node_y, node_sizes, node_texts = [], [], [], []
for node, attrs in religion_graph.nodes(data=True):
    x, y = pos_[node]
    node_x.append(x)
    node_y.append(y)
    node_sizes.append(5 * attrs['size'])
    node_texts.append('<b>' + node + '</b>' + "<br>" + "# of Practitioners: " + str(nr_users_religion_og[node]))

# a single marker trace holding all nodes
node_trace = go.Scatter(x         = node_x,
                        y         = node_y,
                        text      = node_texts,
                        textposition = "top center",
                        textfont_size = 10,
                        mode      = 'markers',
                        hoverinfo = 'text',
                        marker    = dict(color = ['peru'] * len(node_x),
                                         size  = node_sizes,
                                         line  = None))

# customize layout
layout = go.Layout(
    paper_bgcolor='rgba(0,0,0,0)', # transparent background
    plot_bgcolor='rgba(0,0,0,0)', # transparent 2nd background
    xaxis =  {'showgrid': False, 'zeroline': False}, # no gridlines
    yaxis = {'showgrid': False, 'zeroline': False}, # no gridlines
)

# create figure: edges first, then the nodes on top of them
fig = go.Figure(layout = layout)
for trace in edge_trace:
    fig.add_trace(trace)
fig.add_trace(node_trace)
# remove the legend, set background colour and title
fig.update_layout(showlegend = False, plot_bgcolor="seashell", title="Friendships between religions")
# remove tick labels
fig.update_xaxes(showticklabels = False)
fig.update_yaxes(showticklabels = False)
# show figure
fig.show()

# save it; explicit UTF-8 so that writing does not depend on the platform's default encoding
html = plot(fig, auto_open=False, output_type='div')
with open("./friendships_religions.html", 'w', encoding="utf-8") as file:
    file.write(html)

### Generating a network Graph for Languages

In [366]:
# same as before
def make_edge(x, y, text, width):
    """
    Build the plotly line trace for a single network edge.

    Same behaviour as the make_edge defined for the religion graph; this
    definition replaces it when the cell is run.

    x, y  -- coordinate sequences of the line, handed to go.Scatter unchanged
    text  -- hover text of the edge (wrapped in a one-element list)
    width -- width of the drawn line
    """
    line_style = {'width': width, 'color': 'peru'}
    edge = go.Scatter(
        x=x,
        y=y,
        mode='lines',
        line=line_style,
        hoverinfo='text',
        text=[text],
    )
    return edge

# Undirected graph: one node per language, one edge per pair of languages
# with at least one international friendship.
language_graph = nx.Graph()

# nodes, sized by the scaled number of users
for language, scaled_users in nr_users_language.items():
    language_graph.add_node(language, size=scaled_users)

# edges, weighted by the scaled relative count; the absolute count is stored on the
# edge as well, so the hover text below needs no DataFrame lookup (the previous
# int(<one-row Series>) lookup is deprecated in recent pandas versions)
for from_language, to_language, count, scaled in zip(lang_count_df["from_language"],
                                                     lang_count_df["to_language"],
                                                     lang_count_df["count"],
                                                     lang_count_df["scaled_relative_count"]):
    language_graph.add_edge(from_language,
                            to_language,
                            weight=float(scaled),
                            count=int(count))

# node positions; the spring layout starts from random positions, so the seed is
# fixed to get the same figure on every run (the value itself is arbitrary)
pos_ = nx.spring_layout(language_graph, seed=42)

# one line trace per edge
edge_trace = []
for node_a, node_b, attrs in language_graph.edges(data=True):
    # skip edges without a positive weight instead of drawing them with
    # coordinates left over from a previous iteration
    if not attrs['weight'] > 0:
        continue
    x0, y0 = pos_[node_a]
    x1, y1 = pos_[node_b]

    # hover text: absolute number of friendships between the two languages
    text = "<b>" + str(attrs['count']) + '</b>'

    trace = make_edge([x0, x1, None], [y0, y1, None], text, width=0.25 * attrs['weight'] ** 1.75)
    edge_trace.append(trace)

# collect position, size and hover text of every node
node_x, node_y, node_sizes, node_texts = [], [], [], []
for node, attrs in language_graph.nodes(data=True):
    x, y = pos_[node]
    node_x.append(x)
    node_y.append(y)
    node_sizes.append(5 * attrs['size'])
    node_texts.append('<b>' + node + '</b>' + "<br>" + "# of Speakers: " + str(nr_users_language_og[node]))

# a single marker trace holding all nodes
node_trace = go.Scatter(x         = node_x,
                        y         = node_y,
                        text      = node_texts,
                        textposition = "top center",
                        textfont_size = 10,
                        mode      = 'markers',
                        hoverinfo = 'text',
                        marker    = dict(color = ['peru'] * len(node_x),
                                         size  = node_sizes,
                                         line  = None))

layout = go.Layout(
    paper_bgcolor='rgba(0,0,0,0)', # transparent background
    plot_bgcolor='rgba(0,0,0,0)', # transparent 2nd background
    xaxis =  {'showgrid': False, 'zeroline': False}, # no gridlines
    yaxis = {'showgrid': False, 'zeroline': False}, # no gridlines
)

# create figure: edges first, then the nodes on top of them
fig = go.Figure(layout = layout)
for trace in edge_trace:
    fig.add_trace(trace)
fig.add_trace(node_trace)
# remove the legend, set background colour and title
fig.update_layout(showlegend = False, plot_bgcolor = "seashell", title="Friendships between languages")
# remove tick labels
fig.update_xaxes(showticklabels = False)
fig.update_yaxes(showticklabels = False)
fig.show()

# save it; explicit UTF-8 so that writing does not depend on the platform's default encoding
html = plot(fig, auto_open=False, output_type='div')
with open("./friendships_languages.html", 'w', encoding="utf-8") as file:
    file.write(html)