In [157]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import itertools
import networkx as nx
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objects as go
import math

### load data

In [158]:
# Home tables: label the three raw columns, then discard the coordinates so
# that only the "country" column remains for every user.
# NOTE(review): read_pickle executes whatever the pickle contains - only load
# files from a trusted source.
homes_B = pd.read_pickle("data/B_homes.pickle")
homes_B.columns = ["longitude", "latitude", "country"]
homes_B = homes_B.drop(columns=["longitude", "latitude"])

homes_G = pd.read_pickle("data/G_homes.pickle")
homes_G.columns = ["longitude", "latitude", "country"]
homes_G = homes_G.drop(columns=["longitude", "latitude"])

# Friendship edge lists, read with explicit column names (the files are
# presumably header-less id pairs - confirm). Rows with missing values are
# discarded.
names_edges = ["from_id", "to_id"]
edges_B = pd.read_table("Brightkite_edges.txt", names=names_edges)
edges_B = edges_B.dropna()
edges_G = pd.read_table("Gowalla_edges.txt", names=names_edges)
edges_G = edges_G.dropna()

# Country metadata table (joined onto the home tables further down).
countries = pd.read_pickle("data/countries.pkl")

homes_B

Unnamed: 0_level_0,country
user_id,Unnamed: 1_level_1
0,US
1,US
2,US
3,US
4,FI
...,...
58222,LB
58224,LB
58225,LB
58226,LB


### Add the language and main religion of each user's home country to the home tables

In [159]:
def _with_country_meta(homes, country_table):
    """Return `homes` with language and main religion of the home country attached.

    The index of `homes` is moved into a regular column first. The frame is
    then inner-joined to `country_table` on the country code
    (`country` == `alpha_2`), so rows whose country has no match in
    `country_table` are dropped. The join key `alpha_2` is removed afterwards.
    """
    meta_cols = country_table[["alpha_2", "languages", "main_religion"]]
    flat = homes.reset_index()
    joined = flat.merge(meta_cols, how="inner", left_on="country", right_on="alpha_2")
    return joined.drop(columns=["alpha_2"])


# add languages and religions
homes_B = _with_country_meta(homes_B, countries)
homes_G = _with_country_meta(homes_G, countries)

homes_B

Unnamed: 0,user_id,country,languages,main_religion
0,0,US,English,Christianity
1,1,US,English,Christianity
2,2,US,English,Christianity
3,3,US,English,Christianity
4,5,US,English,Christianity
...,...,...,...,...
50470,58226,LB,Arabic,Islam
50471,58227,LB,Arabic,Islam
50472,52716,SN,French,Islam
50473,53215,MR,Arabic,Islam


### Remove duplicate edges: keep each friendship only once, regardless of its direction

In [160]:
def _canonical_unique_edges(edges):
    """Order the two endpoints of every edge and drop repeated edges.

    The row-wise sorted pair is written back into `edges` in place (smaller
    value ends up in `from_id`), so (a, b) and (b, a) become identical rows.
    Returns the frame with duplicate rows removed.
    """
    endpoints = ["from_id", "to_id"]
    ordered = np.sort(edges[endpoints].values, axis=1)
    edges[endpoints] = ordered
    return edges.drop_duplicates()


edges_B = _canonical_unique_edges(edges_B)
edges_G = _canonical_unique_edges(edges_G)

edges_B

Unnamed: 0,from_id,to_id
0,0,1
1,0,2
2,0,3
3,0,4
4,0,5
...,...,...
428142,58219,58224
428144,58220,58225
428145,58220,58226
428151,58225,58226


Create a DataFrame with one row per friendship edge, holding the meta information (country, language, religion) of both friends.

In [163]:
def _friend_pairs(homes, edges):
    """Build one row per edge carrying the home metadata of both endpoints.

    Step 1 joins `homes` to `edges` on `user_id` == `from_id`, which attaches
    the friend id (`to_id`) to the first user's metadata.
    Step 2 joins the result to `homes` again on `to_id` == `user_id`, which
    attaches the friend's metadata.
    Both joins are inner joins, so edges with an endpoint missing from `homes`
    are dropped. The eight resulting columns are renamed by position.
    """
    own_side = homes.merge(edges, how="inner", left_on="user_id", right_on="from_id")
    own_side = own_side.drop(columns=["from_id"])

    both_sides = own_side.merge(homes, how="inner", left_on="to_id", right_on="user_id")
    both_sides = both_sides.drop(columns=["to_id"])

    # Positional rename: first four columns describe the user, last four the friend.
    both_sides.columns = ["id", "country", "language", "religion", "friend_id", "friend_country", "friend_language", "friend_religion" ]
    return both_sides


merged_B = _friend_pairs(homes_B, edges_B)
merged_G = _friend_pairs(homes_G, edges_G)

merged_B

Unnamed: 0,id,country,language,religion,friend_id,friend_country,friend_language,friend_religion
0,0,US,English,Christianity,1,US,English,Christianity
1,0,US,English,Christianity,2,US,English,Christianity
2,0,US,English,Christianity,3,US,English,Christianity
3,1,US,English,Christianity,3,US,English,Christianity
4,0,US,English,Christianity,4,FI,Finnish,Christianity
...,...,...,...,...,...,...,...,...
193126,58220,LB,Arabic,Islam,58225,LB,Arabic,Islam
193127,58220,LB,Arabic,Islam,58226,LB,Arabic,Islam
193128,58225,LB,Arabic,Islam,58226,LB,Arabic,Islam
193129,58225,LB,Arabic,Islam,58227,LB,Arabic,Islam


And now, get only international friends, i.e., where the countries are different.

In [166]:
# Keep only the rows where the two friends' home countries differ.
international_B = merged_B.loc[merged_B["country"].ne(merged_B["friend_country"])]
international_G = merged_G.loc[merged_G["country"].ne(merged_G["friend_country"])]

international_B

Unnamed: 0,id,country,language,religion,friend_id,friend_country,friend_language,friend_religion
4,0,US,English,Christianity,4,FI,Finnish,Christianity
5,1,US,English,Christianity,4,FI,Finnish,Christianity
6,3,US,English,Christianity,4,FI,Finnish,Christianity
9,4,FI,Finnish,Christianity,5,US,English,Christianity
16,4,FI,Finnish,Christianity,7,US,English,Christianity
...,...,...,...,...,...,...,...,...
193087,42108,KI,I-Kiribati,Christianity,55105,IT,Italian,Christianity
193088,42847,GN,French,Islam,55313,PT,Portuguese,Christianity
193089,44750,LT,Lithuanian,Christianity,55790,CY,Greek,Christianity
193107,58152,LB,Arabic,Islam,58205,US,English,Christianity


Combine the international friendships of both datasets into a single DataFrame.

In [167]:
# Stack the Brightkite and Gowalla international friendships into one frame.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0;
# `pd.concat` produces the same frame and works on old and new versions alike.
# The original row labels are kept, so labels from the two sources can repeat.
# NOTE(review): `id` / `friend_id` presumably come from two separate networks
# and may collide across them - confirm before treating them as unique keys.
international = pd.concat([international_B, international_G])

international

Unnamed: 0,id,country,language,religion,friend_id,friend_country,friend_language,friend_religion
4,0,US,English,Christianity,4,FI,Finnish,Christianity
5,1,US,English,Christianity,4,FI,Finnish,Christianity
6,3,US,English,Christianity,4,FI,Finnish,Christianity
9,4,FI,Finnish,Christianity,5,US,English,Christianity
16,4,FI,Finnish,Christianity,7,US,English,Christianity
...,...,...,...,...,...,...,...,...
452969,42114,LT,Lithuanian,Christianity,135702,RS,Serbian,Roman Catholic
452970,42114,LT,Lithuanian,Christianity,135703,RS,Serbian,Roman Catholic
452971,42114,LT,Lithuanian,Christianity,135704,BA,Bosnian,Islam
452974,85229,SN,French,Islam,146217,FR,French,Christianity


### Religion Information DataFrame

In [181]:
# Number of international friendships per ordered (religion, friend_religion) pair.
rel_count = international.groupby(['religion', 'friend_religion']).size()

# Put the two labels of every pair in alphabetical order so that (A, B) and
# (B, A) fall into the same bucket. `Series.items()` replaces
# `Series.iteritems()`, which was deprecated in pandas 1.5 and removed in 2.0.
rel_pairs = [(*sorted(pair), n) for pair, n in rel_count.items()]

# Sum the counts of the now-identical pairs and rank them, most frequent first.
# After the groupby the pairs are already ordered, so no second sorting pass
# over the labels is needed.
rel_count_df = (
    pd.DataFrame(rel_pairs, columns=['from_religion', "to_religion", "count"])
    .groupby(["from_religion", "to_religion"])['count']
    .sum()
    .reset_index()
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
)

# NOTE(review): the result lists "Buddhism" and "Buddhist" (and "Roman Catholic"
# next to "Christianity") as separate labels; presumably these should be
# normalized in `countries` before counting - confirm.
rel_count_df

Unnamed: 0,from_religion,to_religion,count
0,Christianity,Christianity,82186
1,Buddhism,Christianity,9311
2,Christianity,Islam,5892
3,Christianity,Hinduism,1220
4,Buddhist,Christianity,1187
5,Islam,Islam,974
6,Christianity,Roman Catholic,915
7,Buddhism,Buddhism,619
8,Buddhism,Islam,324
9,Buddhism,Buddhist,258


###### Calculate the relative amount by dividing each count by the number of all possible friendships between the two groups

In [183]:
# TODO: `relative` is never filled in this cell; the ratio count / total still
# has to be appended once the denominator is settled.
relative = []

# Number of Brightkite users per main religion, computed once instead of
# re-filtering `homes_B` twice for every row of `rel_count_df`.
users_per_religion_B = homes_B["main_religion"].value_counts()

for indx, row in rel_count_df.iterrows():
    print(indx, row)
    n_from = int(users_per_religion_B.get(row["from_religion"], 0))
    n_to = int(users_per_religion_B.get(row["to_religion"], 0))
    # Product of the two group sizes. When both religions are the same this is
    # n * n, i.e. it counts ordered pairs and pairs of a user with themselves.
    # NOTE(review): the previous version assigned this value twice with an
    # identical expression; the second assignment was presumably meant to use
    # `homes_G` (Gowalla) - confirm and add it if so.
    total_possible_B = n_from * n_to
    print(total_possible_B)

0 from_religion    Christianity
to_religion      Christianity
count                   82186
Name: 0, dtype: object
2084652964
1 from_religion        Buddhism
to_religion      Christianity
count                    9311
Name: 1, dtype: object
165053670
2 from_religion    Christianity
to_religion             Islam
count                    5892
Name: 2, dtype: object
34471790
3 from_religion    Christianity
to_religion          Hinduism
count                    1220
Name: 3, dtype: object
12145028
4 from_religion        Buddhist
to_religion      Christianity
count                    1187
Name: 4, dtype: object
5615934
5 from_religion    Islam
to_religion      Islam
count              974
Name: 5, dtype: object
570025
6 from_religion      Christianity
to_religion      Roman Catholic
count                       915
Name: 6, dtype: object
2648164
7 from_religion    Buddhism
to_religion      Buddhism
count                 619
Name: 7, dtype: object
13068225
8 from_religion    Buddhism
to_relig

### Language Information DataFrame

In [182]:
# Number of international friendships per ordered (language, friend_language) pair.
lang_count = international.groupby(['language', 'friend_language']).size()

# Put the two labels of every pair in alphabetical order so that (A, B) and
# (B, A) fall into the same bucket. `Series.items()` replaces
# `Series.iteritems()`, which was deprecated in pandas 1.5 and removed in 2.0.
lang_pairs = [(*sorted(pair), n) for pair, n in lang_count.items()]

# Sum the counts of the now-identical pairs and rank them, most frequent first.
# After the groupby the pairs are already ordered, so no second sorting pass
# over the labels is needed.
lang_count_df = (
    pd.DataFrame(lang_pairs, columns=['from_language', "to_language", "count"])
    .groupby(["from_language", "to_language"])['count']
    .sum()
    .reset_index()
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
)

lang_count_df

Unnamed: 0,from_language,to_language,count
0,English,English,34503
1,English,German,8853
2,English,Swedish,5881
3,Dutch,English,5034
4,English,Japanese,4277
...,...,...,...
701,Japanese,Slovene,1
702,Finnish,Mandarin,1
703,Cebuano,Punjabi,1
704,Hebrew,Mandarin,1
