In [28]:
from scipy import stats
import networkx as nx 
import pandas as pd
import numpy as np 
from numpy import inf
import warnings
from collections import Counter
warnings.filterwarnings("ignore")

In [88]:
def test_if__from_same_distribution(data, attribute):
    """ Compares male and female distribution and returns a test value with a sign:
        *** if p-value <=0.001
        **  if p-value <=0.01
        *   if P-value <= 0.05
        !   if p-value > 0.05
    :param data: Pandas dataframe
    :param atribute: Column name for the attribute to be compared
    :return String: test score with significance
    """
    males = data[data["gender"] == "male"]
    females = data[data["gender"] == "female"]
    d, p = stats.ks_2samp(males[attribute], females[attribute])
#     print(d,p)
    if p<=0.001:
        temp = '***' 
    elif p<=0.01:
        temp = '**'
    elif p<=0.05:
        temp =  '*'
    else:
        temp = '!'
    return "{}{}".format(round(d,4),temp)

def graph_to_df(net_stats_path, efficiency_path, year, country, globe=True):
    """ Reads a pickled graph and returns its nodes as a dataframe

    :param net_stats_path: Folder that holds the pickled graphs
    :param efficiency_path: Prefix of the efficiency csv files. It is joined
        directly with "<year>_12_eff.csv", so it has to end with a path separator
    :param year: Year used in the graph and csv file names
    :param country: Country sub-folder; only used when globe is false
    :param globe: If true (the default) "<net_stats_path>/<year>_12_dir" is
        loaded and country is ignored, otherwise
        "<net_stats_path>/<country>/<year>_12_dir_dir" is loaded
    :return DataFrame: One row per node with an "id" column and one column per
        node attribute, merged with the efficiency csv on "id"
    """
    if globe:
        graph_file = "{}/{}_12_dir".format(net_stats_path, year)
    else:
        graph_file = "{}/{}/{}_12_dir_dir".format(net_stats_path, country, year)

    # NOTE: read_gpickle unpickles the file, which can execute arbitrary code;
    # only load graphs that come from a trusted source.
    G = nx.read_gpickle(graph_file)

    # nodes(data=True) yields the same (node, attribute dict) pairs as the
    # nodes_iter(data=True) call it replaces; nodes_iter no longer exists in
    # NetworkX 2.0 and later. Materialise once instead of iterating twice.
    nodes = list(G.nodes(data=True))
    ids = pd.DataFrame([node for node, _ in nodes])
    props = pd.DataFrame([attrs for _, attrs in nodes])
    data = ids.join(props).rename(columns={0: 'id'})

    eff = pd.read_csv("{}{}_12_eff.csv".format(efficiency_path, year))
    return data.merge(eff, on="id")

def normalize(data_frame, columns):
    """
    Log-transform the given columns of a dataframe.
    The dataframe is modified in place; the same object is returned.
    :param data_frame: A pandas DataFrame to be processed
    :param columns: List of names of the columns to transform
    :returns data_frame: The same DataFrame with the listed columns replaced
    """
    for name in columns:
        logged = log_colum_values(data_frame[name])
        data_frame[name] = logged
    return data_frame

def log_colum_values(series_col):
    """
    Log-transform a column value by value.
    :param series_col: Column of a dataframe
    :returns list: The transformed values, in the order of the column
    """
    # The column minimum is handed to log_value for every element.
    smallest = series_col.min()
    logged = []
    for raw in series_col.values:
        logged.append(log_value(raw, smallest))
    return logged

def log_value(x, min_val):
    """
    Logs a value, mapping 0 to 0 and shifting negative values before the log
    :param x: value
    :param min_val: min value in list
    :returns value: np.log(x) for x > 0, 0 for x == 0,
        np.log(x + 1 - min_val) for x < 0 and NaN when x is NaN
    """
    if x > 0:
        return np.log(x)
    # Compare with 0 directly rather than evaluating np.log(0) and checking for
    # -inf, which triggers a divide-by-zero RuntimeWarning.
    if x == 0:
        return 0
    if x < 0:
        return np.log(x + 1 - min_val)
    # All three comparisons are False only for NaN; return NaN explicitly
    # instead of falling through and returning None.
    return np.nan

In [91]:
# Input locations, relative to the notebook's working directory.
path = "data/filtered_graphs/"                  # passed to table_for_country
path_pickled_data = "data/filtered_graphs/all"  # passed to table_global
path_eff = "gt_efficiency/"                     # prefix of the "<year>_12_eff.csv" files

# Years iterated over by the table_* functions.
year_lst = [
    2003, 2006, 2008,
    2011, 2013, 2016,
]

# NOTE(review): this list spells the USA "usa" while the table_for_country cell
# passes "us"; the list is not used by the visible cells - confirm which is right.
country_list = ["usa", "de", "fr", "gb", "ru"]

In [59]:
# country_dict = {
#     "USA": "_american",
#     "Germany": "_german",
#     "France": "_french",
#     "GB": "_british",
#     "Russia": "_russian",
# }

In [60]:
# Load the 2016 graph for country "ru" and log-transform its network statistics.
# globe=False is required: graph_to_df defaults to globe=True, which ignores the
# country argument and loads "<path>/<year>_12_dir" instead of the country graph.
data = graph_to_df(path, path_eff, 2016, "ru", globe=False)
data = normalize(data, ["eig_central","in_degree","k_core","out_degree","efficiency"])

In [61]:
# country_list = ["usa", "de", "fr", "gb", "ru"]

In [62]:
test_if__from_same_distribution(data, "k_core")
# A trailing "!" means p > 0.05: we cannot reject the hypothesis that the male and
# female k_core distributions are the same.

'0.1154!'

In [63]:
test_if__from_same_distribution(data, "in_degree")
# A trailing "!" means p > 0.05: we cannot reject the hypothesis that the male and
# female in_degree distributions are the same.

'0.1224!'

In [64]:
test_if__from_same_distribution(data, "out_degree")
# A trailing "!" means p > 0.05: we cannot reject the hypothesis that the male and
# female out_degree distributions are the same.

'0.114!'

In [65]:
test_if__from_same_distribution(data, "efficiency")
# A trailing "!" means p > 0.05: we cannot reject the hypothesis that the male and
# female efficiency distributions are the same.

'0.0549!'

# Kolmogorov-Smirnov D statistic for each country over time

In [66]:
def table_for_country(path_graphs,path_eff, country):
    """ Returns DataFrame of test score changes over time for one country

    For every year in the module-level year_lst the country graph is loaded,
    its network statistics are log-transformed and each statistic is compared
    between genders with test_if__from_same_distribution.
    :param path_graphs: path to the graphs
    :param path_eff: path to efficiency csv
    :param country: "us" - USA, "gb" - Great Britain etc.
    :return DataFrame: one row per year, with a "year" column followed by one
        column of test scores per network statistic
    """
    # Defined before the loop so the column list exists even if year_lst is empty.
    net_stats = ["eig_central","in_degree","k_core","out_degree","efficiency"]
    rows = []
    for year in year_lst:
        # globe=False is essential: graph_to_df defaults to globe=True, which
        # ignores `country` and would load the same graph for every country.
        data = graph_to_df(path_graphs, path_eff, year, country, globe=False)
        data = normalize(data, net_stats)
        scores = [test_if__from_same_distribution(data, stat) for stat in net_stats]
        rows.append([year] + scores)
    return pd.DataFrame(rows, columns=["year"] + net_stats)

In [67]:
# Per-year results for country "ru"; each cell is the D statistic followed by
# its significance marker.
table_for_country(path, path_eff, "ru")

Unnamed: 0,year,eig_central,in_degree,k_core,out_degree,efficiency
0,2003,nan!,nan!,nan!,nan!,nan!
1,2006,0.227!,0.3106!,0.2902!,0.2942!,0.3089!
2,2008,0.1582!,0.286*,0.206!,0.1641!,0.0653!
3,2011,0.1253!,0.1734!,0.1317!,0.092!,0.1798!
4,2013,0.1183!,0.129!,0.1049!,0.1055!,0.0906!
5,2016,0.1172!,0.1224!,0.1154!,0.114!,0.0549!


In [68]:
# Per-year results for country "us"; each cell is the D statistic followed by
# its significance marker.
# NOTE(review): country_list spells this country "usa" - confirm which code is right.
table_for_country(path, path_eff, "us")

Unnamed: 0,year,eig_central,in_degree,k_core,out_degree,efficiency
0,2003,0.1414!,0.1601!,0.1132!,0.0826!,0.0739!
1,2006,0.0414!,0.0468!,0.0537!,0.0373!,0.0521!
2,2008,0.06!,0.0328!,0.0349!,0.0324!,0.0563!
3,2011,0.0213!,0.0315!,0.0239!,0.0256!,0.043!
4,2013,0.0405!,0.0293!,0.0318!,0.0321!,0.0281!
5,2016,0.0335!,0.0538!,0.0358!,0.0448!,0.0281!


In [69]:
# Per-year results for country "gb"; each cell is the D statistic followed by
# its significance marker.
table_for_country(path, path_eff, "gb")

Unnamed: 0,year,eig_central,in_degree,k_core,out_degree,efficiency
0,2003,0.3067!,0.16!,0.2667!,0.2533!,0.2!
1,2006,0.3314*,0.1509!,0.1953!,0.1746!,0.1746!
2,2008,0.1692!,0.0926!,0.1089!,0.1176!,0.0345!
3,2011,0.103!,0.1897!,0.1267!,0.1543!,0.1033!
4,2013,0.1057!,0.1497!,0.162!,0.1372!,0.112!
5,2016,0.0606!,0.154!,0.154!,0.1263!,0.1313!


In [70]:
# Per-year results for country "fr"; each cell is the D statistic followed by
# its significance marker.
table_for_country(path, path_eff, "fr")

Unnamed: 0,year,eig_central,in_degree,k_core,out_degree,efficiency
0,2003,0.54!,0.4!,0.38!,0.24!,0.27!
1,2006,0.18!,0.3431***,0.1961!,0.0797!,0.2122!
2,2008,0.1775*,0.2241**,0.1743*,0.1189!,0.184*
3,2011,0.1334*,0.1911***,0.1034!,0.0695!,0.0821!
4,2013,0.0832!,0.1582**,0.0733!,0.0487!,0.0952!
5,2016,0.1295*,0.1405**,0.0582!,0.0438!,0.0452!


# Kolmogorov-Smirnov D statistic for the global graph over time

In [92]:
def table_global(path_graphs,path_eff):
    """ Returns DataFrame of test score changes over time for the global graph

    For every year in the module-level year_lst the global graph is loaded,
    its network statistics are log-transformed and each statistic is compared
    between genders with test_if__from_same_distribution.
    :param path_graphs: path to the graphs
    :param path_eff: path to efficiency csv
    :return DataFrame: one row per year, with a "year" column followed by one
        column of test scores per network statistic
    """
    # Defined before the loop so the column list exists even if year_lst is empty.
    net_stats = ["eig_central","in_degree","k_core","out_degree","efficiency"]
    rows = []
    for year in year_lst:
        # No country applies here; graph_to_df ignores it when globe is true.
        data = graph_to_df(path_graphs, path_eff, year, None, globe=True)
        data = normalize(data, net_stats)
        scores = [test_if__from_same_distribution(data, stat) for stat in net_stats]
        rows.append([year] + scores)
    return pd.DataFrame(rows, columns=["year"] + net_stats)

In [93]:
# Per-year results for the global graph; each cell is the D statistic followed
# by its significance marker.
table_global(path_pickled_data, path_eff)

Unnamed: 0,year,eig_central,in_degree,k_core,out_degree,efficiency
0,2003,0.1123!,0.1065!,0.0608!,0.0304!,0.051!
1,2006,0.0247!,0.0831***,0.0616***,0.0411!,0.0668***
2,2008,0.04**,0.0671***,0.0716***,0.0577***,0.0635***
3,2011,0.0354***,0.0599***,0.0572***,0.0468***,0.0539***
4,2013,0.0417***,0.057***,0.0547***,0.0444***,0.0539***
5,2016,0.0399***,0.0527***,0.0407***,0.0309***,0.0405***
