In [1]:
from scipy import stats
import networkx as nx 
import pandas as pd
import numpy as np 
from numpy import inf
import warnings
from collections import Counter
warnings.filterwarnings("ignore")

In [2]:
def normalize_series(series):
    """ Replace every non-zero value of a Pandas Series by its relative frequency.

    Occurrences of each distinct value are tallied, the tally for the value 0
    is discarded, and every remaining value is mapped to
    ``count / total_non_zero_count``. Zeros are left as they are.

    :param series: Series to be normalized
    :return normalized_series: Result of ``series.replace`` with that mapping
    """
    counts = Counter(series)
    # Zeros are excluded from the denominator and are never replaced.
    counts.pop(0, None)
    total = float(sum(counts.values()))
    relative_freq = {}
    for value, count in counts.items():
        relative_freq[value] = count / total
    return series.replace(relative_freq)

def test_if__from_same_distribution(data, attribute):
    """ Compares male and female distribution and returns a test value with a sign:
        *** if p-value <=0.001
        **  if p-value <=0.01
        *   if P-value <= 0.05
        !   if p-value > 0.05
    :param data: Pandas dataframe
    :param atribute: Column name for the attribute to be compared
    :return String: test score with significance
    """
    males = data[data["gender"] == "male"]
    females = data[data["gender"] == "female"]
    d, p = stats.ks_2samp(normalize_series(males[attribute]), normalize_series(females[attribute]))
#     print(d,p)
    if p<=0.001:
        temp = '***' 
    elif p<=0.01:
        temp = '**'
    elif p<=0.05:
        temp =  '*'
    else:
        temp = '!'
    return "{}{}".format(round(d,4),temp)

def graph_to_df(net_stats_path, efficiency_path, year, country, globe=True):
    """ Reads a pickled graph and returns its nodes as a dataframe

    One row is produced per node: the node id (column "id") joined with the
    node's attribute dictionary, then merged on "id" with the efficiency CSV
    for the same year.

    :param net_stats_path: Directory holding the pickled graphs
    :param efficiency_path: Prefix of the "<year>_12_eff.csv" file; it is
        concatenated as-is, so it must end with a path separator
    :param year: Year used in the file names
    :param country: Sub-directory of the country graph (only used when
        ``globe`` is not True)
    :param globe: True reads "<net_stats_path>/<year>_12_dir", otherwise
        "<net_stats_path>/<country>/<year>_12_dir_dir" is read
    :return data: Pandas dataframe of node attributes merged with efficiency
    """
    if globe != True:
        graph_file = "{}/{}/{}_12_dir_dir".format(net_stats_path, country, year)
    else:
        graph_file = "{}/{}_12_dir".format(net_stats_path, year)

    # NOTE(security): gpickle files are Python pickles and execute arbitrary
    # code when loaded; only read files from a trusted source.
    G = nx.read_gpickle(graph_file)

    # G.nodes(data=True) yields (node, attribute_dict) pairs on both
    # NetworkX 1.x and 2.x, whereas nodes_iter() only exists on 1.x.
    id_lst = []
    prop_lst = []
    for node_id, node_props in G.nodes(data=True):
        id_lst.append(node_id)
        prop_lst.append(node_props)

    ids = pd.DataFrame(id_lst)
    props = pd.DataFrame(prop_lst)
    data = ids.join(props).rename(columns={0: 'id'})
    eff = pd.read_csv("{}{}_12_eff.csv".format(efficiency_path, year))
    data = data.merge(eff, on="id")
    return data

def normalize(data_frame, columns):
    """
    Log-transform the given columns of a DataFrame.

    Each listed column is overwritten with the output of ``log_colum_values``,
    so the DataFrame passed in is modified in place.
    :param data_frame: A pandas DataFrame to be processed
    :param columns: List of names of the columns to transform
    :returns data_frame: The same DataFrame object, with the columns replaced
    """
    for name in columns:
        logged = log_colum_values(data_frame[name])
        data_frame[name] = logged
    return data_frame

def log_colum_values(series_col):
    """
    Apply ``log_value`` to every entry of a column.

    The column minimum is computed once and handed to ``log_value`` together
    with each entry.
    :param series_col: Column of dataframe
    :returns list: Transformed values, in the original order
    """
    lowest = series_col.min()
    logged = []
    for raw in series_col.values:
        logged.append(log_value(raw, lowest))
    return logged

def log_value(x, min_val):
    """
    Logs a value, mapping 0 to 0 and shifting negative values into the
    domain of the logarithm
    :param x: value
    :param min_val: min value in list; must not exceed ``x`` for the shift of
        negative values to stay positive
    :returns value: ``log(x)`` for x > 0, ``0`` for x == 0,
        ``log(x + 1 - min_val)`` for x < 0 and NaN when x is NaN
    """
    if x > 0:
        return np.log(x)
    if x == 0:
        # log(0) is -inf; it is replaced by 0 without evaluating the log,
        # which avoids a divide-by-zero RuntimeWarning.
        return 0
    if x < 0:
        # Shift so that the minimum of the column maps to log(1) == 0.
        return np.log(x + 1 - min_val)
    # Only NaN fails all three comparisons; propagate it as NaN rather than
    # falling off the end and returning None.
    return np.nan
def print_latex(df, col_format=None):
    """ Prints the latex syntax equivalent to the passed dataframe

    Values are rounded to 2 decimals, the booktabs rules emitted by
    ``to_latex`` (toprule / midrule / bottomrule) are turned into ``hline``
    and an ``\\hline`` is inserted after every table row.

    :param df: Pandas dataframe
    :col_format : String indicating the format of columns. When omitted, one
        centred, ruled column per dataframe column is used (``|c|c|...|``),
        which equals the previous fixed default for five columns.
    """
    if col_format is None:
        # A fixed five-column spec yields an invalid tabular for frames with
        # a different number of columns, so derive it from the frame.
        col_format = "|" + "c|" * len(df.columns)

    df = df.round(2)
    latex = df.to_latex(column_format=col_format, index=False)
    for rule in ("toprule", "midrule", "bottomrule"):
        latex = latex.replace(rule, "hline")
    # Add a rule after each row terminator, then collapse the doubled rules
    # this creates next to the former top/mid/bottom rules.
    latex = latex.replace("\\\\\n", "\\\\\n\\hline")
    latex = latex.replace("\\hline\\hline", "\\hline")
    print(latex)

In [3]:
# Directory of the per-country graph pickles; passed as `path_graphs` to table_for_country.
path = "data/filtered_graphs/"
# Prefix of the per-year efficiency CSV files; passed as `path_eff` to both table functions.
path_eff="gt_efficiency/"
# Directory of the global graph pickles; passed as `path_graphs` to table_global.
path_pickled_data="data/filtered_graphs/all"
# Years iterated (as a module-level global) by table_for_country and table_global.
year_lst = [2003,2006,2008,2011,2013,2016]
# NOTE(review): not referenced by the code visible in this notebook; the USA table is
# requested with "us", not "usa" - confirm which code matches the directory name.
country_list = ["usa", "de", "fr", "gb", "ru"]

In [4]:
# country_dict = {
#     "USA": "_american",
#     "Germany": "_german",
#     "France": "_french",
#     "GB": "_british",
#     "Russia": "_russian",
# }

In [5]:
# data = graph_to_df("data/filtered_graphs/","gt_efficiency/", 2016, "ru")
# data = normalize(data, ["eig_central","in_degree","k_core","out_degree","efficiency"])

In [6]:
# country_list = ["usa", "de", "fr", "gb", "ru"]

In [7]:
# test_if__from_same_distribution(data, "k_core")
#then we cannot reject the hypothesis that the distributions of the two samples are the same

In [8]:
# test_if__from_same_distribution(data, "in_degree")
#then we cannot reject the hypothesis that the distributions of the two samples are the same

In [9]:
# test_if__from_same_distribution(data, "out_degree")
#then we cannot reject the hypothesis that the distributions of the two samples are the same

In [10]:
# test_if__from_same_distribution(data, "efficiency")
#then we cannot reject the hypothesis that the distributions of the two samples are the same

# D test for each country over time 

In [11]:
def table_for_country(path_graphs,path_eff, country):
    """ Returns DataFrame of test score changes over time

    For every year in the module-level ``year_lst`` the country graph is
    loaded, the network statistics are log-transformed and the male/female
    distributions of each statistic are compared.

    :param path_graphs: path to the graphs
    :param path_eff: path to efficiency csv
    :param country: "us" - USA, "gb" - Great Britain etc.
    :return DataFrame: one row per year, with a "year" column followed by one
        test-score string per network statistic
    """
    # Defined before the loop: it does not change between years and is also
    # needed for the column names when ``year_lst`` is empty.
    net_stats = ["eig_central","in_degree","k_core","out_degree","efficiency"]
    rows = []
    for year in year_lst:
        data = graph_to_df(path_graphs, path_eff, year, country, globe=False)
        data = normalize(data, net_stats)
        scores = [test_if__from_same_distribution(data, stat) for stat in net_stats]
        rows.append([year] + scores)
    return pd.DataFrame(rows, columns=["year"]+net_stats)

In [12]:
# Per-year male/female test scores for country code "ru".
table_for_country(path, path_eff, "ru")

Unnamed: 0,year,eig_central,in_degree,k_core,out_degree,efficiency
0,2003,nan!,nan!,nan!,nan!,nan!
1,2006,0.6364***,0.3004!,0.2806!,0.2727!,0.3089!
2,2008,0.6667***,0.286*,0.2917*,0.2917*,0.3191*
3,2011,0.6765***,0.1734!,0.3529***,0.2059!,0.4871***
4,2013,0.7105***,0.1316!,0.3158**,0.2105!,0.4671***
5,2016,0.6444***,0.1333!,0.2222*,0.4***,0.4132***


In [13]:
# Per-year male/female test scores for country code "de".
table_for_country(path, path_eff, "de")

Unnamed: 0,year,eig_central,in_degree,k_core,out_degree,efficiency
0,2003,1.0***,0.2792!,0.3014!,0.2857!,0.2857!
1,2006,0.8684***,0.2105!,0.3158**,0.2632*,0.272**
2,2008,0.7101***,0.2174**,0.2319**,0.2436***,0.2455***
3,2011,0.6991***,0.1681**,0.2212***,0.2124***,0.3531***
4,2013,0.7252***,0.1908***,0.2061***,0.2214***,0.3002***
5,2016,0.7661***,0.117*,0.193***,0.1928***,0.3***


In [14]:
# Per-year male/female test scores for country code "us".
# NOTE(review): country_list spells this entry "usa" - confirm the intended code.
table_for_country(path, path_eff, "us")

Unnamed: 0,year,eig_central,in_degree,k_core,out_degree,efficiency
0,2003,0.7318***,0.2162!,0.3784***,0.2162!,0.2567*
1,2006,0.7468***,0.1477***,0.2278***,0.2025***,0.1907***
2,2008,0.7784***,0.1504***,0.1998***,0.2137***,0.2663***
3,2011,0.7753***,0.1704***,0.1835***,0.1873***,0.2774***
4,2013,0.7855***,0.1532***,0.1984***,0.2016***,0.2528***
5,2016,0.8128***,0.1522***,0.2329***,0.1774***,0.2435***


In [15]:
# Per-year male/female test scores for country code "gb".
table_for_country(path, path_eff, "gb")

Unnamed: 0,year,eig_central,in_degree,k_core,out_degree,efficiency
0,2003,0.6667!,0.16!,0.6667!,0.3333!,0.2!
1,2006,0.8846***,0.1538!,0.3846**,0.1746!,0.1746!
2,2008,0.8378***,0.1081!,0.2703*,0.1892!,0.284**
3,2011,0.7778***,0.1897!,0.2593**,0.3495***,0.3223***
4,2013,0.7833***,0.2*,0.3***,0.2*,0.3369***
5,2016,0.8333***,0.2424**,0.4242***,0.1667!,0.2904***


In [16]:
# Per-year male/female test scores for country code "fr".
table_for_country(path, path_eff, "fr")

Unnamed: 0,year,eig_central,in_degree,k_core,out_degree,efficiency
0,2003,1.0*,0.5!,0.5!,0.5!,0.5!
1,2006,0.8293***,0.3431***,0.2195*,0.2927**,0.2869**
2,2008,0.9014***,0.2241**,0.2254**,0.3099***,0.2986***
3,2011,0.8034***,0.1911***,0.1795**,0.2137***,0.3047***
4,2013,0.7609***,0.1582**,0.1884***,0.238***,0.3207***
5,2016,0.8084***,0.1856***,0.188***,0.1536**,0.3502***


# D test Global

In [17]:
def table_global(path_graphs,path_eff):
    """ Returns DataFrame of test score changes over time for the global graph

    For every year in the module-level ``year_lst`` the global graph is
    loaded, the network statistics are log-transformed and the male/female
    distributions of each statistic are compared.

    :param path_graphs: path to the graphs
    :param path_eff: path to efficiency csv
    :return DataFrame: one row per year, with a "year" column followed by one
        test-score string per network statistic
    """
    # Defined before the loop: it does not change between years and is also
    # needed for the column names when ``year_lst`` is empty.
    net_stats = ["eig_central","in_degree","k_core","out_degree","efficiency"]
    rows = []
    for year in year_lst:
        data = graph_to_df(path_graphs, path_eff, year, None, globe=True)
        data = normalize(data, net_stats)
        scores = [test_if__from_same_distribution(data, stat) for stat in net_stats]
        rows.append([year] + scores)
    return pd.DataFrame(rows, columns=["year"]+net_stats)

In [18]:
# Per-year male/female test scores computed on the global graphs.
table_global(path_pickled_data, path_eff)

Unnamed: 0,year,eig_central,in_degree,k_core,out_degree,efficiency
0,2003,0.72***,0.176**,0.312***,0.176**,0.1512**
1,2006,0.7192***,0.1585***,0.2462***,0.1939***,0.0668***
2,2008,0.7445***,0.1505***,0.2399***,0.2006***,0.0633***
3,2011,0.7532***,0.1555***,0.2342***,0.1989***,0.0539***
4,2013,0.7571***,0.1611***,0.2367***,0.2033***,0.0533***
5,2016,0.7841***,0.1618***,0.2202***,0.1981***,0.0557***


In [19]:
# If the K-S statistic is small or the p-value is high, we cannot reject the null hypothesis
# that the two samples are drawn from the same distribution.

In [20]:
# Recomputes the global table and prints it as LaTeX source.
print_latex(table_global(path_pickled_data, path_eff))

\begin{tabular}{|c|c|c|c|c|}
\hline
 year & eig\_central &  in\_degree &     k\_core & out\_degree & efficiency \\
\hline
 2003 &     0.72*** &    0.176** &   0.312*** &    0.176** &   0.1512** \\
\hline 2006 &   0.7192*** &  0.1585*** &  0.2462*** &  0.1939*** &  0.0668*** \\
\hline 2008 &   0.7445*** &  0.1505*** &  0.2399*** &  0.2006*** &  0.0633*** \\
\hline 2011 &   0.7532*** &  0.1555*** &  0.2342*** &  0.1989*** &  0.0539*** \\
\hline 2013 &   0.7571*** &  0.1611*** &  0.2367*** &  0.2033*** &  0.0533*** \\
\hline 2016 &   0.7841*** &  0.1618*** &  0.2202*** &  0.1981*** &  0.0557*** \\
\hline
\end{tabular}

