In [5]:
import networkx as nx 
import numpy as np
import pandas as pd 
import community 
from itertools import compress
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import fcluster
from scipy.cluster import hierarchy
import matplotlib.pyplot as plt 
import seaborn as sns
from seaborn import color_palette, set_style, palplot
plt.style.use('ggplot')
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

## Export the 2009 and 2015 data for Gephi

#### 1. First, define some functions

In [6]:
def preprocess(df):
    """Turn the raw bilateral positions into percentage edge weights.

    Steps:
      1. keep the identifier columns and the three position columns
         (``CDIS_IAD``, ``CPIS_IAP``, ``loans_dep``);
      2. replace missing values with 0 and clamp negative numbers to 0;
      3. add ``total`` = sum of the three position columns;
      4. express every position as a percentage of the reporting country's
         total for that year (grouped by ``countrycode`` and ``year``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns listed in ``id_cols`` and ``flow_cols``.
        The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        Same rows and index as ``df`` with the identifier columns followed by
        ``CDIS_weight``, ``CPIS_weight``, ``loans_dep_weight`` and
        ``total_weight``.  Rows whose group total is 0 get a weight of 0.
    """
    id_cols = ['countrycode', 'counterpart_code', 'country', 'counterpart', 'year']
    flow_cols = ['CDIS_IAD', 'CPIS_IAP', 'loans_dep']
    var_org = flow_cols + ['total']
    var_weight = ['CDIS_weight', 'CPIS_weight', 'loans_dep_weight', 'total_weight']

    # .replace returns a new frame, so everything below works on our own copy
    df = df[id_cols + flow_cols].replace(np.nan, 0)     ## keep only used variables, turn na to zero

    # Clamp negatives with public API; writing through the private
    # _get_numeric_data() only works when it happens to return a view.
    for col in df.select_dtypes(include=[np.number]).columns:
        df[col] = df[col].clip(lower=0)                 ## turn negative to zero

    df['total'] = df[flow_cols].sum(axis=1)

    # like stata egen sum: per-row total of the reporter's group.
    # (The former *_Sum_in columns were computed but never used, so they are gone.)
    sums_out = df.groupby(['countrycode', 'year'])[var_org].transform('sum')

    # 0/0 for groups without any positive position is expected; silence the
    # numpy warning and map the resulting NaN to 0 right after.
    with np.errstate(divide='ignore', invalid='ignore'):
        weights = (df[var_org].values / sums_out.values) * 100
    weights[np.isnan(weights)] = 0

    # Assign plain arrays (positional) so a non-default index on the input
    # cannot misalign the weights with their rows.
    out = df[id_cols].copy()
    for name, column in zip(var_weight, weights.T):
        out[name] = column

    return out

def export_gephi(df,year,var):
    """Build the directed network for one year / weight variable and write it as GEXF.

    Rows of ``df`` for ``year`` with a strictly positive ``var`` become edges
    ``country -> counterpart`` carrying ``var`` as edge attribute.  Node
    attributes are then added by ``get_hierarchy_cluster``,
    ``get_nx_community`` and ``get_eigen_centrality`` before the graph is
    written to ``../result/gexf/<var><year>.gexf``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``year``, ``country``, ``counterpart`` and ``var``.
    year : value compared against ``df['year']``
    var : str
        Name of the weight column to export.

    Returns
    -------
    networkx.DiGraph
        The graph that was written to disk.

    Raises
    ------
    ValueError
        If no row of ``year`` has a positive ``var`` (there would be no edges).
    """
    ## clean the data first
    # chained, non-inplace calls: no in-place write on a filtered slice
    df_y = df[df['year'] == year].fillna(0)
    df_y = df_y[df_y[var] > 0]
    if df_y.empty:
        # fail here with a clear message instead of deep inside the helpers
        raise ValueError("no positive '%s' edges for year %s" % (var, year))

    # networkx 2.1 renamed from_pandas_dataframe to from_pandas_edgelist;
    # pick whichever the installed version provides.
    build_graph = getattr(nx, 'from_pandas_edgelist', None) or nx.from_pandas_dataframe
    G = build_graph(df_y, source="country", target="counterpart",
                    edge_attr=[var], create_using=nx.DiGraph())

    get_hierarchy_cluster(G,var)                                   ## add hierarchy_cluster to node attribute
    get_nx_community(G,var)                                        ## add nx community detection to node attribute
    get_eigen_centrality(G,var)                                    ## add eigenvector centrality to node attribute
    nx.write_gexf(G, "../result/gexf/"+var+str(year)+".gexf")
    return G

def get_hierarchy_cluster(G,var,n_clusters=6):
    """Attach an average-linkage cluster id to every node as ``hierarchy_cluster``.

    Each node is described by its row of the weighted adjacency matrix
    (edge attribute ``var``).  Rows are compared with cosine distance, the
    distances are clustered with average linkage, and the tree is cut into at
    most ``n_clusters`` flat clusters.  Cluster ids start at 0.

    Parameters
    ----------
    G : networkx graph
        Modified in place: the node attribute ``hierarchy_cluster`` is set.
    var : str
        Edge attribute used as weight.
    n_clusters : int, optional
        Maximum number of flat clusters (``fcluster(..., criterion='maxclust')``).
        Defaults to 6, the value that used to be hard-coded.

    Notes
    -----
    The condensed distance vector from ``pdist`` is passed to
    ``hierarchy.average`` directly.  Passing the square matrix (as an earlier
    version did) makes scipy treat it as raw observations and cluster on the
    Euclidean distance between rows of the distance matrix instead of on the
    cosine distances themselves, so cluster ids can differ from older exports.
    """
    # sorted() works whether nodes() returns a list (networkx 1.x) or a view (2.x+)
    node_list = sorted(G.nodes())

    if len(node_list) < 2:
        # nothing to compare; linkage needs at least two observations
        hierarchy_cluster = {node: 0 for node in node_list}
    else:
        # to_numpy_matrix was removed in networkx 3.0 in favour of to_numpy_array
        to_dense = getattr(nx, 'to_numpy_array', None) or nx.to_numpy_matrix
        A = np.asarray(to_dense(G, nodelist=node_list, weight=var))

        D = pdist(A, 'cosine')          # condensed vector of pairwise cosine distances
        D[np.isnan(D)] = 1              # undefined distance (all-zero row) -> treat as maximally dissimilar
        Z = hierarchy.average(D)
        clusters = fcluster(Z, n_clusters, criterion='maxclust')
        hierarchy_cluster = {node: int(label) - 1                  ## make it start from 0
                             for node, label in zip(node_list, clusters)}

    # keyword arguments: the positional order of name/values was swapped in networkx 2.0
    nx.set_node_attributes(G, name='hierarchy_cluster', values=hierarchy_cluster)

def get_nx_community(G,var):
    """Attach a Louvain community id to every node as ``nx_community``.

    algorithm: https://sites.google.com/site/findcommunities/
    package:   http://perso.crans.org/aynaud/communities/

    ``community.best_partition`` is run on an undirected graph built from the
    adjacency matrix plus its transpose, so each undirected edge weight is the
    sum of the incoming and outgoing ``var`` weights between the two nodes.

    Parameters
    ----------
    G : networkx graph
        Modified in place: the node attribute ``nx_community`` is set.
    var : str
        Edge attribute used as weight.
    """
    # sorted() works whether nodes() returns a list (networkx 1.x) or a view (2.x+)
    node_list = sorted(G.nodes())

    # the *_matrix converters were removed in networkx 3.0 in favour of *_array
    to_dense = getattr(nx, 'to_numpy_array', None) or nx.to_numpy_matrix
    from_dense = getattr(nx, 'from_numpy_array', None) or nx.from_numpy_matrix

    ## use adj matrix + its transpose, so the edge will be the sum of in and out edge weight
    A = to_dense(G, nodelist=node_list, weight=var)
    ud_G = from_dense(A + A.T)

    ## relabel nodes (matrix positions 0..n-1) back to country names
    ud_G = nx.relabel_nodes(ud_G, dict(enumerate(node_list)))

    # NOTE(review): Louvain is randomized, so community ids can change between
    # runs; consider seeding it if the installed python-louvain supports that.
    l_community = community.best_partition(ud_G,weight='weight',resolution=1)

    # keyword arguments: the positional order of name/values was swapped in networkx 2.0
    nx.set_node_attributes(G, name='nx_community', values=l_community)
    
def get_eigen_centrality(G,var):
    """Attach eigenvector centrality to every node as ``eigenvector_centrality``.

    Parameters
    ----------
    G : networkx graph
        Modified in place: the node attribute ``eigenvector_centrality`` is set.
    var : str
        Edge attribute used as weight.
    """
    ## eigenvector centrality
    centrality = nx.eigenvector_centrality_numpy(G, weight=var)
    # keyword arguments: the positional order of name/values was swapped in networkx 2.0
    nx.set_node_attributes(G, name='eigenvector_centrality', values=centrality)
    

#### 2. Read the Stata data and export it

In [7]:
## load the pre-processed Stata file and convert it with preprocess()
df = preprocess(pd.read_stata('../data/0_CPIS_CDIS_BIS_USTIC_merged_fixed1.dta'))

In [8]:
## write one gephi file per (year, weight variable) pair
files = [(year, var)
         for year in (2009, 2015)
         for var in ('total_weight', 'loans_dep_weight', 'CDIS_weight', 'CPIS_weight')]
for x in files:
    (year, var) = x
    G = export_gephi(df, year, var)