In [137]:
import networkx as nx
import pandas as pd
import numpy as np
import simplejson as json
import sys
import os
from collections import defaultdict

In [148]:
#Loading the json file over here
# {
#   "villages": [1],
#   "nodes" : ["religion"],
#   "edges" : ["borrowmoney"],
#   "type" : "" #HH for households, nothing for individuals
# }

try:
    with open("config.json",'r') as config_file:
        config = json.load(config_file)
except:
    print "Config.json not found.. exitting"
    sys.exit()

print config

{'nodes': ['religion', 'caste', 'mothertongue', 'occupation', 'resp_gend'], 'villages': [1], 'type': u'', 'edges': ['allVillageRelationships']}


In [149]:
#Initializing the variables
networks = []
if config['type']=='HH':
    characterestic_file = 'demographics/household_characteristics.dta'
    prefix = 'HH'
else:
    characterestic_file = 'demographics/individual_characteristics.dta'
    prefix = ''

#load files and check if node heads exist or not
df = pd.read_stata(characterestic_file)
# print df.columns
print df.head()

   village  adjmatrix_key     pid  hhid  resp_id  resp_gend  \
0        1              5  100201  1002        1          1   
1        1              6  100202  1002        2          2   
2        1             23  100601  1006        1          1   
3        1             24  100602  1006        2          2   
4        1             27  100701  1007        1          1   

                   resp_status  age  religion caste        ...         \
0            Head of Household   38  HINDUISM   OBC        ...          
1  Spouse of Head of Household   27  HINDUISM   OBC        ...          
2            Head of Household   29  HINDUISM   OBC        ...          
3  Spouse of Head of Household   24  HINDUISM   OBC        ...          
4            Head of Household   58  HINDUISM   OBC        ...          

        privategovt work_outside work_outside_freq shgparticipate shg_no  \
0  PRIVATE BUSINESS          Yes                 0             No    NaN   
1                            N

In [153]:
#validate node properties
count = 0
for value in config['nodes']:
    if value in df.columns:
        count += 1
try:
    if count != len(config['nodes']):
        raise ValueError('Mismatch in nodes please check those properties')
except:
    sys.exit()
    
#validate edges
#Get possible relationships & village from edges
folder = 'network/Adjacency Matrices'    
files = os.listdir(folder)
possibleEdgeNames = set()
possibleVillages = set()
for file in files:
    split = file.split('_')
    possibleEdgeNames.add(split[1])
    possibleVillages.add(int(split[-1].split('.')[0]))

#validate edges in cofig   
count = 0
for value in config['edges']:
    if value in possibleEdgeNames:
        count += 1
try:
    if count != len(config['edges']):
        raise ValueError('Mismatch in edges please check those properties')
except ValueError as e:
    print e
    sys.exit()
    
    
#validate villages
count = 0
for value in config['villages']:
    if value in possibleVillages:
        count += 1
try:
    if count != len(config['villages']):
        raise ValueError('Mismatch in villages please check those properties')
except ValueError as e:
    print e
    sys.exit()   

In [154]:
#building data frames for networks, this can be passed to a function to make a network
output = 'output\edgeStata'
network_df = defaultdict(dict)
for village in config['villages']:
    #Figure out the keys in the villages
    key_file = 'network/Adjacency Matrix Keys/key'+prefix+'_vilno_'+str(village)+'.csv'
    with open(key_file) as key:
        keys = [x.strip() for x in iter(key)]
        #Figure out the matrix with row and col names
        for edge in config['edges']:
            vill_file = 'network/Adjacency Matrices/adj_'+edge+'_'+prefix+'_vilno_'+str(village)+'.csv'
            vill_file = vill_file.replace('__','_')
            #Build adjacency matrics by graphs in data frames by their names
            network_df[village][edge]  = pd.read_csv(vill_file,names=keys)
            network_df[village] [edge].index = keys
            network_df[village][edge].to_stata(output+"\\"+str(village)+"_"+edge+".dta")

In [55]:
#build tables as asked for Kartik and Mohammed asked
# print df.head(n=5)
# print df['resp_gend'][0]

   village  adjmatrix_key     pid  hhid  resp_id  resp_gend  \
0        1              5  100201  1002        1          1   
1        1              6  100202  1002        2          2   
2        1             23  100601  1006        1          1   
3        1             24  100602  1006        2          2   
4        1             27  100701  1007        1          1   

                   resp_status  age  religion caste        ...         \
0            Head of Household   38  HINDUISM   OBC        ...          
1  Spouse of Head of Household   27  HINDUISM   OBC        ...          
2            Head of Household   29  HINDUISM   OBC        ...          
3  Spouse of Head of Household   24  HINDUISM   OBC        ...          
4            Head of Household   58  HINDUISM   OBC        ...          

        privategovt work_outside work_outside_freq shgparticipate shg_no  \
0  PRIVATE BUSINESS          Yes                 0             No    NaN   
1                            N

In [None]:
from itertools import combinations
network_pairs_df = defaultdict(dict)
for village in config['villages']:
    #create villages subset
    village_subset = df[df['village']==village]
    #get unique hhids or pids depending upon the configuration
    if config['type'] == 'HH':
        unique_id = village_subset['hhid'].unique()
    else:
        unique_id = village_subset['pid'].unique()
    unique_id_pairs = [item for item in combinations(unique_id,2)]
    #create the dataframe for each edge property
    for edge in config['edges']:
        columns = [edge] + config['nodes']
        network_pairs_df[village][edge] = pd.DataFrame(data=0,index=unique_id_pairs,columns=columns)
        #Append rows accordingly by tuples
        for pair in unique_id_pairs:
            #this is taken directly from the df we created earlier
            network_pairs_df[village][edge][edge][pair] = network_df[village][edge].loc[str(pair[0]),str(pair[1])]
            for node in config['nodes']:
                #network_pairs_df[village][edge][node][pair] = 0#(village_subset[node][pair[0]])#, village_subset[node][pair[1]])
                if config['type']=='HH':
                    network_pairs_df[village][edge][node][pair] =  (village_subset[village_subset['hhid']==pair[0]][node].to_dict().values()[0],
                                                                    village_subset[village_subset['hhid']==pair[1]][node].to_dict().values()[0])
                else:
                    p1 = village_subset[village_subset['pid']==pair[0]][node].to_dict().values()[0] #Hindu
                    p2 = village_subset[village_subset['pid']==pair[1]][node].to_dict().values()[0] #Muslim
                    network_pairs_df[village][edge][node][pair] = str(p1)+","+str(p2)
#                     network_pairs_df[village][edge][node][pair] =  (p1,p2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
output = 'output\pair\'
for village,edge in network_pairs_df.items():
    print village
    for k, paired_frame in edge.items():
        paired_frame.to_stata(output+str(village)+"_"+edge+"_.dta")
        paired_frame.to_csv(output+str(village)+"_"+edge+"_.csv")