In [30]:
import networkx as nx
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from scipy import stats
import os

def read_nodes(node_file, num_feas):

    '''
    Returns a list of tuples (node name, dict of features for that node)

    node_file = path to a '.feat' file; each non-blank line is a node name
    followed by one value per line of the matching '.featnames' file
    num_feas = length of the feature vector allocated for every node

    The '.featnames' path is derived by replacing '.feat' in node_file.
    The 4th whitespace-separated token of the k-th featnames line is the
    position in the feature vector that receives the k-th value of a node
    line. Positions that never receive a value stay at -1.

    Blank lines in either file are skipped. If a node line does not carry
    exactly one value per featnames line an error is printed and only the
    values that can be paired with a featnames line are stored.
    '''

    featname_file = node_file.replace('.feat', '.featnames')

    with open(node_file) as opened_node:
        node_file_lines = opened_node.readlines()

    with open(featname_file) as opened_featname:
        featname_file_lines = opened_featname.readlines()

    # The column -> feature-vector-position mapping is the same for every
    # node, so parse it once instead of re-splitting it per node and value.
    fea_indices = [int(featname_line.split()[3])
                   for featname_line in featname_file_lines
                   if featname_line.strip()]

    node_list = []
    for line in node_file_lines:
        split_line = line.split()
        if not split_line:
            # blank line: there is no node name to read
            continue
        node_name = split_line[0]
        values = split_line[1:]
        if len(values) != len(fea_indices):
            print('error: features recorded not equal features expected')
        node_feas = -np.ones(num_feas)
        # zip stops at the shorter sequence, so a malformed line cannot
        # index past the end of the featnames mapping
        for fea_index, value in zip(fea_indices, values):
            node_feas[fea_index] = value
        node_list.append((node_name, dict(features=node_feas)))

    return node_list

def read_ego_nodes(node_file, num_feas):

    '''
    Returns a list of tuples (node name=str, dict of features for that node)

    node_file = path to an '.egofeat' file; each non-blank line holds one
    value per line of the matching '.featnames' file (no leading node name)
    num_feas = length of the feature vector allocated for every node

    The node name is made of the digits found in the file name itself
    (directories are ignored, so a digit in a folder name cannot leak into
    the name). The '.featnames' path is derived by replacing '.egofeat' in
    node_file. The 4th whitespace-separated token of the k-th featnames line
    is the position in the feature vector that receives the k-th value.
    Positions that never receive a value stay at -1.

    Blank lines in either file are skipped. If a line does not carry exactly
    one value per featnames line an error is printed and only the values
    that can be paired with a featnames line are stored.
    '''

    featname_file = node_file.replace('.egofeat', '.featnames')

    with open(node_file) as opened_node:
        node_file_lines = opened_node.readlines()

    with open(featname_file) as opened_featname:
        featname_file_lines = opened_featname.readlines()

    # Only the file name identifies the ego node; it is the same for every
    # line of the file, so build it once.
    base_name = os.path.basename(node_file)
    node_name = ''.join(ch for ch in base_name if ch.isdigit())

    # Parse the column -> feature-vector-position mapping once.
    fea_indices = [int(featname_line.split()[3])
                   for featname_line in featname_file_lines
                   if featname_line.strip()]

    node_list = []
    for line in node_file_lines:
        split_line = line.split()
        if not split_line:
            # blank line: no feature values, so no node to create
            continue
        if len(split_line) != len(fea_indices):
            print('error: features recorded not equal features expected')
        node_feas = -np.ones(num_feas)
        for fea_index, value in zip(fea_indices, split_line):
            node_feas[fea_index] = value
        node_list.append((node_name, dict(features=node_feas)))

    return node_list

def read_edges(edge_file):

    '''
    Reads edge_file and returns one entry per line of the file: the list of
    whitespace-separated tokens on that line, e.g. [node1=str, node2=str]
    '''
    with open(edge_file) as handle:
        return [raw_line.split() for raw_line in handle]

def read_featnames(featname_file):
    '''
    Returns the feature ID of every line of featname_file, as strings.
    The ID is the 4th whitespace-separated token of the line.
    '''
    with open(featname_file) as handle:
        rows = handle.readlines()

    return [row.split()[3].strip() for row in rows]

def get_num_feas(featname_dir):
    '''
    Returns the number of distinct feature IDs found across every
    '.featnames' file directly inside featname_dir.

    featname_dir = directory to scan; a trailing path separator is optional
    '''
    # NOTE(review): callers in this file use the result as the length of a
    # feature vector that is indexed by feature ID, which only works if the
    # IDs are exactly 0..n-1 -- confirm against the data.
    distinct_ids = set()
    for file_name in os.listdir(featname_dir):
        if file_name.endswith('.featnames'):
            # os.path.join works whether or not featname_dir ends with a
            # separator, unlike plain string concatenation
            featname_path = os.path.join(featname_dir, file_name)
            distinct_ids.update(read_featnames(featname_path))

    return len(distinct_ids)

def create_graph(data_dir, graph_type='normal'):
    '''
    creates a graph from the data files in datadir.
    graph type options are 'normal', 'directed'

    data_dir = directory holding the data files; a trailing path separator
    is optional
    graph_type = 'normal' builds an nx.Graph, 'directed' builds an nx.DiGraph

    Files are picked up by suffix: '.feat' and '.egofeat' files contribute
    nodes with a 'features' attribute, files ending in 'combined.txt'
    contribute edges. Other files are ignored.

    Raises ValueError for any other graph_type (instead of terminating the
    whole interpreter / notebook kernel).
    '''
    if graph_type == 'normal':
        graph = nx.Graph()
    elif graph_type == 'directed':
        graph = nx.DiGraph()
    else:
        raise ValueError('invalid graph type')

    num_feas = get_num_feas(data_dir)

    for file_name in os.listdir(data_dir):
        # os.path.join works whether or not data_dir ends with a separator
        file_path = os.path.join(data_dir, file_name)
        if file_name.endswith('.feat'):
            graph.add_nodes_from(read_nodes(file_path, num_feas))
        elif file_name.endswith('.egofeat'):
            graph.add_nodes_from(read_ego_nodes(file_path, num_feas))
        elif file_name.endswith('combined.txt'):
            graph.add_edges_from(read_edges(file_path))

    return graph
        
def create_feature_array(graph):
    '''
    returns a feature array (nodes, node_features)

    Row i holds the 'features' attribute of the i-th node in the order
    produced by graph.nodes(data=True). The number of columns is the length
    of the first node's feature vector. An empty graph gives an array of
    shape (0, 0).
    '''
    # Walk the node view once; rebuilding the full node list for every row
    # made the original quadratic in the number of nodes.
    node_features = [attrs['features'] for _, attrs in graph.nodes(data=True)]
    if not node_features:
        return np.zeros((0, 0))

    feature_array = np.zeros((len(node_features), len(node_features[0])))
    for row, features in enumerate(node_features):
        feature_array[row, :] = features

    return feature_array

def create_feature_array_v2(graph):
    '''
    creates a feature array (nodes, node_features*2) which takes into account edge information by averaging the
    feature values of all neighbors and concatenating it with the central node feature vector

    Row i corresponds to the i-th node in the order produced by
    graph.nodes(data=True): the first half of the row is that node's own
    'features' vector, the second half is the element-wise mean of the
    'features' vectors of the nodes in graph[node].

    A node without neighbors gets zeros in the second half (the original
    0/0 division produced NaN there). An empty graph gives an array of
    shape (0, 0).
    '''
    # Materialise the node data once; rebuilding the list for every row made
    # the original quadratic in the number of nodes.
    node_data = list(graph.nodes(data=True))
    if not node_data:
        return np.zeros((0, 0))

    # Plain dict lookup for neighbor attributes; avoids the `graph.node`
    # attribute, which newer networkx releases no longer provide.
    attrs_by_node = dict(node_data)

    num_features = len(node_data[0][1]['features'])
    feature_array = np.zeros((len(node_data), num_features*2))

    for row, (node_name, attrs) in enumerate(node_data):
        center_node_features = attrs['features']
        neighbors = list(graph[node_name])
        avg_neighbor_features = np.zeros(num_features)
        if neighbors:
            for neighbor in neighbors:
                avg_neighbor_features += attrs_by_node[neighbor]['features']
            avg_neighbor_features /= len(neighbors)
        feature_array[row, :] = np.concatenate((center_node_features, avg_neighbor_features))

    return feature_array

def run_log_reg(feature_array, feature_index, test_split=0.1, regularization='l2'):
    '''
    Fits a logistic regression that predicts one column of feature_array
    from all the other columns, then prints the Pearson correlation between
    the predictions on the held-out rows and their true values.

    feature_index = which feature (column) to predict
    test_split = what portion of data to use as a test set
    regularization = passed to LogisticRegression as its penalty argument
    '''
    target = feature_array[:, feature_index]
    predictors = np.delete(feature_array, feature_index, axis=1)

    split = train_test_split(predictors, target, test_size=test_split)
    train_inputs, test_inputs, train_target, test_target = split

    model = LogisticRegression(penalty=regularization)
    model.fit(train_inputs, train_target)
    predicted = model.predict(test_inputs)

    correlation = stats.pearsonr(predicted, test_target)[0]
    print('The Pearson correlation is: ', correlation)
    
# Build the undirected graph from the files under 'facebook_data/'.
fb_graph = create_graph('facebook_data/')
# Summary of the graph (name, type, node/edge counts, average degree).
print(nx.info(fb_graph))
# Adjacency of node '1': its neighbors and the attributes of each edge.
print(fb_graph['1'])
# One row per node: own features followed by the mean of the neighbors' features.
fb_feature_array = create_feature_array_v2(fb_graph)
# Predict feature column 2 from all remaining columns and print the correlation.
run_log_reg(fb_feature_array, 2)

# Same pipeline for the directed 'gplus/' data, using the plain
# (no neighbor averaging) feature array; currently disabled.
# gplus_graph = create_graph('gplus/', 'directed')
# print(nx.info(gplus_graph))
# gplus_feature_array = create_feature_array(gplus_graph)
# run_log_reg(gplus_feature_array, 2)


Name: 
Type: Graph
Number of nodes: 4039
Number of edges: 88234
Average degree:  43.6910
{'0': {}, '48': {}, '53': {}, '54': {}, '73': {}, '88': {}, '92': {}, '119': {}, '126': {}, '133': {}, '194': {}, '236': {}, '280': {}, '299': {}, '315': {}, '322': {}, '346': {}}




The Pearson correlation is:  0.947631387699498


In [33]:
def get_visibility(graph, feature_index, awards, contendors):
    '''
    Returns the visibility, computed as
    #positive / (#positive + #observing), where
      - a node is "positive" when its award equals its contendor value and
        its feature at feature_index is 1
      - the "observing" nodes are the distinct neighbors of positive nodes
        whose feature at feature_index is also 1
    Returns 0.0 when there is no positive node (the ratio is undefined).

    feature_index = the index of the protected feature to check
    awards = a list or array of length num_nodes which has a positive (1) or negative (0) assignment for each node,
    based on whether or not that node was labeled successful or not
    contendors = a list or array of length num_nodes which has a positive (1) or negative (0) assignment for each node,
    based on whether or not that node was qualified or not

    awards and contendors are indexed by node position in the order produced
    by graph.nodes(data=True).
    '''
    # NOTE(review): awards[i] == contendors[i] also holds when both are 0,
    # so correctly rejected nodes count as "positive" too -- confirm intent.

    # Materialise the node data once; rebuilding the list for every node
    # made the original quadratic in the number of nodes.
    node_data = list(graph.nodes(data=True))
    # Plain dict lookup for neighbor attributes; avoids the `graph.node`
    # attribute, which newer networkx releases no longer provide.
    attrs_by_node = dict(node_data)

    num_positive = 0
    observing_set = set()
    for i, (node_name, attrs) in enumerate(node_data):
        if (awards[i] == contendors[i]) and (attrs['features'][feature_index] == 1):
            num_positive += 1
            for neighbor in graph[node_name]:
                if attrs_by_node[neighbor]['features'][feature_index] == 1:
                    observing_set.add(neighbor)

    denominator = num_positive + len(observing_set)
    if denominator == 0:
        # observers are only collected from positive nodes, so this means
        # nothing qualified at all
        return 0.0
    return num_positive/denominator
    
def get_visibility_v2(graph, feature_index, awards, contendors):
    '''
    Returns the visibility, computed as
    #false_negative / (#false_negative + #observing), where
      - a node is a "false negative" when its award is 0, its contendor
        value is 1 and its feature at feature_index is 1
      - the "observing" nodes are the distinct neighbors of false-negative
        nodes whose feature at feature_index is also 1
    Returns 0.0 when there is no false-negative node (the ratio is undefined).

    feature_index = the index of the protected feature to check
    awards = a list or array of length num_nodes which has a positive (1) or negative (0) assignment for each node
    contendors = a list or array of length num_nodes which has a positive (1) or negative (0) assignment for each node,
    based on whether or not that node was qualified or not

    awards and contendors are indexed by node position in the order produced
    by graph.nodes(data=True).
    '''
    # Materialise the node data once; rebuilding the list for every node
    # made the original quadratic in the number of nodes.
    node_data = list(graph.nodes(data=True))
    # Plain dict lookup for neighbor attributes; avoids the `graph.node`
    # attribute, which newer networkx releases no longer provide.
    attrs_by_node = dict(node_data)

    num_false_negative = 0
    observing_set = set()
    for i, (node_name, attrs) in enumerate(node_data):
        if (awards[i] == 0 and contendors[i] == 1) \
        and (attrs['features'][feature_index] == 1):
            num_false_negative += 1
            for neighbor in graph[node_name]:
                if attrs_by_node[neighbor]['features'][feature_index] == 1:
                    observing_set.add(neighbor)

    denominator = num_false_negative + len(observing_set)
    if denominator == 0:
        # observers are only collected from false-negative nodes, so this
        # means there was no false negative at all
        return 0.0
    return num_false_negative/denominator


[ 0.  0.  0. ... -1. -1. -1.]
0.075
