In [33]:
import networkx as nx
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from scipy import stats
from collections import OrderedDict 
from operator import itemgetter
import os

def read_nodes(node_file, num_feas):

    '''
    Read the per-node feature rows of one ``.feat`` file.

    Every non-blank line of ``node_file`` has the form
    ``<node id> <value 0> <value 1> ...``. The k-th value belongs to the
    feature described by the k-th non-blank line of the sibling
    ``.featnames`` file; the fourth whitespace-separated token of that line
    is the position of the feature in the output vector.

    Parameters:
        node_file: path of the ``.feat`` file. The ``.featnames`` file is
            looked up under the same path with the suffix swapped.
        num_feas: length of every returned feature vector.

    Returns:
        A list of tuples (node name=int, node_feas=float np array of length
        num_feas). Positions that this file has no value for stay at -1.

    A message is printed (and the row is still processed as far as possible)
    when the number of values on a row differs from the number of feature
    names.
    '''

    # Swap only the trailing suffix so that a '.feat' occurring elsewhere in
    # the path (e.g. in a directory name) is left alone.
    suffix = '.feat'
    if node_file.endswith(suffix):
        featname_file = node_file[:-len(suffix)] + '.featnames'
    else:
        featname_file = node_file.replace(suffix, '.featnames')

    with open(node_file) as opened_node:
        node_rows = [line.split() for line in opened_node]

    # The column -> feature index mapping is identical for every row, so
    # parse it once instead of once per node.
    with open(featname_file) as opened_featname:
        fea_indices = [int(line.split()[3])
                       for line in opened_featname if line.strip()]

    node_list = []
    for row in node_rows:
        if not row:
            # Blank line (typically at the end of the file).
            continue
        node_name = int(row[0])
        values = row[1:]
        if len(values) != len(fea_indices):
            print('error: features recorded not equal features expected')
        node_feas = -np.ones(num_feas)
        # zip stops at the shorter sequence, so a mismatched row fills what
        # it can instead of raising IndexError.
        for fea_index, value in zip(fea_indices, values):
            node_feas[fea_index] = float(value)
        node_list.append((node_name, node_feas))

    return node_list

def read_ego_nodes(node_file, num_feas):

    '''
    Read the feature row(s) of the ego node stored in one ``.egofeat`` file.

    Unlike ``.feat`` rows, the lines of an ``.egofeat`` file carry no node
    id: every token is a feature value, and the node id is taken from the
    digits in the file's base name (``107.egofeat`` -> 107). The k-th value
    belongs to the feature described by the k-th non-blank line of the
    sibling ``.featnames`` file, whose fourth whitespace-separated token is
    the position of the feature in the output vector.

    Parameters:
        node_file: path of the ``.egofeat`` file.
        num_feas: length of every returned feature vector.

    Returns:
        A list of tuples (node name=int, node_feas=float np array of length
        num_feas), one per non-blank line. Positions that this file has no
        value for stay at -1.

    Raises:
        ValueError: if the file has data but its base name has no digits.
    '''

    # Swap only the trailing suffix so that an '.egofeat' occurring elsewhere
    # in the path is left alone.
    suffix = '.egofeat'
    if node_file.endswith(suffix):
        featname_file = node_file[:-len(suffix)] + '.featnames'
    else:
        featname_file = node_file.replace(suffix, '.featnames')

    with open(node_file) as opened_node:
        # Drop blank lines: they would otherwise become all -1 vectors filed
        # under the ego id.
        value_rows = [row for row in (line.split() for line in opened_node)
                      if row]

    with open(featname_file) as opened_featname:
        fea_indices = [int(line.split()[3])
                       for line in opened_featname if line.strip()]

    if not value_rows:
        return []

    # Only the base name identifies the node; digits in directory names must
    # not leak into the id.
    digits = ''.join(ch for ch in os.path.basename(node_file) if ch.isdigit())
    if not digits:
        raise ValueError('no node id found in file name: ' + node_file)
    node_name = int(digits)

    node_list = []
    for values in value_rows:
        if len(values) != len(fea_indices):
            print('error: features recorded not equal features expected')
        node_feas = -np.ones(num_feas)
        # zip stops at the shorter sequence, so a mismatched row fills what
        # it can instead of raising IndexError.
        for fea_index, value in zip(fea_indices, values):
            node_feas[fea_index] = float(value)
        node_list.append((node_name, node_feas))

    return node_list

def read_edges(edge_file):

    '''
    Parse an edge-list file.

    Each line is split on whitespace; its first two tokens are converted to
    int and returned as one (node1, node2) tuple. Any further tokens on the
    line are ignored.

    Returns a list of edges (node1=int, node2=int) in file order.
    '''
    with open(edge_file) as handle:
        token_rows = [raw.split() for raw in handle.readlines()]

    return [(int(tokens[0]), int(tokens[1])) for tokens in token_rows]

def read_featnames(featname_file):
    '''
    Collect the feature ID of every line in a featnames file.

    The ID is the fourth whitespace-separated token of the line and is kept
    as a string. Returns the IDs as a list, in file order.
    '''
    with open(featname_file) as handle:
        raw_lines = handle.readlines()

    return [raw.split()[3].strip() for raw in raw_lines]

def get_num_feas(featname_dir):
    '''
    Count the distinct feature IDs named by the ``.featnames`` files that sit
    directly inside ``featname_dir``.

    Parameters:
        featname_dir: directory to scan; a trailing path separator is
            optional.

    Returns:
        The number of distinct feature IDs (0 if there are no ``.featnames``
        files).

    NOTE(review): create_data_files uses this count as the feature-vector
    length while read_nodes/read_ego_nodes index the vector by feature ID,
    so this presumably relies on the IDs being exactly 0..N-1 - confirm for
    the data set in use.
    '''
    distinct_ids = set()
    for entry in os.listdir(featname_dir):
        if entry.endswith('.featnames'):
            # os.path.join works with or without a trailing separator on
            # featname_dir, unlike plain string concatenation.
            distinct_ids.update(
                read_featnames(os.path.join(featname_dir, entry)))

    return len(distinct_ids)

def create_data_files(data_dir, features_file='node_features.csv',
                      graph_file='graph.txt'):
    '''
    Build a feature table and an edge list from the raw files in data_dir
    and write both to disk.

    Files directly inside ``data_dir`` are dispatched on their suffix:
    ``.feat`` -> read_nodes, ``.egofeat`` -> read_ego_nodes,
    ``combined.txt`` -> read_edges. A node id that occurs in several files
    is kept once, with the feature vector from the file that sorts last by
    name.

    Parameters:
        data_dir: directory holding the raw files; a trailing path separator
            is optional.
        features_file: output path of the comma-separated feature table.
            Row 0 is a header (0 followed by the column indices
            0..num_feas-1); every other row is a node id followed by its
            feature values, ordered by node id.
        graph_file: output path of the space-separated edge list, ordered by
            the first node of each edge.

    Returns:
        None. The results are the two files written.
    '''

    num_feas = get_num_feas(data_dir)
    nodes_list = []
    edges_list = []

    # Sort the directory listing so that which duplicate wins below does not
    # depend on the order the file system happens to report.
    for entry in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, entry)
        if entry.endswith('.feat'):
            nodes_list += read_nodes(path, num_feas)
        elif entry.endswith('.egofeat'):
            nodes_list += read_ego_nodes(path, num_feas)
        elif entry.endswith('combined.txt'):
            edges_list += read_edges(path)

    # De-duplicate by node id; the last vector seen for an id wins.
    features_by_id = dict(nodes_list)
    node_ids = sorted(features_by_id)
    edges_list = sorted(edges_list, key=itemgetter(0))

    num_samples = len(node_ids)
    feature_array = np.zeros((num_samples + 1, num_feas + 1))
    feature_array[0, 1:] = range(num_feas)
    if num_samples:
        # Ids and vectors are stacked separately: building one array from
        # the (id, vector) tuples is a ragged construction, which recent
        # NumPy versions reject.
        feature_array[1:, 0] = node_ids
        feature_array[1:, 1:] = np.vstack(
            [features_by_id[node_id] for node_id in node_ids])
    np.savetxt(features_file, feature_array, fmt='%i', delimiter=",")
    np.savetxt(graph_file, np.array(edges_list), fmt='%i', delimiter=' ')
    
# Script entry point: runs whenever this file is executed or imported.
# Reads the raw files under ./facebook_data/ and writes node_features.csv
# and graph.txt relative to the current working directory.
create_data_files('facebook_data/')
