In [10]:
import networkx as nx
import pandas as pd
import numpy as np
from dtaidistance.dtw import distance_matrix, distance
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw
import picos as pic
import itertools as it
import copy
import json

In [11]:
def _build_graph(adj, nn_radi):
    """Build one undirected graph from the square matrix ``adj`` for ``nn_radi``.

    Node names are taken from ``adj.index`` for both rows and columns.

    NOTE(review): ``np.argsort`` returns, per row, the column indices in
    ascending order of value. ``order < nn_radi`` therefore marks
    (row, rank-position) pairs whose *stored column index* is below
    ``nn_radi``; it does not mark the ``nn_radi`` nearest columns of each row.
    If a k-nearest-neighbour graph is intended, a rank matrix (argsort of the
    argsort) is probably what is wanted. Left unchanged so that previously
    produced results stay reproducible -- confirm with the authors.
    """
    names = adj.index
    graph = nx.Graph()
    order = np.argsort(np.array(adj))
    for row, col in np.argwhere(order < nn_radi):
        graph.add_edge(names[row], names[col])
    return graph


def _attach_feature_means(graph, columns):
    """Store on every edge, per feature, the mean of its two endpoint values."""
    attrs = {}
    for u, v in graph.edges():
        attrs[(u, v)] = {
            name: np.mean([series[u], series[v]])
            for name, series in columns.items()
        }
    nx.set_edge_attributes(graph, attrs)


def _attach_co2_means(graph, co2_values):
    """Store on every edge the mean CO2 value of its endpoints as ``'co2'``."""
    attrs = {}
    for u, v in graph.edges():
        try:
            attrs[(u, v)] = {'co2': np.mean([co2_values[u], co2_values[v]])}
        except KeyError:
            # One endpoint has no CO2 record: the edge simply gets no 'co2'
            # attribute and is ignored later by _edge_attr_pdf.
            continue
    nx.set_edge_attributes(graph, attrs)


def _edge_attr_pdf(graph, attr, n_bins=5):
    """Return the normalised ``n_bins``-bin histogram of edge attribute ``attr``."""
    observed = [val for _, _, val in graph.edges.data(attr) if val is not None]
    counts = np.histogram(observed, bins=n_bins)[0]
    return counts / float(counts.sum())


def graph_feat(N, feature, feature_col, file_matrix, file_co2,
               sel_feat=3, co2_col='2010', n_bins=5):
    """Rank feature columns with a graph-based quadratic program.

    Parameters
    ----------
    N : int
        One graph is built for every radius in ``range(1, N)``, so ``N`` must
        be at least 2.
    feature : pandas.DataFrame
        Feature table. Its index is matched (as strings) against the node
        names of the graphs. The caller's frame is not modified.
    feature_col : int
        Columns ``feature.columns[feature_col:]`` are the candidate features.
    file_matrix : str
        CSV read with ``index_col=0``; its index supplies the node names.
    file_co2 : str
        CSV read with ``index_col=0``; column ``co2_col`` supplies the target.
    sel_feat : int, optional
        Lower bound on the sum of the solution vector (default 3).
    co2_col : str, optional
        Column of ``file_co2`` used as the target (default ``'2010'``).
    n_bins : int, optional
        Number of histogram bins for every edge-attribute distribution
        (default 5).

    Returns
    -------
    features_order : list
        Feature names sorted by decreasing solver value.
    result_order : list of tuple
        ``(feature name, solver value)`` pairs in the same order.
    result : tuple of numpy.ndarray
        Output of ``np.where`` on the rounded primal values equal to 1.

    Raises
    ------
    ValueError
        If ``N < 2`` (no graph would be built) or no feature column is left.
    KeyError
        If ``co2_col`` is missing from ``file_co2`` or a graph node is missing
        from ``feature``.
    """
    if N < 2:
        raise ValueError("N must be >= 2: graphs are built for range(1, N)")

    adj = pd.read_csv(file_matrix, index_col=0)
    adj.index = adj.index.map(str)

    graphs = [_build_graph(adj, radius) for radius in range(1, N)]

    # Shallow copy: re-labelling the index must not leak into the caller's frame.
    feature = feature.copy(deep=False)
    feature.index = feature.index.map(str)

    feat_cols = feature.columns[feature_col:]
    n_feat = len(feat_cols)
    if n_feat == 0:
        raise ValueError("no feature columns at or after position feature_col")

    # Column look-ups are hoisted out of the per-edge loop.
    columns = {name: feature[name] for name in feat_cols}
    for graph in graphs:
        _attach_feature_means(graph, columns)

    co2 = pd.read_csv(file_co2, index_col=0)
    co2.index = co2.index.map(str)
    # Looked up once, outside any try block, so a wrong column name fails loudly.
    co2_values = co2[co2_col]
    for graph in graphs:
        _attach_co2_means(graph, co2_values)

    ### I. Preprocessing Phase: Computations over the Domain

    ### Step 1: Feature Representation Model
    PDF_e = [[_edge_attr_pdf(graph, name, n_bins) for name in feat_cols]
             for graph in graphs]

    ### Step 2: Feature Diversity Model
    dists_e = [distance_matrix(pdfs) for pdfs in PDF_e]
    # Average over graphs; +inf entries produced by distance_matrix become 0.
    SF_e = np.nan_to_num(np.mean(dists_e, axis=0), posinf=0)

    ### II. Query Phase: Summary Creation

    ### Step 1: Domain-specificity Model
    PDF_ge = [_edge_attr_pdf(graph, 'co2', n_bins) for graph in graphs]
    diff_e = [[distance(pdf_target, pdf_feat) for pdf_feat in pdfs]
              for pdf_target, pdfs in zip(PDF_ge, PDF_e)]
    h_e = np.mean(diff_e, axis=0)

    ### Step 2: Feature Selection

    # Regularization parameters
    l1 = 1 / n_feat
    l2 = 1
    l3 = 1

    Q_e = np.multiply(SF_e, l1) + np.multiply(np.identity(n_feat), l2)
    r_e = np.multiply(h_e, l3)

    # Solver: minimise f'Qf + f'r subject to sel_feat <= sum(f) <= n_feat
    prob_e = pic.Problem()
    f = pic.RealVariable('f', n_feat)
    prob_e.add_constraint(pic.sum(f.T) <= n_feat)
    prob_e.add_constraint(pic.sum(f.T) >= sel_feat)
    prob_e.set_objective('min', f.T*Q_e*f + f.T*r_e)
    sol_e = prob_e.solve(solver="ecos", primals=None, duals=None,
                         max_footprints=100, verbosity=0,
                         rel_prim_fsb_tol=1e-3, rel_dual_fsb_tol=1e-3,
                         abs_prim_fsb_tol=1e-3, abs_dual_fsb_tol=1e-3)

    values = np.array(list(sol_e.primals.values())[0])
    order = np.argsort(-values)

    result_order = list(zip(feat_cols[order], values[order]))
    features_order = list(feat_cols[order])

    y = np.round(list(sol_e.primals.values()))
    result = np.where(y == 1)

    return (features_order, result_order, result)

In [None]:
# graph_feat builds one graph per radius in range(1, N); the original note
# states that values up to 20 are intended.
N = 2
# Position of the municipality-code column: it is used both as the CSV index
# column and as the offset from which feature columns are taken.
feature_col = 0
feature = pd.read_csv(
    "../datasets/br_data_census_2010.csv",
    index_col=feature_col,
)
features_order, result_order, result = graph_feat(
    N,
    feature,
    feature_col,
    "../datasets/filtered_adjacency_matrix_inv.csv",
    "../datasets/br_co2_log_2010.csv",
)


In [None]:
# One 'Order' column whose cells are the (feature, solver value) pairs,
# best-ranked first.
order_by_label = {'Order': result_order}
df = pd.DataFrame.from_dict(order_by_label, orient='index').T

In [None]:
# Show the ten best-ranked entries.
df.head(n=10)

In [None]:
#Number of features determined by the CFS algorithm in https://github.com/tpinhoda/EF-FS_Pipeline
data_json = {'selected_features':features_order[0:265]}

data_json

In [None]:
def escrever_json(lista, path='features_selected_homicides/attributed_eagle_log_co2_n2.json'):
    """Write ``lista`` to ``path`` as JSON indented by four spaces.

    Parameters
    ----------
    lista : object
        Any JSON-serialisable value.
    path : str, optional
        Destination file. Defaults to the location this notebook has always
        written to, so existing one-argument calls behave as before.

    The parent directory is created when it does not exist; previously the
    write failed with ``FileNotFoundError`` in that case.
    """
    import os  # local import: the notebook's import cell does not provide os

    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)
    # json.dump escapes non-ASCII by default, so the explicit encoding only
    # makes the write independent of the platform's locale.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(lista, f, indent=4)
    
# Persist the selected feature names to disk.
escrever_json(lista=data_json)