In [1]:
import networkx as nx
import pandas as pd
import numpy as np
from dtaidistance.dtw import distance_matrix, distance
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw
import picos as pic
import itertools as it
import copy
import json

In [2]:
#N = number of neighbors - 1 up to 20
#feature = path of feature file
#N=5
#feature = pd.read_csv("data/IBGE/census_2010/RS/aggregated_by_weighting_area/joined/Renda.csv",index_col=0)
def graph_feat(N,feature,feature_col,file_matrix,file_co2):
    # constrói um grafo knn 

    adj = pd.read_csv(file_matrix,index_col=0)
    adj.index = adj.index.map(str)
    
    def build_graph(adj,nn_radi):
        names = adj.index
        G = nx.Graph()
        tmp = np.array(adj)
        tmp = np.argsort(tmp)
        tmp = np.argwhere(tmp < nn_radi)
        for e in tmp:
            G.add_edge(names[e[0]],names[e[1]])
        return(G)

    # grafos de 1 até N vizinhos

    G = [build_graph(adj,i) for i in range(1,N)]
    
    feature.index = feature.index.map(str)

    for gg in G:
        attrs={}

        for edge in gg.edges():
            attrs[(edge[0],edge[1])] = {}

            for v in feature.columns[feature_col:]:

                #se quiser trocar np.abs para calcular a diferença
                #w = np.abs(feature[v][edge[0]] - feature[v][edge[1]]) 
                w = np.mean([feature[v][edge[0]] , feature[v][edge[1]]])       

                attrs[(edge[0],edge[1])][v] = w
        
        nx.set_edge_attributes(gg, attrs)
    
    co2 = pd.read_csv(file_co2,index_col=0)
    co2.index = co2.index.map(str)
    
    for gg in G:
        attrs={}

        for edge in gg.edges():
            try:
                b0 = co2['2010'][edge[0]]
                b1 = co2['2010'][edge[1]]

                attrs[(edge[0],edge[1])] = {'co2' : np.mean([b0,b1])}
                                           
            except:
                pass

        nx.set_edge_attributes(gg, attrs)
    
    ### I.Preprocessing Phase: Computations over the Domain

    K = len(G)

    #Amount of features
    F= len(feature.columns[feature_col:])

    def feat_edge(D,x):
        data_2 = [v for _,_,v in D.edges.data(x)]
        data_2_at = [v for v in data_2 if v is not None]
        d2 = np.histogram(data_2_at,bins=5)[0]
        #fixed the number of bins, not using scott anymore
        #d2 = np.histogram(data_2_at,bins=5)[0]
        return(d2/float(sum(d2)))

    ### Step 1: Feature Representation Model


    #for i in range(K):
        #PDF computed for edges
        #PDF_e[i] = feat_edge((D[i]),'num_critic_for_reviews'),feat_edge((D[i]),'duration'),feat_edge((D[i]),'director_facebook_likes'), feat_edge((D[i]),'actor_3_facebook_likes'), feat_edge((D[i]),'actor_1_facebook_likes'),feat_edge((D[i]),'gross'),feat_edge((D[i]),'num_voted_users'),feat_edge((D[i]),'cast_total_facebook_likes'),feat_edge((D[i]),'facenumber_in_poster'), feat_edge((D[i]),'num_user_for_reviews'), feat_edge((D[i]),'budget'), feat_edge((D[i]),'title_year'),feat_edge((D[i]),'actor_2_facebook_likes'),feat_edge((D[i]),'movie_facebook_likes')

    PDF_e = [[feat_edge(gg,v) for v in feature.columns[feature_col:]] for gg in G]

    #for i in range(K): 
    #    PDF_e.append([])
    #    for j,v in enumerate(alfabetizacao.columns[2:]):
    #        PDF_e[i].append(feat_edge(G[i],v)) 
    
    ### Step 2: Feature Diversity Model

    dists_e = [distance_matrix(PDF_e[i]) for i in range(K)]
    
    #edge
    SF_e = np.nan_to_num(np.mean(dists_e,axis=0),posinf=0)
    
    
    ### II. Query Phase: Summary Creation

    ### Step 1: Domain-specificity Model
    PDF_ge = [feat_edge(gg,'co2') for gg in G]
    
    
    diff_e = [[distance(PDF_ge[i],PDF_e[i][l]) for l in range(F)] for i in range(K)]

    h_e= np.mean(diff_e,axis=0)
                                                                                 
    ### Step 2: Feature Selection

    #Regularization Parameters l1,l2, l3

    l1 = 1/F
    l2 = 1
    l3 = 1

    #Matriz identidade: np.identity(F)

    Q_e = np.multiply(SF_e,l1) + np.multiply(np.identity(F),l2)
    r_e = np.multiply(h_e,l3)

    sel_feat = 3

    #Solver Edge
    prob_e = pic.Problem()
    K=F
    #f = pic.IntegerVariable('f',K)

    f = pic.RealVariable('f',K)
    prob_e.add_constraint(pic.sum(f.T)<=F)
    #prob_e.add_constraint(pic.sum(f.T)<F)
    prob_e.add_constraint(pic.sum(f.T)>=sel_feat)
    #Q_e = pic.new_param('Q',Q_e)
    #r_e = pic.new_param('r',r_e)
    prob_e.set_objective('min',f.T*Q_e*f+f.T*r_e)
    sol_e = prob_e.solve(solver="ecos",primals=None,duals=None,max_footprints=100,verbosity=0,rel_prim_fsb_tol=1e-3,rel_dual_fsb_tol=1e-3,abs_prim_fsb_tol=1e-3,abs_dual_fsb_tol=1e-3)

    values = np.array(list(sol_e.primals.values())[0])
    
    #order = np.argsort(-v)
    order = np.argsort(-values)

    result_order = list(zip(feature.columns[feature_col:][order],values[order]))
    
    features_order = list(feature.columns[feature_col:][order])
    
    y = np.round(list(sol_e.primals.values()))
    result = np.where(y == 1)
    
    
    return(features_order,result_order, result)



In [3]:
#Numero de vizinhos
N=5
#Coluna com o Codigo do municipio (as features são consideradas a partir desta coluna)
feature_col = 20
feature = pd.read_csv("data/Brasil_new/data_noelections.csv", index_col=feature_col)
features_order,result_order, result = graph_feat(N,feature,feature_col,"data/Brasil_new/filtered_adjacency_matrix_inv.csv","data/co2/co2_log.csv")


  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df = pd.DataFrame.from_dict({ 'Ordem' : result_order
                  }, orient='index').transpose()

In [5]:
df.head(10)

Unnamed: 0,Ordem
0,"(CENSUS_Responsavel02_V034, 0.3020370510890764)"
1,"(CENSUS_Pessoa02_V176, 0.2938343042910904)"
2,"(CENSUS_Pessoa12_V063, 0.2937783713427491)"
3,"(CENSUS_Domicilio01_V077, 0.2937445520686798)"
4,"(CENSUS_Pessoa02_V091, 0.29294153101770387)"
5,"(CENSUS_Responsavel02_V035, 0.29249060166421215)"
6,"(CENSUS_Pessoa02_V177, 0.2907429919099027)"
7,"(CENSUS_Responsavel02_V023, 0.2907000602037412)"
8,"(CENSUS_Pessoa01_V015, 0.29012059373236626)"
9,"(CENSUS_Pessoa02_V196, 0.2883850961166905)"


In [6]:
#features_order[1:157]

#data_json = {'selected_features':features_order[1:11]}

data_json = {'selected_features':features_order[0:265]}

data_json

{'selected_features': ['CENSUS_Responsavel02_V034',
  'CENSUS_Pessoa02_V176',
  'CENSUS_Pessoa12_V063',
  'CENSUS_Domicilio01_V077',
  'CENSUS_Pessoa02_V091',
  'CENSUS_Responsavel02_V035',
  'CENSUS_Pessoa02_V177',
  'CENSUS_Responsavel02_V023',
  'CENSUS_Pessoa01_V015',
  'CENSUS_Pessoa02_V196',
  'CENSUS_Pessoa12_V057',
  'CENSUS_Domicilio01_V064',
  'CENSUS_Pessoa08_V112',
  'CENSUS_Pessoa13_V058',
  'CENSUS_Pessoa11_V048',
  'CENSUS_Pessoa12_V065',
  'CENSUS_Pessoa12_V054',
  'CENSUS_PessoaRenda_V065',
  'CENSUS_Responsavel02_V026',
  'CENSUS_PessoaRenda_V131',
  'CENSUS_Pessoa12_V072',
  'CENSUS_Pessoa08_V116',
  'CENSUS_Pessoa13_V055',
  'CENSUS_Pessoa01_V009',
  'CENSUS_Pessoa08_V118',
  'CENSUS_Pessoa11_V047',
  'CENSUS_Pessoa08_V120',
  'CENSUS_Pessoa02_V185',
  'CENSUS_Pessoa07_V071',
  'CENSUS_Pessoa12_V046',
  'CENSUS_Pessoa02_V190',
  'CENSUS_Pessoa02_V096',
  'CENSUS_Pessoa01_V016',
  'CENSUS_Pessoa13_V057',
  'CENSUS_Pessoa02_V179',
  'CENSUS_Pessoa06_V019',
  'CENSUS_P

In [8]:
def escrever_json(lista):
    with open('resultados/attributed_eagle_log_co2_n5.json', 'w') as f:
        json.dump(lista, f, indent=4)
    
escrever_json(data_json)

In [None]:
data_json_full = {'selected_features':features_order[0:]}

def escrever_json_full(lista):
    with open('resultados_full/GRAPH_ELECTIONS_CENSUS_ALL1.json', 'w') as f:
        json.dump(lista, f, indent=4)
    
escrever_json_full(data_json_full)

In [None]:
data_json_full