# Creazione grafo per Neo4j

Questo notebook richiede in input i seguenti file (generati dal notebook *6_Graph_layer_generation*):

- "oggetti_aggregati.pickle", file contenente le informazioni da associare ad un nodo;
- Matrici K nearest neighbors *I* e *D* (ad es. `normalized_matrix_embedding_21278terms_weigthedTrue.npysimilarity_I_100_k.npy` e `normalized_matrix_embedding_21278terms_weigthedTrue.npysimilarity_D_100_k.npy`);
- `tf.pickle`;
- `sparse_matrix_tfidf.npz`.

Il notebook produce due file CSV, *nodes.csv* ed *edges.csv*, che possono essere caricati direttamente in Neo4j tramite bulk import.
Di seguito i comandi Cypher per il bulk import in Neo4j:

```cypher
USING PERIODIC COMMIT 500
LOAD CSV WITH HEADERS FROM "file:///nodes.csv" AS csvLine
CREATE (p:Node { id: toInteger(csvLine.id_s), 
type_id: toInteger(csvLine.type_id),
node_type: csvLine.node_type,
fiscal_code: csvLine.fiscal_code,
relevant_terms: csvLine.relevant_terms,
region: csvLine.region,
province: csvLine.province,
city: csvLine.city,
name: csvLine.name,
istat_code: csvLine.istat_code,
administrative_code: csvLine.administrative_code,
company_type: csvLine.company_type,
nation: csvLine.nation})

CREATE CONSTRAINT ON (node:Node) ASSERT node.id IS UNIQUE

##creazione edges
USING PERIODIC COMMIT 500
LOAD CSV WITH HEADERS FROM "file:///edges.csv" AS csvLine
MATCH (node1:Node { id: toInteger(csvLine.src)}), (node2:Node { id: toInteger(csvLine.dst)})
CREATE (node1)-[:SEMANTIC { score: csvLine.score }]->(node2)

[INDEXING]
CREATE INDEX ON :Node(fiscal_code);
CREATE INDEX ON :Node(type_id);
```

**Note**

1. Per lanciare un'istanza di Neo4J esegui il docker-compose in **anacAnalysis/web_api/**
2. Per aggiungere un file in un container esegui il comando `docker cp nodes.csv mycontainer:/nodes.csv`

In [3]:
import pickle
from time import time

import numpy as np
import pandas as pd
import scipy
import scipy.sparse

from utils import textual_analysis

In [4]:
# Directory that holds the input artifacts read below and receives the generated CSV files.
base_path = "data/final_data/n6/"

In [5]:
# Aggregated dataset: each row is later turned into one graph node by write_nodes.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df_file_name = base_path + 'oggetti_aggregati.pickle'

# Time the load so that its cost shows up in the cell output.
t = time()
df = pd.read_pickle(df_file_name)
total = time() - t

print("Dataset loaded in %0.3f sec" % total)

# Column names, dtypes and non-null counts of the loaded frame.
df.info()

Dataset loaded in 0.084 sec
<class 'pandas.core.frame.DataFrame'>
Int64Index: 61059 entries, 0 to 61058
Data columns (total 15 columns):
codiceFiscaleStruttura     61059 non-null object
oggetto                    61059 non-null object
tipoStruttura              61059 non-null object
#Partita_Iva               61059 non-null object
Ragione_Sociale            61059 non-null object
Forma_Societaria           61053 non-null object
Nazione_Sede_legale        61059 non-null object
Regione_Sede_legale        61052 non-null object
Provincia_Sede_legale      61052 non-null object
Comune_Sede_legale         61052 non-null object
Indirizzo_Sede_legale      61057 non-null object
Numero_Aggiudicazioni      61059 non-null int64
Numero_Abilitazioni        61059 non-null int64
Numero_Transazioni         61059 non-null int64
Numero_Contratti_attivi    61059 non-null int64
dtypes: int64(4), object(11)
memory usage: 7.5+ MB


In [17]:
# Short column names expected by create_struttura, keyed by the names found in the source dataset.
columns_to_rename = {
    "Comune_Sede_legale": "comune",
    "Provincia_Sede_legale": "provincia",
    "Regione_Sede_legale": "regione",
    "Indirizzo_Sede_legale": "indirizzo",
    "Ragione_Sociale": "ragione_sociale",
    "Forma_Societaria": "forma_societaria",
    "Nazione_Sede_legale": "nazione",
}
# Rebind the name instead of mutating the frame in place: the cell stays safe to
# re-run (labels that are already renamed are simply ignored by rename).
df = df.rename(columns=columns_to_rename)

In [6]:
# tf exposes get_feature_names() (used when the nodes are written); presumably a fitted
# text vectorizer -- TODO confirm. m is the sparse matrix stored in sparse_matrix_tfidf.npz.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle, which the bare open() call never did.
with open(base_path + "tf.pickle", "rb") as tf_file:
    tf = pickle.load(tf_file)
m = scipy.sparse.load_npz(base_path + 'sparse_matrix_tfidf.npz')

In [7]:
# Nearest-neighbour results: write_edges reads I[i][o] as the destination node of the
# o-th neighbour of row i and D[i][o] as the score stored on that edge.
# NOTE(review): the notebook header lists the "weigthedTrue" files, while the
# "weigthedFalse" ones are loaded here -- confirm which variant is intended.
I = np.load(base_path + "normalized_matrix_embedding_21278terms_weigthedFalse.npysimilarity_I_100_k.npy")
D = np.load(base_path + "normalized_matrix_embedding_21278terms_weigthedFalse.npysimilarity_D_100_k.npy")

In [8]:
def get_min_or_default(array, default_value, threshold = 0.7):
    """
    Return the position of the first element of ``array`` that is below ``threshold``.

    Parameters
    ----------
    array : 1-D array-like
        Values to scan.
    default_value : object
        Value returned when no element is strictly below ``threshold``
        (this includes the case of an empty ``array``).
    threshold : float, optional
        Exclusive upper bound an element must stay under to be selected (default 0.7).

    Returns
    -------
    int or type of ``default_value``
        The smallest index ``k`` such that ``array[k] < threshold``, otherwise ``default_value``.
    """
    below = np.flatnonzero(np.asarray(array) < threshold)
    if below.size == 0:
        # Honour the caller-supplied fallback instead of reading a module-level matrix,
        # so the function also works on its own.
        return default_value
    return int(below[0])


def get_top_scores(Xtr, features, row_id, top_n=25):
    """Return the highest positive values of one row of ``Xtr`` as formatted strings.

    The row ``row_id`` of the sparse matrix ``Xtr`` is densified, its entries are
    ranked from largest to smallest and at most ``top_n`` of them are considered;
    entries that are not strictly positive are skipped. Each kept value is rendered
    with two decimals (e.g. ``['0.34', '0.14', '0.06']``). ``features`` is accepted
    for signature symmetry with ``get_top_terms`` and is not used.
    """
    dense_row = np.squeeze(Xtr[row_id].toarray())
    ranked_ids = np.argsort(dense_row)[::-1][:top_n]
    formatted = []
    for idx in ranked_ids:
        weight = dense_row[idx]
        if weight > 0:
            formatted.append("{0:.2f}".format(weight))
    return formatted

def get_top_terms(Xtr, features, row_id, top_n=25):
    """Return the entries of ``features`` matching the highest positive values of one row.

    The row ``row_id`` of the sparse matrix ``Xtr`` is densified, its columns are
    ranked from largest to smallest value and at most ``top_n`` of them are
    considered; columns whose value is not strictly positive are skipped. The result
    lists ``features[column]`` for each kept column, best first
    (e.g. ``['stand', 'catering', 'fornitura']``).
    """
    dense_row = np.squeeze(Xtr[row_id].toarray())
    ranked_ids = np.argsort(dense_row)[::-1][:top_n]
    selected = []
    for idx in ranked_ids:
        if dense_row[idx] > 0:
            selected.append(features[idx])
    return selected

In [9]:
class Node:
    """Plain object holding the attributes of one graph node.

    NOTE: a ``namedtuple`` that is also called ``Node`` is defined further down in the
    same cell and rebinds this name, so the CSV export works with that record type.
    """

    def __init__(self, type_id, node_type, id_s, fiscal_code, relevant_terms, region, province, city, address, name, istat_code, administrative_code, company_type, nation):
        """Store the node attributes, removing commas from the descriptive fields.

        ``type_id``, ``node_type``, ``id_s`` and ``fiscal_code`` are stored unchanged;
        every other field goes through ``_strip_commas``.
        """
        self.type_id = type_id
        self.node_type = node_type
        self.id_s = id_s
        self.fiscal_code = fiscal_code
        self.relevant_terms = self._strip_commas(relevant_terms)
        self.region = self._strip_commas(region)
        self.province = self._strip_commas(province)
        self.city = self._strip_commas(city)
        self.address = self._strip_commas(address)
        self.name = self._strip_commas(name)
        self.istat_code = self._strip_commas(istat_code)
        self.administrative_code = self._strip_commas(administrative_code)
        self.company_type = self._strip_commas(company_type)
        self.nation = self._strip_commas(nation)

    @staticmethod
    def _strip_commas(value):
        """Return ``value`` without commas; non-string values are returned unchanged."""
        # Presumably commas are removed to keep them out of the CSV output -- TODO confirm.
        # Missing data may arrive as a non-string (e.g. NaN), which has no .replace().
        return value.replace(",", "") if isinstance(value, str) else value
class Edge:
    """Plain object holding a scored link from node ``src`` to node ``dst``.

    NOTE: a ``namedtuple`` that is also called ``Edge`` is defined further down in the
    same cell and rebinds this name.
    """
    def __init__(self, src, dst, score):
        # Keep the two endpoints and the score exactly as received.
        self.src, self.dst, self.score = src, dst, score
        
from collections import namedtuple

# Record types used for the CSV export. The order of the fields is the order of the
# CSV columns, since the header row is written from ``_fields`` and each record is
# written as a row. These assignments rebind the names ``Node`` and ``Edge``.
cols_node = [
    'type_id', 'node_type', 'id_s', 'fiscal_code', 'relevant_terms',
    'region', 'province', 'city', 'address', 'name',
    'istat_code', 'administrative_code', 'company_type', 'nation',
]
Node = namedtuple('Node', cols_node)
Edge = namedtuple('Edge', ['src', 'dst', 'score'])
        

In [24]:
def _prop_or_default(props, key, default):
    """Return ``props[key]``, or ``default`` when the key is not available."""
    try:
        return props[key]
    except KeyError:
        return default


def create_struttura(id_s, best_terms, props):
    """Build the ``Node`` record describing one structure.

    Parameters
    ----------
    id_s : object
        Identifier stored in the ``id_s`` field of the node.
    best_terms : object
        Value stored in the ``relevant_terms`` field of the node.
    props : mapping-like (e.g. one row of the dataset)
        Must provide 'tipoStruttura', 'codiceFiscaleStruttura', 'regione',
        'provincia', 'comune', 'indirizzo', 'forma_societaria' and 'nazione';
        'ragione_sociale' is required when 'tipoStruttura' is 'AGG'.

    Returns
    -------
    Node or None
        A node with ``type_id`` 1 / ``node_type`` 'PA' when 'tipoStruttura' is 'PA',
        ``type_id`` 2 / ``node_type`` 'COMPANY' when it is 'AGG'. Any other value
        prints a diagnostic and yields ``None``.
    """
    structure_type = props['tipoStruttura']

    if structure_type == 'PA':
        type_id, node_type = 1, 'PA'
        # The PA-only columns are optional: when one is missing, use the same 'Nan'
        # placeholder as the company branch instead of aborting with a KeyError.
        # NOTE(review): falling back to 'ragione_sociale' for the name is an assumption -- confirm.
        name = _prop_or_default(props, 'name', _prop_or_default(props, 'ragione_sociale', 'Nan'))
        istat_code = _prop_or_default(props, 'tipologia_istat', 'Nan')
        administrative_code = _prop_or_default(props, 'tipologia_amm', 'Nan')
    elif structure_type == 'AGG':
        type_id, node_type = 2, 'COMPANY'
        name = props['ragione_sociale']
        # Companies carry no ISTAT / administrative classification: placeholder values.
        istat_code = 'Nan'
        administrative_code = 'Nan'
    else:
        print('error structure not recognized {}'.format(props['tipoStruttura']))
        print(id_s, best_terms, props)
        return None

    # Fields shared by both kinds of structure are filled in a single place.
    return Node(
        type_id = type_id,
        node_type = node_type,
        id_s = id_s,
        fiscal_code = props['codiceFiscaleStruttura'].strip(),
        relevant_terms = best_terms,
        region = props['regione'],
        province = props['provincia'],
        city = props['comune'],
        address = props['indirizzo'],
        name = name,
        istat_code = istat_code,
        administrative_code = administrative_code,
        company_type = props['forma_societaria'],
        nation = props['nazione'],
    )
    


In [25]:
#creazione nodi
import csv
def write_nodes(file, n = None):
    """Write the node CSV: a header row followed by one row per structure of ``df``.

    Parameters
    ----------
    file : str
        Path of the CSV file to create (an existing file is overwritten).
    n : int, optional
        Number of leading rows of ``df`` to export; ``None`` exports every row.

    Notes
    -----
    Reads the module-level ``df``, ``m`` and ``tf``. Rows for which
    ``create_struttura`` returns ``None`` are skipped. Any exception is caught and
    printed rather than raised, so after a failure the file may be incomplete.
    """
    if n is None:
        n = len(df)
    try:
        # Fetched once instead of once per row.
        # Assumes top_feats_in_doc does not modify the list it receives -- TODO confirm.
        feature_names = tf.get_feature_names()

        # newline='' is what the csv module requires to avoid spurious blank lines;
        # UTF-8 keeps the output independent of the platform default encoding.
        with open(file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(Node._fields)  # _fields is part of namedtuple's documented API

            for i in df.index[:n]:
                # The index label i is also used as the row position inside m.
                tfidf_text = ' '.join(textual_analysis.top_feats_in_doc(m, feature_names, i, 50))
                node = create_struttura(i, tfidf_text, df.loc[i])
                if node is not None:
                    writer.writerow(node)

    except Exception as e:
        print('got exception ', e)

In [26]:
#creazione edges
def write_edges(file, n = None):
    '''
    Write the edge CSV: a header row followed by one row per stored relationship.

    In Neo4j, ALL relationships are directed. However, you can have the notion of undirected edges at query time.
    For this reason, only relationships <a,b> where a.id < b.id are stored (self links are dropped as well).

    Parameters
    ----------
    file : str
        Path of the CSV file to create (an existing file is overwritten).
    n : int, optional
        Number of leading rows of ``df`` to export; ``None`` exports every row.

    Notes
    -----
    Reads the module-level ``df``, ``I`` and ``D``. For each row only the first
    ``get_min_or_default(D[i], D.shape[1])`` neighbours are considered. Any exception
    raised while writing is caught and printed rather than raised, so after a failure
    the file may be incomplete.

    NOTE(review): a pair whose source id is the larger one is dropped on the
    assumption that the reverse pair is written when the other node is processed;
    that only holds if the neighbour lists are symmetric -- confirm.
    '''
    min_knn_indices = [get_min_or_default(row, D.shape[1]) for row in D]
    if n is None:
        n = len(df)
    try:
        # newline='' is what the csv module requires to avoid spurious blank lines;
        # UTF-8 keeps the output independent of the platform default encoding.
        with open(file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(Edge._fields)

            for i in df.index[:n]:
                for o in range(min_knn_indices[i]):
                    dst = I[i][o]
                    if i < dst:
                        writer.writerow(Edge(src = i, dst = dst, score = round(D[i][o], 2)))

    except Exception as ex:
        print('got exception ', ex)


In [27]:
# Export the nodes and time the run. The file is named "nodes.csv" because that is the
# name used by the LOAD CSV command and by the docker cp note in the notebook header.
t = time()
write_nodes(base_path + "nodes.csv")
total = time() - t
print("Dataset loaded in %0.3f sec" % total)

Dataset loaded in 748.991 sec


In [28]:
# Export the edges and time the run (the printed label says "loaded", although this
# step writes the CSV file).
t = time()
write_edges(base_path + "edges.csv")
total = time() - t
print("Dataset loaded in %0.3f sec" % total)

Dataset loaded in 34.807 sec
