In [1]:
import pandas as pd
import numpy as np

from time import time
from utils import textual_analysis
import pickle
import scipy

In [2]:
# Root folder (relative to the notebook) that every input artefact below is read from.
base_path = "data/sample_dataset/"

In [3]:
# Full path of the pickled DataFrame, built from base_path.
df_file_name = base_path + 'oggetti_aggregati_arricchiti.pickle'

# Take a wall-clock timestamp before and after the read so the load time can be reported.
# NOTE(review): read_pickle unpickles the file, so it should only be pointed at trusted data.
t = time()
df = pd.read_pickle(df_file_name)
total = time() - t

print("Dataset loaded in %0.3f sec" % total)

# Print the column / dtype / non-null summary of the loaded frame.
df.info()

Dataset loaded in 0.341 sec
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68642 entries, 0 to 68641
Data columns (total 16 columns):
level_0                   68642 non-null int64
cod_amm                   11141 non-null object
codiceFiscaleStruttura    68642 non-null object
comune                    68637 non-null object
forma_societaria          57492 non-null object
index                     68642 non-null int64
indirizzo                 68640 non-null object
name                      11141 non-null object
nazione                   57501 non-null object
oggetto                   68642 non-null object
provincia                 68307 non-null object
ragione_sociale           57501 non-null object
regione                   68637 non-null object
tipoStruttura             68642 non-null object
tipologia_amm             11141 non-null object
tipologia_istat           11141 non-null object
dtypes: int64(2), object(14)
memory usage: 8.4+ MB


In [4]:
# Display the column labels of the loaded frame (create_struttura reads several of them by name).
df.columns

Index(['level_0', 'cod_amm', 'codiceFiscaleStruttura', 'comune',
       'forma_societaria', 'index', 'indirizzo', 'name', 'nazione', 'oggetto',
       'provincia', 'ragione_sociale', 'regione', 'tipoStruttura',
       'tipologia_amm', 'tipologia_istat'],
      dtype='object')

In [5]:
# Load the pickled object stored in tf.pickle; later cells only call
# tf.get_feature_names() on it (presumably a fitted TF-IDF vectorizer -- confirm).
# NOTE(security): pickle.load can execute arbitrary code embedded in the file,
# so this must only ever be pointed at a trusted artefact.
# The context manager guarantees the file handle is closed after loading.
with open(base_path + "tf.pickle", "rb") as tf_file:
    tf = pickle.load(tf_file)

# Sparse matrix stored next to it; later cells pass it to top_feats_in_doc together
# with a row label.
m = scipy.sparse.load_npz(base_path + 'sparse_matrix_tfidf.npz')

In [6]:
# I and D are two NumPy arrays read from .npy files under base_path.
# Later cells index them in parallel: I[i][o] is used as the id of a neighbour of
# row i and D[i][o] as the score of that pair; D.shape[1] is used as the default
# number of neighbours per row.
# NOTE(review): the file names suggest 100 neighbours per row -- confirm against D.shape.
I = np.load(base_path + "normalized_matrix_embedding_40096terms_weigthedTrue.npysimilarity_I_100_k.npy")
D = np.load(base_path + "normalized_matrix_embedding_40096terms_weigthedTrue.npysimilarity_D_100_k.npy")

In [60]:
import os

from neomodel import config, db
# NOTE(review): wildcard import kept on purpose -- later cells use StructuredNode,
# StructuredRel, the *Property classes and RelationshipTo unqualified.
from neomodel import *

# Configure the pool size BEFORE opening the connection: the value is read when the
# driver is created, so assigning it after set_connection() does not affect the
# connection that was just opened.
config.MAX_POOL_SIZE = 100

# NOTE(security): credentials should not live in the notebook. The bolt URL can be
# supplied through the NEO4J_BOLT_URL environment variable; the original local
# development URL is kept as a fallback so existing setups keep working.
db.set_connection(os.environ.get('NEO4J_BOLT_URL', 'bolt://neo4j:password@localhost:7687'))

In [8]:
def get_min_or_default(array, default_value, threshold = 0.7):
    """
    Return the index of the first element of ``array`` that is strictly below
    ``threshold``, or ``default_value`` when no element is below it.

    Callers in this notebook use the result as the number of leading neighbours
    to keep for a row (``range(result)``).
    NOTE(review): that use only makes sense if each row is sorted by decreasing
    score -- confirm against how D was produced.

    Parameters
    ----------
    array : array-like of numbers, expected to be one-dimensional.
    default_value : value returned when nothing is below the threshold
        (also returned for an empty input).
    threshold : float, comparison bound (strict ``<``).

    Returns
    -------
    int index of the first element below ``threshold``, or ``default_value``.
    """
    # flatnonzero yields the matching positions in ascending order, so the first
    # one is the smallest index; an empty result means "nothing below threshold".
    below = np.flatnonzero(np.asarray(array) < threshold)
    if below.size == 0:
        return default_value
    return int(below[0])


def get_top_scores(Xtr, features, row_id, top_n=25):
    """Return the highest positive values of row ``row_id`` of ``Xtr`` as strings.

    The row is densified, its columns are ranked by decreasing value, and at most
    ``top_n`` of them are kept; zero or negative entries are dropped. Each kept
    value is formatted with two decimals (e.g. ['0.34', '0.14', '0.06']).
    ``features`` is accepted for signature parity with get_top_terms but not used.
    """
    dense_row = np.squeeze(Xtr[row_id].toarray())
    ranked_columns = np.argsort(dense_row)[::-1][:top_n]

    formatted = []
    for column in ranked_columns:
        value = dense_row[column]
        if value > 0:
            formatted.append("{0:.2f}".format(value))
    return formatted

def get_top_terms(Xtr, features, row_id, top_n=25):
    """Return the feature names of the highest positive values of row ``row_id``.

    The row of ``Xtr`` is densified, its columns are ranked by decreasing value,
    and at most ``top_n`` of them are kept; columns whose value is not positive
    are dropped. ``features[column]`` is returned for each kept column
    (e.g. ['stand', 'catering', 'fornitura']).
    """
    dense_row = np.squeeze(Xtr[row_id].toarray())
    ranked_columns = np.argsort(dense_row)[::-1][:top_n]

    terms = []
    for column in ranked_columns:
        if dense_row[column] > 0:
            terms.append(features[column])
    return terms

In [9]:
class SemanticRel(StructuredRel):
    """Relationship model carrying a float ``score``; used as the model of ``Node.semantic_connection``."""
    # Score attached to the relationship; 0 when none is supplied.
    score = FloatProperty(default=0)
    
class TfIDfRel(StructuredRel):
    """Relationship model carrying a float ``score``.

    NOTE(review): not referenced by any other code visible in this notebook.
    """
    # Score attached to the relationship; 0 when none is supplied.
    score = FloatProperty(default=0)

class Node(StructuredNode):
    """Graph node built by ``create_struttura`` for one DataFrame row.

    A single model is used for both kinds of structure; ``type_id`` /
    ``node_type`` tell them apart, and the kind-specific string properties
    default to '' when not supplied.
    """
    # create_struttura sets 1 / 'PA' or 2 / 'COMPANY'.
    type_id = IntegerProperty()
    node_type = StringProperty(index=True)
    
    # Row label of the source DataFrame row; write_edges looks nodes up by this value.
    id_s = IntegerProperty(index=True)
    fiscal_code = StringProperty(index=True)
    # Space-joined terms produced in write_nodes.
    relevant_terms = StringProperty(index=True)
    region = StringProperty(index=True)
    province = StringProperty(index=True)
    city = StringProperty(index=True) 
    address = StringProperty(index=True)
    name = StringProperty(index=True, default='')
    
    # Only supplied by create_struttura for 'PA' nodes.
    istat_code = StringProperty(index=True, default='')
    administrative_code = StringProperty(index=True, default='')
    
    # company_type is only supplied for 'COMPANY' nodes.
    company_type = StringProperty(index=True, default='')
    # NOTE(review): nation is never set by create_struttura, so it keeps its '' default.
    nation = StringProperty(index=True, default='')
    
    # Outgoing 'semantic_connected' relationship to another Node, carrying a SemanticRel score.
    semantic_connection = RelationshipTo('Node','semantic_connected', model=SemanticRel)

In [10]:
def create_struttura(id_s, best_terms, props):
    """Build (without saving) the Node describing one structure.

    ``props`` is a mapping read by column name. ``props['tipoStruttura']``
    selects the kind of node:

    * 'PA'  -> type_id 1, node_type 'PA', with istat/administrative codes and
      the name taken from ``props['name']``;
    * 'AGG' -> type_id 2, node_type 'COMPANY', with the name taken from
      ``props['ragione_sociale']`` and the company type from
      ``props['forma_societaria']``.

    Any other value prints an error message and returns None.
    ``id_s`` and ``best_terms`` are stored as ``id_s`` and ``relevant_terms``.
    """
    kind = props['tipoStruttura']
    if kind != 'PA' and kind != 'AGG':
        print('error structure not recognized {}'.format(kind))
        return None

    # Properties filled the same way for both kinds of node.
    shared = dict(
        id_s=id_s,
        fiscal_code=props['codiceFiscaleStruttura'].strip(),
        relevant_terms=best_terms,
        region=props['regione'],
        province=props['provincia'],
        city=props['comune'],
        address=props['indirizzo'],
    )

    # Properties that depend on the kind of structure.
    if kind == 'PA':
        specific = dict(
            type_id=1,
            node_type='PA',
            istat_code=props['tipologia_istat'],
            administrative_code=props['tipologia_amm'],
            name=props['name'],
        )
    else:
        specific = dict(
            type_id=2,
            node_type='COMPANY',
            name=props['ragione_sociale'],
            company_type=props['forma_societaria'],
        )

    return Node(**shared, **specific)

In [11]:
# Node creation
def write_nodes(n = None, batch_size = 1000):
    """Create and save one graph node per DataFrame row.

    For each of the first ``n`` row labels of ``df`` the top 50 terms returned by
    ``textual_analysis.top_feats_in_doc`` are joined with spaces, a node is built
    with ``create_struttura`` and, when one is returned, saved.

    Parameters
    ----------
    n : int or None
        Number of leading rows to process; None means every row of ``df``.
    batch_size : int
        Number of processed rows after which the open transaction is committed
        and a new one is started (1000, as before, by default).

    Any exception is printed and the open transaction is rolled back; rows
    committed in earlier batches stay in the database.
    """
    if n is None:
        n = len(df)
    try:
        db.begin()
        # Loop-invariant: fetch the feature names once instead of once per row.
        feature_names = tf.get_feature_names()
        j = 0
        for i in df.index[:n]:
            tfidf_text = ' '.join(textual_analysis.top_feats_in_doc(m, feature_names, i, 50))
            data = df.loc[i]
            node = create_struttura(i, tfidf_text, data)
            if node is not None:
                node.save()
            j += 1
            if j % batch_size == 0:
                # Report the number of rows processed so far (not the row label).
                print('processed {}'.format(j))
                db.commit()
                db.begin()
        # Commit whatever is left in the last, partially filled batch.
        db.commit()
    except Exception as e:
        print('got exception ', e)
        db.rollback()

In [54]:
# Edge creation
def write_edges(n = None, starting = 782, commit_every = 10):
    """Create the ``semantic_connection`` relationships between saved nodes.

    For every row label ``i`` in ``df.index[starting:n]`` the first ``knn``
    neighbours listed in ``I[i]`` are connected to node ``i`` with the score
    ``round(D[i][o], 2)``, where ``knn`` is the value returned by
    ``get_min_or_default`` for row ``i`` of ``D``. Pairs where the neighbour is
    the node itself are skipped. Nodes are looked up by ``id_s``; edges whose
    source or destination node does not exist are only counted.

    Parameters
    ----------
    n : int or None
        End position (exclusive) in ``df.index``; None means ``len(df)``.
    starting : int
        Start position in ``df.index``. The default 782 is the offset that was
        hard-coded in this function; pass 0 to process every row.
    commit_every : int
        Number of source rows after which the transaction is committed and a
        new one is started.

    Raises
    ------
    Any exception raised while writing is propagated after the open
    transaction has been rolled back.
    """
    min_knn_indices = [get_min_or_default(row, D.shape[1]) for row in D]
    if n is None:
        n = len(df)
    counter_src_not_found = 0
    counter_dst_not_found = 0
    j = 0

    db.begin()
    try:
        for processed, i in enumerate(df.index[starting:n], start=1):
            knn = min_knn_indices[i]
            # The source node is the same for every edge of this row: query it
            # once (and not at all when the row has no neighbours to connect).
            src_node = Node.nodes.get_or_none(id_s=i) if knn else None
            for o in range(knn):
                dst = I[i][o]
                score = round(D[i][o], 2)

                # Skip edges that would start and end on the same node.
                if i == dst:
                    continue

                dst_node = Node.nodes.get_or_none(id_s=dst)

                j += 1
                if src_node is not None and dst_node is not None:
                    src_node.semantic_connection.connect(dst_node, {'score': score})
                elif src_node is None:
                    counter_src_not_found += 1
                else:
                    counter_dst_not_found += 1
                if j % 1000 == 0:
                    print('processed {}'.format(j))

            # Commit once every `commit_every` source rows (the previous
            # `if i % 10:` test committed on every row NOT divisible by 10).
            if processed % commit_every == 0:
                db.commit()
                db.begin()
        # Commit the edges written since the last periodic commit.
        db.commit()
    except Exception:
        # Do not leave a half-written transaction open on failure.
        db.rollback()
        raise

    print('source node not found {}'.format(counter_src_not_found))
    print('dest node not found {}'.format(counter_dst_not_found))


In [67]:
# Spot check: for row 790, print whether its first five candidate edges exist in the graph.
i = 790
min_knn_indices = [get_min_or_default(row, D.shape[1]) for row in D]

knn = min_knn_indices[i]
edges = [(i, I[i][o], round(D[i][o],2)) for o in range(knn)]

# The source node is the same for every edge, so it is fetched once; a missing
# node is reported instead of failing with an AttributeError on None.
src_node = Node.nodes.get_or_none(id_s=i)
if src_node is None:
    print('source node {} not found'.format(i))
else:
    for (src, dst, score) in edges[:5]:
        dst_node = Node.nodes.get_or_none(id_s=dst)
        if dst_node is None:
            print('dest node {} not found'.format(dst))
            continue
        # NOTE(review): write_edges skips pairs where src == dst, so such a pair
        # is expected to print False here.
        print(src_node.semantic_connection.is_connected(dst_node))

False
False
False
False
False


In [61]:
# Force a garbage-collection pass; the value displayed below the cell is what gc.collect() returned.
import gc
gc.collect()

1992

In [14]:
#n = 2
#for i in df.index[60000:60002]:
#    tfidf_text = ' '.join(textual_analysis.top_feats_in_doc(m,  tf.get_feature_names(), i ,50))
#    data = df.loc[i]
#    n = create_struttura(i, tfidf_text, data)
#    print(n)
    

In [15]:
#n = 2
#min_knn_indices = [get_min_or_default(row, D.shape[1]) for row in D]
#for i in df.index[60000:60002]:
#        knn = min_knn_indices[i]
#        print(knn)
#        edges = [(i, I[i][o], round(D[i][o],2)) for o in range(knn)]
#        for (src, dst, score) in edges[:5]:
#            #if(src == dst):
#                #continue
#            print(src, dst, score)

In [52]:
# Create the nodes for every row of df (n is left at None, i.e. len(df)).
write_nodes()

In [55]:
# Create the edges; with the default arguments only rows from position 782 onwards are processed.
# Must run after write_nodes(), since edges are attached to nodes looked up by id_s.
write_edges()

processed 1000
processed 2000
processed 3000
processed 4000
processed 5000
processed 6000
processed 7000
processed 8000
processed 9000
processed 10000
processed 11000
processed 12000
processed 13000
processed 14000
processed 15000
processed 16000


Exception ignored in: 'neo4j.bolt._io.ChunkedInputBuffer.receive'
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/lib/python3.6/ssl.py", line 1009, in recv_into
    return self.read(nbytes, buffer)
  File "/home/ubuntu/anaconda3/lib/python3.6/ssl.py", line 871, in read
    return self._sslobj.read(len, buffer)
  File "/home/ubuntu/anaconda3/lib/python3.6/ssl.py", line 631, in read
    v = self._sslobj.read(len, buffer)
KeyboardInterrupt: 


ServiceUnavailable: Failed to read from defunct connection Address(host='127.0.0.1', port=7687)

In [None]:
#r = PublicAdministration.nodes.get_or_none(id_s=0)

In [None]:
#type(r)

In [None]:
#r = SemanticRel()