In [None]:
import os
import pickle
from time import time

import numpy as np
import pandas as pd
import scipy
import scipy.sparse

from utils import textual_analysis

In [None]:
# Path of the pickled dataset, relative to the notebook's working directory.
df_file_name = "data/oggetti_aggregati.pickle"

# Time the load so that slow I/O is visible in the cell output.
t = time()
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle(df_file_name)
total = time() - t

print("Dataset loaded in %0.3f sec" % total)

# Print a summary of the loaded object (presumably a DataFrame — TODO confirm).
df.info()

In [None]:
# Object exposing get_feature_names(), used by write_nodes to obtain the
# vocabulary (presumably a fitted scikit-learn vectorizer — TODO confirm).
# The file is opened in a `with` block so the handle is closed deterministically.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("data/n6/tf.pickle", "rb") as tf_file:
    tf = pickle.load(tf_file)

# Sparse matrix passed to textual_analysis.top_feats_in_doc together with a row id.
m = scipy.sparse.load_npz('data/n6/sparse_matrix_tfidf.npz')

# Nearest-neighbour tables: write_edges uses I[i][o] as the destination node id
# and D[i][o] as the score of the o-th edge leaving node i.
I = np.load("data/n6/normalized_matrix_embedding_52004terms_weigthedTrue.npysimilarity_I_1000_k.npy")
D = np.load("data/n6/normalized_matrix_embedding_52004terms_weigthedTrue.npysimilarity_D_1000_k.npy")

In [None]:
def getMinOrDefault(array, default_value, threshold = 0.6):
    """
    Return the index of the first element of ``array`` that is below ``threshold``.

    Parameters
    ----------
    array : 1-D array-like of numbers
        Values to scan (it is flattened before scanning).
    default_value : int or any
        Fallback returned when no element is below ``threshold``. It is only
        honoured when it is an integer; for any other value (for example the
        shape tuple passed by the existing caller) the number of elements in
        ``array`` is returned instead, i.e. "no cut-off, keep every element".
    threshold : float
        Exclusive upper bound an element must be under to count as a hit.

    Returns
    -------
    int
        Position of the first element ``< threshold``, or the fallback.
    """
    values = np.asarray(array).ravel()
    below = np.flatnonzero(values < threshold)
    if below.size:
        return int(below[0])
    # No element under the threshold: use the explicit integer default when
    # one was given, otherwise fall back to the length of the input itself
    # rather than reading the global D.
    if isinstance(default_value, (int, np.integer)) and not isinstance(default_value, bool):
        return int(default_value)
    return int(values.size)

In [None]:
t0 = time()
# filter the top k
# For every row of D, find the index of the first value below the threshold.
# The list is built eagerly: a bare generator expression returns immediately,
# so the timer below would otherwise measure nothing.
d = D.shape
min_knn = [getMinOrDefault(row, d) for row in D]
# Exposed as an iterator so that code calling next(min_knn_indices) keeps working.
min_knn_indices = iter(min_knn)
total = time() - t0

print("Min indices done in %0.3f sec" % total)

In [None]:
from neomodel import db
# NOTE(review): the wildcard import is kept because other cells rely on the
# names it injects (StructuredNode, StringProperty, ...); prefer explicit imports.
from neomodel import *
from neomodel import config

# The pool size has to be configured before the connection is opened:
# the driver is created inside set_connection, so a later assignment is not
# seen by the already-created driver.
config.MAX_POOL_SIZE = 100
# The URL can be overridden through NEO4J_BOLT_URL so that real credentials do
# not have to live in the notebook; the default is the original local setup.
db.set_connection(os.environ.get('NEO4J_BOLT_URL', 'bolt://neo4j:password@localhost:7687'))

In [None]:
def create_struttura(id_s, knn, best_terms, series_oggetti):
    """
    Build a Struttura node object from one dataset row.

    The node is only constructed here; saving it is left to the caller.

    Parameters
    ----------
    id_s : value stored in the node's ``id_s`` property.
    knn : value stored in ``num_outlinks``.
    best_terms : value stored in ``relevant_terms``.
    series_oggetti : mapping-like row providing the keys
        'codiceFiscaleStruttura' (stripped of surrounding whitespace),
        'tipoStruttura' and 'oggetto'.

    Returns
    -------
    Struttura
    """
    node_fields = {
        'id_s': id_s,
        'codice_fiscale': series_oggetti['codiceFiscaleStruttura'].strip(),
        'tipo_struttura': series_oggetti['tipoStruttura'],
        'num_outlinks': knn,
        'oggetto': series_oggetti['oggetto'],
        'relevant_terms': best_terms,
    }
    return Struttura(**node_fields)

In [8]:
class SemanticRel(StructuredRel):
    """Relationship model for 'SEMANTIC' edges; carries a single float score."""
    # Edge weight; 0 when no score is supplied on connect().
    score = FloatProperty(default=0)
    
class TfIDfRel(StructuredRel):
    """Relationship model for 'TF_IDF' edges; carries a single float score."""
    # Edge weight; 0 when no score is supplied on connect().
    score = FloatProperty(default=0)

class Struttura(StructuredNode):
    """Graph node for one dataset row, linked to other Struttura nodes by scored edges."""
    # Identifier used by write_edges to look nodes up (get_or_none(id_s=...)).
    id_s = IntegerProperty(index=True)
    # Filled from the row's 'codiceFiscaleStruttura' value by create_struttura.
    codice_fiscale = StringProperty(index=True)
    # Filled from the row's 'tipoStruttura' value by create_struttura.
    tipo_struttura = StringProperty(index=True)
    # Number of outgoing neighbour links computed for this node.
    num_outlinks = IntegerProperty(index=True)
    # Filled from the row's 'oggetto' value; the only property declared without an index.
    oggetto = StringProperty()
    # Space-joined top terms produced for this row by write_nodes.
    relevant_terms = StringProperty(index=True)
    
    # Outgoing 'SEMANTIC' edges to other Struttura nodes, with a SemanticRel score.
    semantic_rel = RelationshipTo('Struttura', 'SEMANTIC', model=SemanticRel)
    # Outgoing 'TF_IDF' edges to other Struttura nodes, with a TfIDfRel score.
    tfidf_rel = RelationshipTo('Struttura', 'TF_IDF', model=TfIDfRel)
    

In [10]:
def write_nodes(n):
    """
    Create and save one Struttura node for each of the first ``n`` index entries of ``df``.

    Nodes are written inside explicit transactions that are committed every
    1000 nodes (and once more at the end). If anything raises, the error is
    printed and the open transaction is rolled back; batches that were already
    committed stay in the database.

    The number of out-links of node ``i`` is computed directly from row ``i``
    of ``D`` instead of being pulled from the shared ``min_knn_indices``
    iterator, so the result no longer depends on how many values other calls
    have already consumed from that iterator.

    Parameters
    ----------
    n : int
        Number of leading entries of ``df.index`` to process.
    """
    try:
        db.begin()
        # The vocabulary is the same for every row: fetch it once instead of
        # rebuilding it on each iteration.
        feature_names = tf.get_feature_names()
        shape = D.shape
        # NOTE(review): `i` is an index label but is used as a position in
        # df.iloc, m and D; this assumes df has the default 0..N-1 index — TODO confirm.
        for j, i in enumerate(df.index[:n], start=1):
            tfidf_text = ' '.join(textual_analysis.top_feats_in_doc(m, feature_names, i, 20))
            data = df.iloc[i]
            knn = int(getMinOrDefault(D[i], shape))
            struttura = create_struttura(i, knn, tfidf_text, data)
            struttura.save()
            if j % 1000 == 0:
                print('processed {}'.format(i))
                # Commit in batches to keep each transaction small.
                db.commit()
                db.begin()
        db.commit()
    except Exception as e:
        print('got exception ', e)
        db.rollback()

In [11]:
def write_edges(n):
    """
    Create the 'SEMANTIC' edges leaving each of the first ``n`` index entries of ``df``.

    For node ``i`` the first ``knn`` neighbours are linked, where ``knn`` is
    the cut-off computed from row ``i`` of ``D``. The o-th edge goes to the
    node whose ``id_s`` is ``I[i][o]`` and carries ``D[i][o]`` rounded to two
    decimals as its score. Edges whose source or destination node does not
    exist are skipped and counted; both counters are printed at the end.
    Any exception is printed and stops the loop (best effort), after which the
    counters gathered so far are still reported.

    ``knn`` is computed from ``D[i]`` rather than pulled from the shared
    ``min_knn_indices`` iterator: that iterator is also consumed by
    write_nodes, so reading it here would return the cut-offs of later rows.

    Parameters
    ----------
    n : int
        Number of leading entries of ``df.index`` to process.
    """
    # Initialised outside the try block so the final report can always be printed.
    counter_src_not_found = 0
    counter_dst_not_found = 0
    try:
        shape = D.shape
        j = 0
        # NOTE(review): `i` is an index label used as a row position in I and D;
        # this assumes df has the default 0..N-1 index — TODO confirm.
        for i in df.index[:n]:
            knn = int(getMinOrDefault(D[i], shape))
            if knn == 0:
                continue
            # The source node is the same for every edge of this row: look it
            # up once instead of once per edge.
            src_node = Struttura.nodes.get_or_none(id_s=i)
            for o in range(knn):
                j += 1
                if src_node is None:
                    counter_src_not_found += 1
                else:
                    # Plain Python numbers are passed to the database layer
                    # instead of numpy scalars.
                    dst_node = Struttura.nodes.get_or_none(id_s=int(I[i][o]))
                    if dst_node is None:
                        counter_dst_not_found += 1
                    else:
                        score = round(float(D[i][o]), 2)
                        src_node.semantic_rel.connect(dst_node, {'score': score})
                if j % 1000 == 0:
                    print('processed {}'.format(i))
    except Exception as e:
        print('got exception ', e)

    print('source node not found {}'.format(counter_src_not_found))
    print('dest node not found {}'.format(counter_dst_not_found))


In [None]:
# Create the nodes for the first 2000 index entries of df.
write_nodes(2000)

In [12]:
# Create the SEMANTIC edges for the first 2000 entries; edges whose endpoint
# nodes are missing are skipped and counted, so run this after write_nodes.
write_edges(2000)

processed 0
processed 1
processed 2
processed 4
processed 5
processed 7
processed 9
processed 10
processed 11
processed 12
processed 13
processed 14
processed 16
processed 17
processed 18
processed 19
processed 20
processed 21
got exception  Failed to read from defunct connection Address(host='127.0.0.1', port=7687)


Exception ignored in: 'neo4j.bolt._io.ChunkedInputBuffer.receive'
Traceback (most recent call last):
  File "/home/fabio/miniconda3/envs/dl/lib/python3.6/ssl.py", line 1009, in recv_into
    return self.read(nbytes, buffer)
  File "/home/fabio/miniconda3/envs/dl/lib/python3.6/ssl.py", line 871, in read
    return self._sslobj.read(len, buffer)
  File "/home/fabio/miniconda3/envs/dl/lib/python3.6/ssl.py", line 631, in read
    v = self._sslobj.read(len, buffer)
KeyboardInterrupt: 


In [None]:
# Sanity check: look up the node with id_s == 0 and display the result.
Struttura.nodes.get_or_none(id_s=0)