In [None]:
import os
import pickle
from time import time

import numpy as np
import pandas as pd
import scipy
import scipy.sparse

from utils import textual_analysis

In [None]:
# Directory prefix (trailing slash included) onto which the data-loading
# cells concatenate their file names.
base_path = '../web_api/data/'

In [None]:
# base_path already ends with '/', so the file name is appended without a
# leading slash to avoid a doubled separator in the path.
df_file_name = base_path + 'oggetti_aggregati_arricchiti.pickle'

# Time the load so the cost of this cell is visible in the output.
t = time()
# NOTE(review): unpickling can execute code stored in the file; only load
# files from a trusted source.
df = pd.read_pickle(df_file_name)
total = time() - t

print("Dataset loaded in %0.3f sec" % total)

# Column names, dtypes and memory footprint of the loaded frame.
df.info()

In [None]:
# `tf` is the pickled object that later provides get_feature_names()
# (presumably the fitted TF-IDF vectorizer; confirm against the producer).
# The context manager closes the file handle as soon as unpickling is done.
# NOTE(review): unpickling can execute code stored in the file; only load
# files from a trusted source.
with open(base_path + "tf.pickle", "rb") as tf_file:
    tf = pickle.load(tf_file)

# Sparse matrix whose rows are passed, together with the feature names,
# to textual_analysis.top_feats_in_doc.
m = scipy.sparse.load_npz(base_path + 'sparse_matrix_tfidf.npz')

In [None]:
# Both similarity files live in the shared data directory and share a common
# file-name prefix; build the paths from base_path instead of repeating the
# directory literal. The resulting paths are unchanged.
similarity_prefix = base_path + "normalized_matrix_embedding_49312terms_weigthedTrue.npy"

# I[i][o] is used downstream as the id of the o-th neighbour of row i,
# D[i][o] as the score attached to that pair.
I = np.load(similarity_prefix + "similarity_I_1000_k.npy")
D = np.load(similarity_prefix + "similarity_D_1000_k.npy")

In [None]:
def getMinOrDefault(array, default_value, threshold = 0.6):
    """
    Return the index of the first element of `array` that is below `threshold`.

    Parameters
    ----------
    array : array-like
        One-dimensional sequence of scores.
    default_value : int
        Value returned when no element of `array` is below `threshold`
        (this includes an empty `array`).
    threshold : float, optional
        Exclusive upper bound an element must stay under to be selected.

    Returns
    -------
    int
        Smallest index `k` with `array[k] < threshold`, or `default_value`
        when there is no such index.
    """
    # flatnonzero yields the matching positions in ascending order, so the
    # first entry is the smallest index; no exception handling is needed for
    # the "nothing matched" case and no global state is consulted.
    below_threshold = np.flatnonzero(np.asarray(array) < threshold)
    if below_threshold.size == 0:
        return default_value
    return int(below_threshold[0])

In [None]:
t0 = time()
# Fallback neighbour count handed to getMinOrDefault as its default_value.
d = 10
# Build the per-row cut-offs eagerly so the timing below measures the actual
# computation rather than the creation of a lazy generator. The list is
# wrapped in iter() because the consumer pulls values with next().
min_knn_indices = iter([getMinOrDefault(row, d) for row in D])
total = time() - t0

print("Min indices done in %0.3f sec" % total)

In [None]:
from neomodel import db
# The star import supplies the neomodel names (config, StructuredNode,
# StructuredRel, the property classes, RelationshipTo) used by later cells.
from neomodel import *

# Configure the pool size before opening the connection, since the setting
# is taken into account when the driver is created.
config.MAX_POOL_SIZE = 100

# The bolt URL (which embeds the credentials) can be supplied through the
# NEO4J_BOLT_URL environment variable; the fallback is the local development
# instance.
db.set_connection(os.environ.get('NEO4J_BOLT_URL', 'bolt://neo4j:password@localhost:7687'))

In [None]:
def get_min_or_default(array, default_value, threshold = 0.7):
    """
    Return the index of the first element of `array` that is below `threshold`.

    Parameters
    ----------
    array : array-like
        One-dimensional sequence of scores.
    default_value : int
        Value returned when no element of `array` is below `threshold`
        (this includes an empty `array`).
    threshold : float, optional
        Exclusive upper bound an element must stay under to be selected.

    Returns
    -------
    int
        Smallest index `k` with `array[k] < threshold`, or `default_value`
        when there is no such index.
    """
    # Matching positions come back in ascending order, so the first one is
    # the smallest index. The fallback is the caller-supplied default rather
    # than a value read from a global array.
    below_threshold = np.flatnonzero(np.asarray(array) < threshold)
    if below_threshold.size == 0:
        return default_value
    return int(below_threshold[0])


def get_top_scores(Xtr, features, row_id, top_n=25):
    '''Return the highest positive values of one row of Xtr as formatted strings.

    The row `row_id` of the sparse matrix `Xtr` is densified, its `top_n`
    largest entries are visited in descending order, and every strictly
    positive one is rendered with two decimals (e.g. ['0.34', '0.14', '0.06']).
    `features` is accepted for signature symmetry with get_top_terms and is
    not used.
    '''
    dense_row = Xtr[row_id].toarray().squeeze()
    ranked_ids = np.argsort(dense_row)[::-1][:top_n]

    formatted_scores = []
    for idx in ranked_ids:
        value = dense_row[idx]
        if value > 0:
            formatted_scores.append("{0:.2f}".format(value))
    return formatted_scores

def get_top_terms(Xtr, features, row_id, top_n=25):
    '''Return the feature names behind the highest positive values of one row.

    The row `row_id` of the sparse matrix `Xtr` is densified, its `top_n`
    largest entries are visited in descending order, and for every strictly
    positive one the matching entry of `features` is collected
    (e.g. ['stand', 'catering', 'fornitura']).
    '''
    dense_row = Xtr[row_id].toarray().squeeze()
    ranked_ids = np.argsort(dense_row)[::-1][:top_n]

    selected_terms = []
    for idx in ranked_ids:
        if dense_row[idx] > 0:
            selected_terms.append(features[idx])
    return selected_terms

In [None]:
class SemanticRel(StructuredRel):
    """Relationship model carrying a numeric `score`; used as the model of Node.semantic_connection."""
    # Score stored on the relationship; 0 when none is supplied.
    score = FloatProperty(default=0)
    
class TfIDfRel(StructuredRel):
    """Relationship model carrying a numeric `score`.

    NOTE(review): no relationship in the visible cells uses this model.
    """
    # Score stored on the relationship; 0 when none is supplied.
    score = FloatProperty(default=0)

class Node(StructuredNode):
    """Graph node describing one structure (a public administration or a company).

    Instances are built by create_struttura, which fills a different subset
    of the optional properties depending on the structure kind.
    """
    # Numeric code of the structure kind; create_struttura writes 1 for 'PA'
    # and 2 for 'COMPANY'.
    type_id = IntegerProperty()
    # Textual structure kind ('PA' or 'COMPANY' when built by create_struttura).
    node_type = StringProperty(index=True)
    
    # Identifier used to look nodes up when relationships are created.
    id_s = IntegerProperty(index=True)
    fiscal_code = StringProperty(index=True)
    # Text describing the node's most relevant terms, as passed to create_struttura.
    relevant_terms = StringProperty(index=True)
    region = StringProperty(index=True)
    province = StringProperty(index=True)
    city = StringProperty(index=True) 
    address = StringProperty(index=True)
    
    # Only set by create_struttura for 'PA' nodes; empty string otherwise.
    istat_code = StringProperty(index=True, default='')
    administrative_code = StringProperty(index=True, default='')
    
    # company_name and company_type are only set by create_struttura for
    # 'COMPANY' nodes; `nation` is never set there and keeps its default.
    company_name = StringProperty(index=True, default='')
    company_type = StringProperty(index=True, default='')
    nation = StringProperty(index=True, default='')
    
    # Outgoing 'semantic_connected' relationship to another Node, with a
    # SemanticRel (score) attached.
    semantic_connection = RelationshipTo('Node','semantic_connected', model=SemanticRel)

In [None]:
def create_struttura(id_s, best_terms, props):
    """Build (without saving) the Node that represents one structure record.

    Parameters
    ----------
    id_s :
        Identifier stored in the node's `id_s` property.
    best_terms :
        Value stored in the node's `relevant_terms` property.
    props :
        Mapping-like record; `props['tipoStruttura']` selects the node kind
        ('PA' or 'AGG') and the remaining keys provide the property values.

    Returns
    -------
    Node or None
        A new, unsaved Node for 'PA' and 'AGG' records; None (after printing
        a message) for any other structure type.
    """
    structure_type = props['tipoStruttura']

    if structure_type == 'PA':
        pa_fields = {
            'type_id': 1,
            'node_type': 'PA',
            'id_s': id_s,
            'fiscal_code': props['codiceFiscaleStruttura'].strip(),
            'relevant_terms': best_terms,
            'region': props['regione'],
            'province': props['provincia'],
            'city': props['comune'],
            'address': props['indirizzo'],
            'istat_code': props['tipologia_istat'],
            'administrative_code': props['tipologia_amm'],
        }
        return Node(**pa_fields)

    if structure_type == 'AGG':
        company_fields = {
            'type_id': 2,
            'node_type': 'COMPANY',
            'id_s': id_s,
            'fiscal_code': props['codiceFiscaleStruttura'].strip(),
            'relevant_terms': best_terms,
            'company_name': props['ragione_sociale'],
            'company_type': props['forma_societaria'],
            'region': props['regione'],
            'province': props['provincia'],
            'city': props['comune'],
            'address': props['indirizzo'],
        }
        return Node(**company_fields)

    # Unknown structure kinds are reported and skipped by the caller.
    print('error structure not recognized {}'.format(props['tipoStruttura']))
    return None

In [None]:
# Display the record labelled 0 to check which fields are available.
df.loc[0]

In [None]:
# Number of leading rows of df.index handled by the loading cells below;
# set to None to process every row.
n = 4000

In [None]:
if n is None:
    n = len(df)
try:
    db.begin()
    # The feature names do not depend on the row, so they are fetched once
    # instead of being rebuilt on every iteration. The call stays inside the
    # try block so a failure is still reported and rolled back.
    feature_names = tf.get_feature_names()
    j = 0
    for i in df.index[:n]:
        # Space-joined top 20 features of row i, stored on the node as text.
        tfidf_text = ' '.join(textual_analysis.top_feats_in_doc(m, feature_names, i, 20))
        data = df.loc[i]
        node = create_struttura(i, tfidf_text, data)
        # create_struttura returns None for unrecognised structure types.
        if node is not None:
            node.save()
        j += 1
        # Commit in batches of 1000 rows and open a new transaction.
        if j % 1000 == 0:
            print('processed {}'.format(i))
            db.commit()
            db.begin()
    # Commit whatever is left in the last, partial batch.
    db.commit()
except Exception as e:
    # Best effort: report the problem and discard the open batch.
    print('got exception ', e)
    db.rollback()

In [None]:

if n is None:
    n = len(df)
counter_src_not_found = 0
counter_dst_not_found = 0
j = 0
for i in df.index[:n]:
    # Number of neighbours to link for row i, computed directly from its
    # score row so the cell can be re-run without exhausting a one-shot
    # iterator.
    knn = getMinOrDefault(D[i], d)
    # The source node is the same for every edge of this row: query it once.
    src_node = Node.nodes.get_or_none(id_s=i)
    # NOTE(review): if I[i][0] is row i itself this creates a relationship
    # from the node to itself; confirm whether that is intended.
    for o in range(knn):
        j += 1
        if src_node is None:
            counter_src_not_found += 1
        else:
            # Convert numpy scalars to plain Python values before they are
            # handed to the database layer.
            dst_node = Node.nodes.get_or_none(id_s=int(I[i][o]))
            if dst_node is None:
                counter_dst_not_found += 1
            else:
                score = round(float(D[i][o]), 2)
                src_node.semantic_connection.connect(dst_node, {'score': score})
        if j % 1000 == 0:
            print('processed {}'.format(j))

print('source node not found {}'.format(counter_src_not_found))
print('dest node not found {}'.format(counter_dst_not_found))


In [None]:
import gc
# Run a full garbage-collection pass; the cell output is the number of
# unreachable objects found.
gc.collect()

In [None]:
# Fetch the node whose id_s is 0 (None when it does not exist). Node is the
# only node class defined in this notebook.
r = Node.nodes.get_or_none(id_s=0)

In [None]:
# Show the class of the object currently bound to `r`.
type(r)

In [None]:
# Instantiate the relationship model directly; note that this rebinds `r`.
r = SemanticRel()