In [24]:
from utils import textual_analysis
import pandas as pd
import numpy as np

from time import time
import pickle
import scipy

In [25]:
df_file_name = "data/oggetti_aggregati.pickle"

t = time()
df = pd.read_pickle(df_file_name)
total = time() - t

print("Dataset loaded in %0.3f sec" % total)

df.info()

Dataset loaded in 0.299 sec
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307265 entries, 0 to 307264
Data columns (total 3 columns):
codiceFiscaleStruttura    307265 non-null object
oggetto                   307265 non-null object
tipoStruttura             307265 non-null object
dtypes: object(3)
memory usage: 7.0+ MB


In [26]:
tf = pickle.load(open("data/n6/tf.pickle", "rb"))
m = scipy.sparse.load_npz('data/n6/sparse_matrix_tfidf.npz')

I = np.load("data/n6/normalized_matrix_embedding_52004terms_weigthedTrue.npysimilarity_I_1000_k.npy")
D = np.load("data/n6/normalized_matrix_embedding_52004terms_weigthedTrue.npysimilarity_D_1000_k.npy")

In [27]:
def getMinOrDefault(array, default_value, threshold = 0.6):
    """
    return the index of the smallest element compared to the threshold
    """
    res = D.shape[1]
    try:
        res = np.min(np.where(array < threshold))
    except:
        return res
    return res

In [28]:
t0 = time()
# filter the top k
d = D.shape
min_knn_indices = (getMinOrDefault(row, d) for row in D)
total = time() - t0

print("Min indices done in %0.3f sec" % total)

Min indices done in 0.058 sec


In [29]:
from neomodel import db
from neomodel import *
db.set_connection('bolt://neo4j:password@localhost:7687')
config.MAX_POOL_SIZE = 100

In [30]:
def create_struttura(id_s, knn, best_terms, series_oggetti):
    struttura = Struttura(
        id_s = id_s,
        codice_fiscale = series_oggetti['codiceFiscaleStruttura'].strip(),
        tipo_struttura = series_oggetti['tipoStruttura'],
        num_outlinks = knn,
        oggetto = series_oggetti['oggetto'],
        relevant_terms = best_terms
    )
    return struttura

In [31]:
class SemanticRel(StructuredRel):
    score = FloatProperty(default=0)
    
class TfIDfRel(StructuredRel):
    score = FloatProperty(default=0)

class Struttura(StructuredNode):
    id_s = IntegerProperty(index=True)
    codice_fiscale = StringProperty(index=True)
    tipo_struttura = StringProperty(index=True)
    num_outlinks = IntegerProperty(index=True)
    oggetto = StringProperty()
    relevant_terms = StringProperty(index=True)
    
    semantic_rel = RelationshipTo('Struttura', 'SEMANTIC', model=SemanticRel)
    tfidf_rel = RelationshipTo('Struttura', 'TF_IDF', model=TfIDfRel)
    

In [10]:
def write_nodes():
    try:
        db.begin()
        i = 0
        for i in df.index:
            tfidf_text = ' '.join(textual_analysis.top_feats_in_doc(m,  tf.get_feature_names(), i ,20))
            data = df.iloc[i]
            knn = next(min_knn_indices)
            struttura = create_struttura(i, knn, tfidf_text, data)
            struttura.save()
            i +=1 
            if i % 1000 == 0:
                print('processed {}'.format(i))
                db.commit()
                return
                db.begin()
        db.commit()
    except Exception as e:
        print('got exception ', e)
        db.rollback()

In [37]:
def write_edges():
#     try:
    db.begin()
    i = 0
    for i in df.index:
        tfidf_text = ' '.join(textual_analysis.top_feats_in_doc(m,  tf.get_feature_names(), i ,20))
        data = df.iloc[i]
        knn = next(min_knn_indices)
        edges = [(i, I[i][o], round(D[i][o],2)) for o in range(knn)]
        for (src, dst, score) in edges:
            src_node = Struttura.nodes.get_or_none(id_s=src)
            dst_node = Struttura.nodes.get_or_none(id_s=dst)
            if src_node is not None and dst_node is not None:
                src_node.semantic_rel.connect(dst_node, {'score': score})
            i +=1 
            if i % 1000 == 0:
                print('processed {}'.format(i))
                db.commit()
                return
                db.begin()
        db.commit()
        
#     except Exception as e:
#         print('got exception ', e)
#         db.rollback()          

In [None]:
write_nodes()

In [38]:
write_edges()

processed 1000


In [36]:
Struttura.nodes.get_or_none(id_s=1)

<Struttura: {'id_s': 1, 'codice_fiscale': '', 'tipo_struttura': 'AGG', 'num_outlinks': 1000, 'oggetto': 'servizio 22202 organizzazione incontri daffari limitato ai servizi personalizzati agiuntivi albergo visto suto con autista transfer e CIP  cosmoprof 47 dicembre 2017 ORDINE DACQUISTO Parandis per Prenotazione e riservazione albergo con visto per Sig Drusian di azienda Poliplast  SERVIZIO 22202 Organizzazione incontri daffari  Acquisto biglietti aerei TeheranIsfahanTeheran e prenotazione albergo a Isfahan per il vice responsabile ufficio ICE Teheran e TA incaricato in occasione fiera Isfahan Gold 2017 a Isfahan Prenotazione albergo per il Sig Antonio Passarelli in occasione fiera IranConMin 2017', 'relevant_terms': 'albergo:0.46 daffari:0.36 visto:0.29 prenotazione:0.28 incontri:0.21 sig:0.21 fiera:0.19 teheran:0.18 occasione:0.16 organizzazione:0.16 cip:0.16 cosmoprof:0.15 limitato:0.15 incaricato:0.15 vice:0.14 gold:0.13 ice:0.13 ta:0.12 transfer:0.12 autista:0.12', 'id': 1}>

In [16]:
for i in df.index:
    tfidf_text = ' '.join(textual_analysis.top_feats_in_doc(m,  tf.get_feature_names(), i ,20))
    data = df.iloc[i]
    knn = next(min_knn_indices)
    struttura = create_struttura(i, knn, tfidf_text, data)
    edges = [(i, I[i][o], round(D[i][o],2)) for o in range(knn)]
    break
#     s = from_oggettiAggregati_to_Struttura(i, knn, tfidf_text, data)
#     edges = (Edge(i, I[i][o], round(D[i][o],2)) for o in range(knn))
#     if(i == 10):
#         break

In [17]:
edges

[(0, 0, 1.0),
 (0, 307131, 0.97),
 (0, 260398, 0.95),
 (0, 302157, 0.95),
 (0, 260397, 0.95),
 (0, 298854, 0.95),
 (0, 298372, 0.95),
 (0, 302868, 0.95),
 (0, 298493, 0.95),
 (0, 300846, 0.94),
 (0, 305346, 0.94),
 (0, 306831, 0.94),
 (0, 299601, 0.94),
 (0, 305124, 0.94),
 (0, 299723, 0.94),
 (0, 300885, 0.94),
 (0, 305489, 0.94),
 (0, 301966, 0.94),
 (0, 307003, 0.94),
 (0, 296705, 0.94),
 (0, 301995, 0.94),
 (0, 305984, 0.94),
 (0, 300315, 0.94),
 (0, 297697, 0.94),
 (0, 301339, 0.94),
 (0, 299561, 0.94),
 (0, 306318, 0.94),
 (0, 100369, 0.94),
 (0, 305409, 0.94),
 (0, 296824, 0.94),
 (0, 300879, 0.94),
 (0, 303802, 0.94),
 (0, 299935, 0.94),
 (0, 297370, 0.94),
 (0, 301381, 0.94),
 (0, 306696, 0.94),
 (0, 305493, 0.94),
 (0, 184930, 0.94),
 (0, 299445, 0.94),
 (0, 301664, 0.94),
 (0, 304198, 0.94),
 (0, 302903, 0.94),
 (0, 297739, 0.94),
 (0, 306186, 0.94),
 (0, 300890, 0.94),
 (0, 302294, 0.94),
 (0, 304735, 0.94),
 (0, 297823, 0.94),
 (0, 301317, 0.94),
 (0, 306627, 0.94),
 (0, 3

In [None]:
struttura.save()

In [None]:
Struttura.nodes.get(id=0)