Script de select de datos MongoDB en cluster multidomain

In [27]:
!pip install mysql-connector==2.1.7
!pip install pandas
!pip install sqlalchemy
#requiere instalación adicional, consultar https://github.com/PyMySQL/mysqlclient
!pip install mysqlclient
!pip install numpy
!pip install pymysql



In [58]:
import pandas as pd
import numpy as np
import os
import json
import random
import pymongo
from pymongo import MongoClient
import time
from pprint import pprint
from pymongo import InsertOne, DeleteMany, ReplaceOne, UpdateOne
import matplotlib.pyplot as plt
import psutil
import uuid

In [59]:
# The timing measurements of the per-domain runs are stored in these objects.
# The measurement is repeated 100 times so that averages can be computed.
# Number of repetitions
repeats = 100

In [60]:
# Output files (the {} placeholder is filled in via str.format when saving)
resultados_mongodb_select = '../Results/MongoDB/MongoDBSelect_test_{}.csv'

In [61]:
def save_results_to_csv(results,file):
    """Write benchmark measurements to a timestamped CSV file.

    Args:
        results: iterable of 4-item records, mapped in order to the columns
            'Registros', 'Tiempo', 'CPU' and 'Memoria'.
        file: path template containing one ``{}`` placeholder, which is
            replaced by the current timestamp (``ddmmYYYY_HH_MM_SS``).

    The target directory is created when it does not exist yet, so a finished
    benchmark run is not lost just because the results folder is missing.
    """
    from datetime import datetime

    csv_df = pd.DataFrame(results, columns=['Registros', 'Tiempo', 'CPU','Memoria'])
    dia = datetime.now().strftime("%d%m%Y_%H_%M_%S")
    target_path = file.format(str(dia))

    # A bare file name has no directory component; only create real directories.
    target_dir = os.path.dirname(target_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    csv_df.to_csv(target_path)

In [62]:
# Single client connection shared by all collections.
# Write concern w=3: primary plus two acknowledging nodes.
connection = MongoClient(host='localhost', port=27017, w=3)

# Database used for this work.
tfm_mongo_db = connection["tfm_mongo_database"]

In [63]:
# Create the indexes used to tune query performance.
# The collection handles are bound here so that this cell also works on a fresh
# kernel (Restart & Run All); previously they were only defined in a later cell.
CustomerProfileCollection = tfm_mongo_db["CustomerProfileCollection"]
CurrentAccountCollection = tfm_mongo_db["CurrentAccountCollection"]
PositionKeepingCollection = tfm_mongo_db["PositionKeepingCollection"]

# The former dropDups option is omitted: it is not supported by MongoDB 3.0+,
# so a unique index build fails on duplicates regardless of it.
# NOTE(review): CurrentAccountCollection is joined on "PartyId" by the select
# pipeline but only indexed on "AccountId" — confirm whether that is intended.
CustomerProfileCollection.create_index("PartyId", unique=True)
CurrentAccountCollection.create_index("AccountId", unique=True)
PositionKeepingCollection.create_index("AccountId", unique=True)

'AccountId_1'

In [64]:
# Bind one handle per domain collection of the working database.
CustomerProfileCollection, CurrentAccountCollection, PositionKeepingCollection = (
    tfm_mongo_db[collection_name]
    for collection_name in (
        "CustomerProfileCollection",
        "CurrentAccountCollection",
        "PositionKeepingCollection",
    )
)
# Show the databases visible through this connection.
print(connection.list_database_names())

['admin', 'config', 'local', 'tfm_mongo_database']


# Select test multidomain

In [65]:
# Collect one {"PartyId": ...} document per customer profile (the _id field is excluded).
partyId_list = [
    party_doc
    for party_doc in CustomerProfileCollection.find({},{ "_id": 0, "PartyId": 1})
]
# Display how many PartyId documents were loaded.
len(partyId_list)

1000000

In [66]:
    select_query = """SELECT * FROM CustomerProfileDomainSchema.CustomerProfile cp
INNER JOIN CurrentAccountDomainSchema.CurrentAccount ca ON ca.PartyId = cp.PartyId
INNER JOIN CurrentAccountDomainSchema.AccountInfo ai ON ai.AccountId = ca.AccountId
INNER JOIN PositionKeepingDomainSchema.PositionKeeping pk ON pk.AccountId = ca.AccountId
INNER JOIN PositionKeepingDomainSchema.Amount am ON am.AmountId = pk.AmountId
INNER JOIN PositionKeepingDomainSchema.CreditLine cl ON cl.CreditLineId = pk.CreditLineId
INNER JOIN PositionKeepingDomainSchema.Currency cr ON cr.CurrencyId = am.CurrencyId
WHERE cp.PartyId = {}"""

In [73]:
def generate_select_pipeline(party_id=None):
    """Build the aggregation pipeline that selects one customer across domains.

    Args:
        party_id: PartyId value to match. When omitted, a random entry of the
            module-level ``partyId_list`` is used (its "PartyId" field).

    Returns:
        A list with three stages: a ``$match`` on PartyId, a ``$lookup`` into
        CurrentAccountCollection on PartyId (stored as "CurrentAccount"), and a
        ``$lookup`` into PositionKeepingCollection on the AccountId values of
        the joined accounts (stored as "PositionKeeping").

    Raises:
        IndexError: if no ``party_id`` is given and ``partyId_list`` is empty.
    """
    if party_id is None:
        party_id = random.choice(partyId_list)["PartyId"]

    # The value is passed through unchanged: converting it to a string would make
    # the $match miss every document whose PartyId is not stored as a string.
    match = {"$match": {"PartyId": party_id}}
    lookup_ca = {"$lookup": {"from": "CurrentAccountCollection", "localField": "PartyId", "foreignField": "PartyId", "as": "CurrentAccount"}}
    lookup_pk = {"$lookup": {"from": "PositionKeepingCollection", "localField": "CurrentAccount.AccountId", "foreignField": "AccountId", "as": "PositionKeeping"}}

    return [match, lookup_ca, lookup_pk]

In [74]:
# Benchmark loop: runs the select pipeline `repeats` times and records, per
# iteration, a tuple (iteration number, elapsed seconds, CPU %, memory %).
time_inicial = 0
time_final = 0
registers = []

# Prime psutil: the first cpu_percent() call without an interval returns a
# meaningless 0.0, which would otherwise end up in the first measurement.
psutil.cpu_percent()

for iteracion in range(0,repeats):
    # Build the pipeline (random PartyId pick) outside the timed region.
    pipeline = generate_select_pipeline()
    # perf_counter is monotonic and high resolution, unlike wall-clock time.time().
    time_inicial = time.perf_counter()
    # Drain the cursor so that fetching the result is part of the measurement.
    list(CustomerProfileCollection.aggregate(pipeline))
    time_final = time.perf_counter()
    # CPU usage since the previous cpu_percent() call and current memory usage.
    used_cpu = psutil.cpu_percent()
    mem_used = psutil.virtual_memory().percent
    total_time = round(time_final - time_inicial,3)
    registers.append((iteracion + 1, total_time ,used_cpu, mem_used))

In [75]:
# Preview a slice of the measurements (list positions 10 to 19).
registers[10:20]

[(11, 0.357, 33.6, 71.1),
 (12, 0.352, 36.2, 71.1),
 (13, 0.352, 34.0, 71.1),
 (14, 0.355, 36.8, 71.2),
 (15, 0.351, 34.0, 71.2),
 (16, 0.358, 34.4, 71.2),
 (17, 0.355, 34.9, 71.2),
 (18, 0.352, 34.4, 71.2),
 (19, 0.35, 33.8, 71.2),
 (20, 0.351, 33.5, 71.2)]

In [76]:
# Save the collected measurements to the MongoDB select results CSV
save_results_to_csv(registers,resultados_mongodb_select)

In [77]:
# Close the MongoDB client connection and confirm it on stdout.
connection.close()
print('Conexion cerrada')

Conexion cerrada
