Script de medición y borrado de las colecciones en cluster MongoDB
TFM Daniel Herranz Segundo

In [84]:
import pandas as pd
import numpy as np
import os
import json
import random
import pymongo
from pymongo import MongoClient
import time
from pprint import pprint
from pymongo import InsertOne, DeleteMany, ReplaceOne, UpdateOne
import matplotlib.pyplot as plt
import psutil

In [98]:
def save_results_to_csv(results,file):
    #Guardamos los resultados en csv
    from datetime import datetime
    
    csv_df = pd.DataFrame(results, columns=['Registros', 'Tiempo', 'CPU','Memoria'])
    dia = datetime.now().strftime("%d%m%Y_%H_%M_%S")
    csv_df.to_csv(file.format(str(dia)))

In [99]:
resultados_etl = '../Results/MongoDB/MongoDB_Delete_MultipleCollection_{}.csv'

In [100]:
#repeticiones
repeats = 100

In [101]:
write_concern = 3

#Conexion unica a colecciones
connection = MongoClient('localhost', 27017, w=3) #Conexion con WriteConcern a 3 (primario y dos nodos consolidados)

#Creamos una base de datos para el trabajo llamada TFM
tfm_mongo_db = connection["tfm_mongo_database"]

## Creación de las colecciones por cada dominio

In [102]:
CustomerProfileCollection = tfm_mongo_db["CustomerProfileCollection"]
CurrentAccountCollection = tfm_mongo_db["CurrentAccountCollection"]
PositionKeepingCollection = tfm_mongo_db["PositionKeepingCollection"]
print(connection.list_database_names())

['admin', 'config', 'local', 'tfm_mongo_database']


## Borrado de los documentos y simulación en cascada

In [103]:
def removePartyIdInDifferentsDomains(partyId):
    borrados = 0
    #Baja de un cliente -Borrado de todos los documentos relacionados con PartyId (CurrentAccount + CustomerProfile + PositionKeeping (AccountId))
    currents_accounts_to_delete = []
    resultado = CurrentAccountCollection.find({'PartyId': partyId})
    for doc in resultado:
        # Borramos todos los registros de Balance asociados a cuentas de ese PartyId
        cad = CurrentAccountCollection.delete_many({'AccountId' : doc.get('AccountId')})
        currents_accounts_to_delete.append(doc.get('AccountId'))
        borrados = borrados + cad.deleted_count
    currents_accounts_to_delete = list(dict.fromkeys(currents_accounts_to_delete))
    for accountId_to_remove in currents_accounts_to_delete:
        pkd = PositionKeepingCollection.delete_many({'AccountId': accountId_to_remove})
        borrados = borrados + pkd.deleted_count
        cpd = CustomerProfileCollection.delete_many({'PartyId' : partyId})
        borrados = borrados + cpd.deleted_count
    return borrados

In [104]:
def deletePartyIdInDatabase(partyIdList):
    registers = [] 
    for partyId in range(0,repeats):      
        time_inicial = time.time()
        removePartyIdInDifferentsDomains(random.choice(partyIdList))
        time_final = time.time()
        data_time_collection = round(time_final - time_inicial,3)
        used_cpu = psutil.cpu_percent()
        mem_used = psutil.virtual_memory().percent
        registers.append((CustomerProfileCollection.count_documents({}),data_time_collection,used_cpu,mem_used))
    return registers

In [105]:
#Seleccionamos todos los partyId existentes en la coleccion CustomerProfile para borrarlos iterativamente
partyIdList = []
resultado_query = CustomerProfileCollection.find({},{'PartyId': 1})
for customer in resultado_query:
    partyIdList.append(customer.get('PartyId'))
partyIdList = list(dict.fromkeys(partyIdList))
len(partyIdList)

999891

In [106]:
#Estado actual de las diferentes colecciones
print("CustomerProfileCollection",CustomerProfileCollection.count_documents({}),"documentos antes de borrado")
print("CurrentAccountCollection",CurrentAccountCollection.count_documents({}),"documentos antes de borrado")
print("PositionKeepingCollection",PositionKeepingCollection.count_documents({}),"documentos antes de borrado")

CustomerProfileCollection 999891 documentos antes de borrado
CurrentAccountCollection 999892 documentos antes de borrado
PositionKeepingCollection 999891 documentos antes de borrado


In [107]:
#Iteramos sobre partyId al azar (para evitar sesgo de indices) y borramos todo lo relacionado en otras colecciones
registers = deletePartyIdInDatabase(partyIdList)

In [108]:
#Estado actual de las diferentes colecciones
print("CustomerProfileCollection",CurrentAccountCollection.count_documents({}),"documentos post borrado")
print("CurrentAccountCollection",CurrentAccountCollection.count_documents({}),"documentos post borrado")
print("PositionKeepingCollection",PositionKeepingCollection.count_documents({}),"documentos post borrado")

CustomerProfileCollection 999792 documentos post borrado
CurrentAccountCollection 999792 documentos post borrado
PositionKeepingCollection 999791 documentos post borrado


In [110]:
registers[10:20]

[(999880, 0.36, 17.9, 67.7),
 (999879, 0.362, 18.7, 67.7),
 (999878, 0.376, 18.7, 67.7),
 (999877, 0.361, 17.7, 67.7),
 (999876, 0.367, 17.4, 67.7),
 (999875, 0.356, 17.4, 67.7),
 (999874, 0.367, 16.8, 67.7),
 (999873, 0.355, 17.1, 67.7),
 (999872, 0.367, 17.2, 67.7),
 (999871, 0.362, 25.3, 67.7)]

In [111]:
save_results_to_csv(registers,resultados_etl)

In [112]:
#Cierre de la conexion
connection.close()