Script de medición y borrado de las colecciones en cluster MongoDB
TFM Daniel Herranz Segundo

In [1]:
import pandas as pd
import numpy as np
import os
import json
import random
import pymongo
from pymongo import MongoClient
import time
from pprint import pprint
from pymongo import InsertOne, DeleteMany, ReplaceOne, UpdateOne
import matplotlib.pyplot as plt
import psutil

In [52]:
def save_results_to_csv(results, file, write_concern_value=None):
    """Persist benchmark measurements to a timestamped CSV file.

    Args:
        results: Measurement rows with the fields
            ``(Registros, Tiempo, CPU, Memoria)``. Accepts either a sequence
            of 4-tuples or a DataFrame that already has those columns.
        file: Path template with two ``{}`` placeholders. The first is filled
            with the current timestamp and the second with the write concern.
        write_concern_value: Write concern to embed in the file name. When
            omitted, the module-level ``write_concern`` variable is used, so
            existing two-argument calls keep working.

    Raises:
        NameError: If ``write_concern_value`` is omitted and no module-level
            ``write_concern`` has been defined yet.
    """
    from datetime import datetime

    if write_concern_value is None:
        # Fall back to the notebook-level setting of the current run.
        write_concern_value = write_concern
    dia = datetime.now().strftime("%d%m%Y_%H_%M_%S")
    # Always go through a DataFrame with the expected columns, so both raw
    # tuples and ready-made DataFrames are written with the same layout.
    results_df = pd.DataFrame(results, columns=['Registros', 'Tiempo', 'CPU', 'Memoria'])
    results_df.to_csv(file.format(dia, write_concern_value))

In [53]:
# Path template for the CSV results file. It is passed to save_results_to_csv,
# which fills the first {} with the run timestamp and the second {} with the
# write concern in use.
resultados_etl = '../Results/MongoDB/MongoDB_Delete_MultipleCollection_{}_wc_{}.csv'

In [4]:
# Number of repetitions.
# NOTE(review): not referenced by any other code visible in this notebook.
repeats = 1000

# Borrado con Write Concern a 3 (réplica inmediata en 3 nodos)

In [54]:
# Write concern for this run; it is also embedded in the CSV file name.
write_concern = 3

# Single client shared by all collections. The write concern is taken from the
# variable above (instead of being repeated as a literal) so the client setting
# and the value recorded in the results file name cannot drift apart.
connection = MongoClient('localhost', 27017, w=write_concern)

# Handle to the database used for this work.
tfm_mongo_db = connection["tfm_mongo_database"]

## Creación de las colecciones por cada dominio

In [6]:
# One collection handle per business domain, all taken from the working database.
CustomerProfileCollection, CurrentAccountCollection, PositionKeepingCollection = [
    tfm_mongo_db[collection_name]
    for collection_name in (
        "CustomerProfileCollection",
        "CurrentAccountCollection",
        "PositionKeepingCollection",
    )
]
# Show the databases reachable through the current connection.
print(connection.list_database_names())

['admin', 'config', 'local', 'tfm_mongo_database']


## Borrado de los documentos y simulación en cascada

In [27]:
def removePartyIdInDifferentsDomains(partyId):
    """Remove a customer and cascade the deletion to the related domains.

    Deletes every document related to ``partyId``: the PositionKeeping
    documents of each account owned by the party, the party's CurrentAccount
    documents, and the party's CustomerProfile documents.

    Args:
        partyId: Value of the ``PartyId`` field identifying the customer.
    """
    # Collect the distinct AccountIds owned by this party (first-seen order).
    # Only AccountId is needed, so the projection avoids fetching full documents.
    account_docs = CurrentAccountCollection.find({"PartyId": partyId}, {"AccountId": 1})
    account_ids = list(dict.fromkeys(
        doc.get("AccountId")
        for doc in account_docs
        # A filter of {"AccountId": None} would also match documents that lack
        # the field, so accounts without an AccountId are skipped.
        if doc.get("AccountId") is not None
    ))

    # Cascade: delete the balance records attached to each of those accounts.
    for account_id in account_ids:
        PositionKeepingCollection.delete_many({"AccountId": account_id})

    # Delete the accounts themselves, once the account ids have been read.
    CurrentAccountCollection.delete_many({"PartyId": partyId})

    # Delete the profile exactly once, outside the account loop, so that a
    # party without any account is still removed.
    CustomerProfileCollection.delete_many({"PartyId": partyId})

In [50]:
# Iterative deletion of parties, with one measurement per deleted party
def deletePartyIdInDatabase(partyIdList):
    """Delete each party in ``partyIdList`` and record one measurement per party.

    Args:
        partyIdList: Iterable of PartyId values to delete, in the order in
            which they should be processed.

    Returns:
        A list with one tuple per party:
        ``(documents left in CustomerProfileCollection, elapsed seconds
        rounded to 3 decimals, CPU percent, memory percent)``.
    """
    registers = []
    # Iterate over the actual PartyId values, not over their list positions.
    for partyId in partyIdList:
        time_inicial = time.time()
        removePartyIdInDifferentsDomains(partyId)
        time_final = time.time()
        data_time_collection = round(time_final - time_inicial, 3)
        # Sampled after the timed section so monitoring does not inflate it.
        used_cpu = psutil.cpu_percent()
        mem_used = psutil.virtual_memory().percent
        registers.append((
            CustomerProfileCollection.count_documents({}),
            data_time_collection,
            used_cpu,
            mem_used,
        ))
    return registers

In [46]:
# Gather every PartyId present in the CustomerProfile collection so they can be
# deleted one by one afterwards.
resultado_query = CustomerProfileCollection.find({}, {'PartyId': 1})
# dict.fromkeys drops duplicates while keeping the first-seen order.
partyIdList = list(dict.fromkeys(customer.get('PartyId') for customer in resultado_query))
len(partyIdList)

944

In [47]:
# Current document count of each collection, before the deletion run.
for etiqueta, coleccion in (
    ("CustomerProfileCollection", CustomerProfileCollection),
    ("CurrentAccountCollection", CurrentAccountCollection),
    ("PositionKeepingCollection", PositionKeepingCollection),
):
    print(etiqueta, coleccion.count_documents({}), "documentos antes de borrado")

CustomerProfileCollection 999998 documentos antes de borrado
CurrentAccountCollection 999998 documentos antes de borrado
PositionKeepingCollection 1000000 documentos antes de borrado


In [48]:
# Visit the partyIds in random order (to avoid index-order bias) and delete
# everything related to each one in the other collections.
# random.sample returns a shuffled copy, so partyIdList itself is left untouched.
registers = deletePartyIdInDatabase(random.sample(partyIdList, len(partyIdList)))

KeyboardInterrupt: 

In [16]:
# Turn the collected measurements into a table, persist it, and preview 5 random rows.
column_names = ['Registros', 'Tiempo', 'CPU', 'Memoria']
data = registers
results_df = pd.DataFrame(data=data, columns=column_names)
save_results_to_csv(results_df, resultados_etl)
results_df.sample(n=5)

Unnamed: 0,Registros,Tiempo,CPU,Memoria
252,252000,0.07,34.9,38.7
939,939000,0.014,66.7,45.3
955,955000,0.014,45.5,45.5
725,725000,0.02,35.7,42.2
633,633000,0.034,35.0,41.6


In [None]:
# Current document count of each collection, after the deletion run.
# Each label is paired with its own collection; the CustomerProfile line
# previously reported the CurrentAccount count.
print("CustomerProfileCollection",CustomerProfileCollection.count_documents({}),"documentos post borrado")
print("CurrentAccountCollection",CurrentAccountCollection.count_documents({}),"documentos post borrado")
print("PositionKeepingCollection",PositionKeepingCollection.count_documents({}),"documentos post borrado")

# Borrado con Write Concern a 2 (réplica inmediata en 2 nodos)

In [None]:
# Write concern for this run; save_results_to_csv also embeds it in the CSV file name.
write_concern = 2

# Single client shared by all collections
# NOTE(review): `connection` is rebound here without closing the previous
# client first, and the collection handles created earlier were taken from the
# previous tfm_mongo_db. They presumably need to be re-created from the new one
# for this write concern to take effect; confirm before running the deletion.
connection = MongoClient('localhost', 27017, w=write_concern) # Client with write concern 2 (taken from write_concern above)

# Handle to the database used for this work
tfm_mongo_db = connection["tfm_mongo_database"]

In [49]:
# Close the MongoDB client connection once the measurements are finished.
connection.close()