Script de medición y carga de las colecciones en cluster MongoDB
TFM Daniel Herranz Segundo

In [1]:
import pandas as pd
import numpy as np
import os
import json
import random
import pymongo
from pymongo import MongoClient
import time
from pprint import pprint
from pymongo import InsertOne, DeleteMany, ReplaceOne, UpdateOne
import matplotlib.pyplot as plt
import psutil
import uuid

In [2]:
#Variables de los ficheros de datos salida
CurrentAccountCollection_file_out = '../MockData/MongoDB/CurrentAccountCollection/CurrentAccountCollection.json'
PositionKeepingCollection_file_out = '../MockData/MongoDB/PositionKeepingCollection/PositionKeepingCollection.json'
CustomerProfileCollection_file_out = '../MockData/MongoDB/CustomerProfileCollection/CustomerProfileCollection.json'

CurrentAccountCollection_sample_out = '../MockData/MongoDB/CurrentAccountCollection/CurrentAccountCollection_sample.json'
PositionKeepingCollection_sample_out = '../MockData/MongoDB/PositionKeepingCollection/PositionKeepingCollection_sample.json'
CustomerProfileCollection_sample_out = '../MockData/MongoDB/CustomerProfileCollection/CustomerProfileCollection_sample.json'

In [3]:
def save_results_to_csv(results,file,write_concerns):
    #Guardamos los resultados en csv
    from datetime import datetime
    dia = datetime.now().strftime("%d%m%Y_%H_%M_%S") 
    data = results
    results_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
    results.to_csv(file.format(write_concerns, str(dia)))

In [4]:
resultados_etl_CutomerProfileCollection = '../Results/MongoDB/MongoDB_Bulk_Insert_{}_WC_CustomerProfileCollection_{}.csv'
resultados_etl_PositionKeepingCollection = '../Results/MongoDB/MongoDB_Bulk_Insert_{}_WC_PositionKeepingCollection_{}.csv'
resultados_etl_CurrentAccountCollection = '../Results/MongoDB/MongoDB_Bulk_Insert_{}_WC_CurrentAccountCollection_{}.csv'

In [5]:
#repeticiones
repeats = 1000
test_wc = False

In [6]:
#Conexion unica a colecciones
connection = MongoClient('localhost', 27017, w=3) #Conexion con WriteConcern a 3 (primario y dos nodos consolidados)

#Creamos una base de datos para el trabajo llamada TFM
tfm_mongo_db = connection["tfm_mongo_database"]

## Creación de las colecciones por cada dominio

In [7]:
CustomerProfileCollection = tfm_mongo_db["CustomerProfileCollection"]
CurrentAccountCollection = tfm_mongo_db["CurrentAccountCollection"]
PositionKeepingCollection = tfm_mongo_db["PositionKeepingCollection"]
print(connection.list_database_names())

['admin', 'config', 'local', 'tfm_mongo_database']


## Carga de los documentos por cada dominio

### Test de carga CustomerProfileCollection

In [8]:
#Carga de la información de dataframes por entidades
CustomerProfileCollection_df = pd.read_json(CustomerProfileCollection_sample_out) #Carga Sample
CurrentAccountCollection_df = pd.read_json(CurrentAccountCollection_sample_out) #Carga Sample
PositionKeepingCollection_df = pd.read_json(PositionKeepingCollection_sample_out) #Carga Sample

#CustomerProfileCollection_df = pd.read_json(CustomerProfileCollection_file_out) 
#CurrentAccountCollection_df = pd.read_json(CurrentAccountCollection_file_out) 
#PositionKeepingCollection_df = pd.read_json(PositionKeepingCollection_file_out) 

In [9]:
print("CustomerProfileCollection_df",len(CustomerProfileCollection_df))
print("CurrentAccountCollection_df",len(CurrentAccountCollection_df))
print("PositionKeepingCollection_df",len(PositionKeepingCollection_df))

CustomerProfileCollection_df 10000
CurrentAccountCollection_df 10000
PositionKeepingCollection_df 10000


In [10]:
for x in range(17): # Generamos un millon de registros aprox
    CustomerProfileCollection_df = CustomerProfileCollection_df.append(CustomerProfileCollection_df.sample(frac=0.40,replace = True))
    CurrentAccountCollection_df = CurrentAccountCollection_df.append(CurrentAccountCollection_df.sample(frac=0.40,replace = True))
    PositionKeepingCollection_df = PositionKeepingCollection_df.append(PositionKeepingCollection_df.sample(frac=0.40,replace = True))

In [11]:
num_documentos = 1000000

In [12]:
CustomerProfileCollection_df = CustomerProfileCollection_df.sample(num_documentos)
print(len(CustomerProfileCollection_df))
CustomerProfileCollection_df.sample(1)

1000000


Unnamed: 0,PartyId,PartyNumber,PartyType,Name,FullLegalName,LegalStructure,BeneficialOwnership,AccountRole,EmailAddress,Phone,Address
9131,458a2bc2-b7fb-11ec-a2b2-9fdc6c308163,397,sol,Jennifer,Elena Bosco,Miss,0,UK.OBIE.Principal,wolf.suzanne@example.com,1-137-594-03,"[{'AddressType': 'West', 'AddressLine': '7118 ..."


In [13]:
CurrentAccountCollection_df = CurrentAccountCollection_df.sample(num_documentos)
print(len(CurrentAccountCollection_df))
CurrentAccountCollection_df.sample(1)

1000000


Unnamed: 0,AccountId,PartyId,Status,StatusUpdateDateTime,AccountType,NickName,OpeningDate,AccountSubType,AccountInfo
450,7673ac54-b7fb-11ec-a2b2-9fdc6c308163,45853f5e-b7fb-11ec-a2b2-9fdc6c308163,Disabled,1973-09-30 20:22:10.00,Business,whiteboard bricks-and-clicks metrics,1985-01-07 23:28:19.00,SAI,"[{'SchemeName': 'UK.business', 'Identification..."


In [14]:
PositionKeepingCollection_df = PositionKeepingCollection_df.sample(num_documentos)
print(len(PositionKeepingCollection_df))
PositionKeepingCollection_df.sample(1)

1000000


Unnamed: 0,DateTime,CreditDebitIndicator,Type,Amount,CreditLine,AccountId
7025,2008-06-10 05:14:28,Credit,Business,"{'Currency': '[{""Code"":""EUR"",""Description"":""Do...","{'Included': '[0]', 'Type': '[""Business""]', 'A...",7678f858-b7fb-11ec-a2b2-9fdc6c308163


In [15]:
sample_mode = False #En cargas masivas reales, poner a false

if(sample_mode):
    list_partyId = CustomerProfileCollection_df['PartyId'].map(lambda x: str(uuid.uuid1()))
    list_accountid = CurrentAccountCollection_df['AccountId'].map(lambda x: str(uuid.uuid1()))
    CustomerProfileCollection_df['PartyId'] = list_partyId
    CurrentAccountCollection_df['AccountId'] = list_accountid
    CurrentAccountCollection_df['PartyId'] = list_partyId    
    PositionKeepingCollection_df['AccountId'] = list_accountid

print("CustomerProfileCollection_df",len(CustomerProfileCollection_df))
print("CurrentAccountCollection_df",len(CurrentAccountCollection_df))
print("PositionKeepingCollection_df",len(PositionKeepingCollection_df))

CustomerProfileCollection_df 1000000
CurrentAccountCollection_df 1000000
PositionKeepingCollection_df 1000000


In [16]:
#Cargas Masiva con Many
def loadCollectionDataFrame(df,collection):
    registers = []
    grps = df.groupby(df.index / 1000)    
    iter = 0;
    for i,group_df in grps: 
        #Insertamos en bloques de 1000 para igualar a otros motores    
        time_inicial = time.time()
        collection.insert_many(group_df.to_dict('records'))
        time_final = time.time()
        data_time_collection = round(time_final - time_inicial,3)
        used_cpu = psutil.cpu_percent()
        mem_used = psutil.virtual_memory().percent
        registers.append((iter,data_time_collection,used_cpu,mem_used))
        iter += 1000;
    return registers

In [17]:
#Borramos los valores actuales de la colección
connection["tfm_mongo_database"].CustomerProfileCollection.drop()
print('Borrada colleción CustomerProfileCollection')

Borrada colleción CustomerProfileCollection


In [18]:
CustomerProfileCollection.count_documents({})

0

In [19]:
registers_customer_profile = loadCollectionDataFrame(CustomerProfileCollection_df,CustomerProfileCollection)

In [20]:
CustomerProfileCollection.count_documents({})

1000000

In [21]:
data = registers_customer_profile
results_CustomerProfile_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
write_concerns = 3
save_results_to_csv(results_CustomerProfile_df,resultados_etl_CutomerProfileCollection, write_concerns)
results_CustomerProfile_df.sample(5)

Unnamed: 0,Registros,Tiempo,CPU,Memoria
6238,6238000,0.002,0.0,49.8
1965,1965000,0.004,0.0,41.7
8461,8461000,0.003,0.0,53.7
3195,3195000,0.005,50.0,44.2
380,380000,0.004,33.3,40.2


# CurrentAccountCollection ETL Load

In [22]:
#Borramos los valores actuales de la colección
connection["tfm_mongo_database"].CurrentAccountCollection.drop()
print('Borrada colleción CurrentAccountCollection')

Borrada colleción CurrentAccountCollection


In [23]:
CurrentAccountCollection.count_documents({})

0

In [24]:
registers_CurrentAccount = loadCollectionDataFrame(CurrentAccountCollection_df,CurrentAccountCollection)

In [25]:
CurrentAccountCollection.count_documents({})

1000000

In [26]:
data = registers_CurrentAccount
results_CurrentAccount_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
write_concerns = 3
save_results_to_csv(results_CurrentAccount_df,resultados_etl_CurrentAccountCollection, write_concerns)
results_CurrentAccount_df.sample(5)

Unnamed: 0,Registros,Tiempo,CPU,Memoria
5712,5712000,0.004,75.0,63.5
5693,5693000,0.007,50.0,63.5
6665,6665000,0.002,50.0,63.4
7959,7959000,0.006,50.0,64.1
2768,2768000,0.004,33.3,60.6


## PositionKeepingCollection ETL Load

In [27]:
#Borramos los valores actuales de la colección
connection["tfm_mongo_database"].PositionKeepingCollection.drop()
print('Borrada colleción PositionKeepingCollection')

Borrada colleción PositionKeepingCollection


In [28]:
PositionKeepingCollection.count_documents({})

0

In [29]:
registers_PositionKeeping = loadCollectionDataFrame(PositionKeepingCollection_df,PositionKeepingCollection)

In [30]:
PositionKeepingCollection.count_documents({})

1000000

In [31]:
data = registers_PositionKeeping
results_pk_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
write_concerns = 3
save_results_to_csv(results_pk_df,resultados_etl_PositionKeepingCollection,write_concerns)
results_pk_df.sample(5)

Unnamed: 0,Registros,Tiempo,CPU,Memoria
2390,2390000,0.005,0.0,67.1
2608,2608000,0.004,25.0,67.1
6421,6421000,0.003,33.3,69.8
984,984000,0.005,66.7,65.9
5843,5843000,0.003,0.0,69.8


## Metricas con Write Concern = 2 (2 nodos de replica inmediata)

In [32]:
if(test_wc):
    write_concerns = 2

    #Conexion unica a colecciones
    connection = MongoClient('localhost', 27017, w=write_concerns) #Conexion con WriteConcern a 3 (primario y dos nodos consolidados)

    #Creamos una base de datos para el trabajo llamada TFM
    tfm_mongo_db = connection["tfm_mongo_database"]

In [33]:
if(test_wc):
    #Borramos los valores actuales de la colección
    connection["tfm_mongo_database"].CustomerProfileCollection.drop()
    print('Borrada colleción CustomerProfileCollection')
    connection["tfm_mongo_database"].CurrentAccountCollection.drop()
    print('Borrada colleción CurrentAccountCollection')
    connection["tfm_mongo_database"].PositionKeepingCollection.drop()
    print('Borrada colleción PositionKeepingCollection')

Borrada colleción CustomerProfileCollection
Borrada colleción CurrentAccountCollection
Borrada colleción PositionKeepingCollection


In [34]:
if(test_wc):
    registers = loadCollectionDataFrame(CustomerProfileCollection_df,CustomerProfileCollection)
    data = registers
    results_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
    save_results_to_csv(results_df,resultados_etl_CutomerProfileCollection, write_concerns)

In [35]:
if(test_wc):
    registers = loadCollectionDataFrame(CurrentAccountCollection_df,CurrentAccountCollection)
    data = registers
    results_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
    save_results_to_csv(results_df,resultados_etl_CurrentAccountCollection, write_concerns)

In [36]:
if(test_wc):
    registers = loadCollectionDataFrame(PositionKeepingCollection_df,PositionKeepingCollection)
    data = registers
    results_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
    save_results_to_csv(results_df,resultados_etl_PositionKeepingCollection, write_concerns)

##  Metricas con Write Concern = 1 (sin replica inmediata)


In [37]:
if(test_wc):
    write_concerns = 1

    #Conexion unica a colecciones
    connection = MongoClient('localhost', 27017, w=write_concerns) #Conexion con WriteConcern a 3 (primario y dos nodos consolidados)

    #Creamos una base de datos para el trabajo llamada TFM
    tfm_mongo_db = connection["tfm_mongo_database"]

In [38]:
if(test_wc):
    #Borramos los valores actuales de la colección
    connection["tfm_mongo_database"].CustomerProfileCollection.drop()
    print('Borrada colleción CustomerProfileCollection')
    connection["tfm_mongo_database"].CurrentAccountCollection.drop()
    print('Borrada colleción CurrentAccountCollection')
    connection["tfm_mongo_database"].PositionKeepingCollection.drop()
    print('Borrada colleción PositionKeepingCollection')

Borrada colleción CustomerProfileCollection
Borrada colleción CurrentAccountCollection
Borrada colleción PositionKeepingCollection


In [39]:
if(test_wc):
    registers = loadCollectionDataFrame(CustomerProfileCollection_df,CustomerProfileCollection)
    data = registers
    results_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
    save_results_to_csv(results_df,resultados_etl_CutomerProfileCollection, write_concerns)

In [40]:
if(test_wc):
    registers = loadCollectionDataFrame(CurrentAccountCollection_df,CurrentAccountCollection)
    data = registers
    results_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
    save_results_to_csv(results_df,resultados_etl_CurrentAccountCollection, write_concerns)

In [41]:
if(test_wc):
    registers = loadCollectionDataFrame(PositionKeepingCollection_df,PositionKeepingCollection)
    data = registers
    results_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
    save_results_to_csv(results_df,resultados_etl_PositionKeepingCollection, write_concerns)

In [42]:
#Cierre de la conexion
connection.close()