Script de medición y carga de las colecciones en cluster MongoDB
TFM Daniel Herranz Segundo

In [38]:
import pandas as pd
import numpy as np
import os
import json
import random
import pymongo
from pymongo import MongoClient
import time
from pprint import pprint
from pymongo import InsertOne, DeleteMany, ReplaceOne, UpdateOne
import matplotlib.pyplot as plt
import psutil

In [39]:
#Variables de los ficheros de datos salida
CurrentAccountCollection_file_out = '../MockData/MongoDB/CurrentAccountCollection/CurrentAccountCollection.json'
PositionKeepingCollection_file_out = '../MockData/MongoDB/PositionKeepingCollection/PositionKeepingCollection.json'
CustomerProfileCollection_file_out = '../MockData/MongoDB/CustomerProfileCollection/CustomerProfileCollection.json'

CurrentAccountCollection_sample_out = '../MockData/MongoDB/CurrentAccountCollection/CurrentAccountCollection_sample.json'
PositionKeepingCollection_sample_out = '../MockData/MongoDB/PositionKeepingCollection/PositionKeepingCollection_sample.json'
CustomerProfileCollection_sample_out = '../MockData/MongoDB/CustomerProfileCollection/CustomerProfileCollection_sample.json'

In [54]:
def save_results_to_csv(results,file,write_concerns):
    #Guardamos los resultados en csv
    from datetime import datetime
    dia = datetime.now().strftime("%d%m%Y_%H_%M_%S") 
    data = results
    results_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
    results.to_csv(file.format(write_concerns, str(dia)))

In [41]:
resultados_etl_CutomerProfileCollection = '../Results/MongoDB/MongoDB_Bulk_Insert_{}_WC_CustomerProfileCollection_{}.csv'
resultados_etl_PositionKeepingCollection = '../Results/MongoDB/MongoDB_Bulk_Insert_{}_WC_PositionKeepingCollection_{}.csv'
resultados_etl_CurrentAccountCollection = '../Results/MongoDB/MongoDB_Bulk_Insert_{}_WC_CurrentAccountCollection_{}.csv'

In [42]:
#repeticiones
repeats = 1000


In [43]:
#Conexion unica a colecciones
connection = MongoClient('localhost', 27017, w=3) #Conexion con WriteConcern a 3 (primario y dos nodos consolidados)

#Creamos una base de datos para el trabajo llamada TFM
tfm_mongo_db = connection["tfm_mongo_database"]

## Creación de las colecciones por cada dominio

In [44]:
CustomerProfileCollection = tfm_mongo_db["CustomerProfileCollection"]
CurrentAccountCollection = tfm_mongo_db["CurrentAccountCollection"]
PositionKeepingCollection = tfm_mongo_db["PositionKeepingCollection"]
print(connection.list_database_names())

['admin', 'config', 'local', 'tfm_mongo_database']


## Carga de los documentos por cada dominio

### Test de carga CustomerProfileCollection

In [45]:
#Carga de la información de dataframes por entidades
CustomerProfileCollection_df = pd.read_json(CustomerProfileCollection_sample_out) #Carga Sample
CurrentAccountCollection_df = pd.read_json(CurrentAccountCollection_sample_out) #Carga Sample
PositionKeepingCollection_df = pd.read_json(PositionKeepingCollection_sample_out) #Carga Sample

In [46]:
print("CustomerProfileCollection_df",len(CustomerProfileCollection_df))
print("CurrentAccountCollection_df",len(CurrentAccountCollection_df))
print("PositionKeepingCollection_df",len(PositionKeepingCollection_df))

CustomerProfileCollection_df 1000
CurrentAccountCollection_df 10000
PositionKeepingCollection_df 10000


In [47]:
for x in range(17): # Generamos un millon de registros aprox
    CustomerProfileCollection_df = CustomerProfileCollection_df.append(CustomerProfileCollection_df.sample(frac=0.55,replace = True))
    CurrentAccountCollection_df = CurrentAccountCollection_df.append(CurrentAccountCollection_df.sample(frac=0.32,replace = True))
    PositionKeepingCollection_df = PositionKeepingCollection_df.append(PositionKeepingCollection_df.sample(frac=0.32,replace = True))

CustomerProfileCollection_df = CustomerProfileCollection_df.iloc[:1000000,]
CurrentAccountCollection_df = CurrentAccountCollection_df.iloc[:1000000,]
PositionKeepingCollection_df = PositionKeepingCollection_df.iloc[:1000000,]

print("CustomerProfileCollection_df",len(CustomerProfileCollection_df))
print("CurrentAccountCollection_df",len(CurrentAccountCollection_df))
print("PositionKeepingCollection_df",len(PositionKeepingCollection_df))

CustomerProfileCollection_df 1000000
CurrentAccountCollection_df 1000000
PositionKeepingCollection_df 1000000


In [48]:
#Cargas Masiva con Many
def loadCollectionDataFrame(df,collection):
    registers = []
    grps = df.groupby(df.index / 1000)    
    iter = 0;
    for i,group_df in grps: 
        #Insertamos en bloques de 1000 para igualar a otros motores    
        time_inicial = time.time()
        collection.insert_many(group_df.to_dict('records'))
        time_final = time.time()
        data_time_collection = round(time_final - time_inicial,3)
        used_cpu = psutil.cpu_percent()
        mem_used = psutil.virtual_memory().percent
        registers.append((iter,data_time_collection,used_cpu,mem_used))
        iter += 1000;
    return registers

In [49]:
#Borramos los valores actuales de la colección
connection["tfm_mongo_database"].CustomerProfileCollection.drop()
print('Borrada colleción CustomerProfileCollection')

Borrada colleción CustomerProfileCollection


In [50]:
CustomerProfileCollection.count_documents({})

0

In [51]:
registers_customer_profile = loadCollectionDataFrame(CustomerProfileCollection_df,CustomerProfileCollection)

In [52]:
CustomerProfileCollection.count_documents({})

1000000

In [55]:
data = registers_customer_profile
results_CustomerProfile_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
write_concerns = 3
save_results_to_csv(results_CustomerProfile_df,resultados_etl_CutomerProfileCollection, write_concerns)
results_CustomerProfile_df.sample(5)

Unnamed: 0,Registros,Tiempo,CPU,Memoria
15,15000,0.032,35.0,56.9
225,225000,0.039,37.5,56.7
67,67000,0.059,34.2,56.8
341,341000,0.068,32.5,56.6
826,826000,0.011,42.9,57.4


# CurrentAccountCollection ETL Load

In [56]:
#Borramos los valores actuales de la colección
connection["tfm_mongo_database"].CurrentAccountCollection.drop()
print('Borrada colleción CurrentAccountCollection')

Borrada colleción CurrentAccountCollection


In [57]:
CurrentAccountCollection.count_documents({})

0

In [58]:
registers_CurrentAccount = loadCollectionDataFrame(CurrentAccountCollection_df,CurrentAccountCollection)

In [59]:
CurrentAccountCollection.count_documents({})

1000000

In [60]:
data = registers_CurrentAccount
results_CurrentAccount_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
write_concerns = 3
save_results_to_csv(results_CurrentAccount_df,resultados_etl_CurrentAccountCollection, write_concerns)
results_CurrentAccount_df.sample(5)

Unnamed: 0,Registros,Tiempo,CPU,Memoria
9633,9633000,0.002,0.0,57.0
2970,2970000,0.003,0.0,57.2
3884,3884000,0.006,50.0,57.1
11,11000,0.004,0.0,57.5
2336,2336000,0.002,0.0,57.3


## PositionKeepingCollection ETL Load

In [62]:
#Borramos los valores actuales de la colección
connection["tfm_mongo_database"].PositionKeepingCollection.drop()
print('Borrada colleción PositionKeepingCollection')

Borrada colleción PositionKeepingCollection


In [63]:
PositionKeepingCollection.count_documents({})

0

In [64]:
registers_PositionKeeping = loadCollectionDataFrame(PositionKeepingCollection_df,PositionKeepingCollection)

In [65]:
PositionKeepingCollection.count_documents({})

1000000

In [66]:
data = registers_PositionKeeping
results_pk_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
write_concerns = 3
save_results_to_csv(results_pk_df,resultados_etl_PositionKeepingCollection,write_concerns)
results_pk_df.sample(5)

Unnamed: 0,Registros,Tiempo,CPU,Memoria
5323,5323000,0.006,25.0,57.6
2872,2872000,0.008,25.0,57.2
4468,4468000,0.003,66.7,57.3
4657,4657000,0.006,40.0,57.6
6620,6620000,0.005,0.0,57.8


## Metricas con Write Concern = 2 (2 nodos de replica inmediata)

In [67]:
write_concerns = 2

#Conexion unica a colecciones
connection = MongoClient('localhost', 27017, w=write_concerns) #Conexion con WriteConcern a 3 (primario y dos nodos consolidados)

#Creamos una base de datos para el trabajo llamada TFM
tfm_mongo_db = connection["tfm_mongo_database"]

In [69]:
#Borramos los valores actuales de la colección
connection["tfm_mongo_database"].CustomerProfileCollection.drop()
print('Borrada colleción CustomerProfileCollection')
connection["tfm_mongo_database"].CurrentAccountCollection.drop()
print('Borrada colleción CurrentAccountCollection')
connection["tfm_mongo_database"].PositionKeepingCollection.drop()
print('Borrada colleción PositionKeepingCollection')

Borrada colleción CustomerProfileCollection
Borrada colleción CurrentAccountCollection
Borrada colleción PositionKeepingCollection


In [70]:
registers = loadCollectionDataFrame(CustomerProfileCollection_df,CustomerProfileCollection)
data = registers
results_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
save_results_to_csv(results_df,resultados_etl_CutomerProfileCollection, write_concerns)

In [71]:
registers = loadCollectionDataFrame(CurrentAccountCollection_df,CurrentAccountCollection)
data = registers
results_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
save_results_to_csv(results_df,resultados_etl_CurrentAccountCollection, write_concerns)

In [72]:
registers = loadCollectionDataFrame(PositionKeepingCollection_df,PositionKeepingCollection)
data = registers
results_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
save_results_to_csv(results_df,resultados_etl_PositionKeepingCollection, write_concerns)

##  Metricas con Write Concern = 1 (sin replica inmediata)


In [73]:
write_concerns = 1

#Conexion unica a colecciones
connection = MongoClient('localhost', 27017, w=write_concerns) #Conexion con WriteConcern a 3 (primario y dos nodos consolidados)

#Creamos una base de datos para el trabajo llamada TFM
tfm_mongo_db = connection["tfm_mongo_database"]

In [74]:
#Borramos los valores actuales de la colección
connection["tfm_mongo_database"].CustomerProfileCollection.drop()
print('Borrada colleción CustomerProfileCollection')
connection["tfm_mongo_database"].CurrentAccountCollection.drop()
print('Borrada colleción CurrentAccountCollection')
connection["tfm_mongo_database"].PositionKeepingCollection.drop()
print('Borrada colleción PositionKeepingCollection')

Borrada colleción CustomerProfileCollection
Borrada colleción CurrentAccountCollection
Borrada colleción PositionKeepingCollection


In [75]:
registers = loadCollectionDataFrame(CustomerProfileCollection_df,CustomerProfileCollection)
data = registers
results_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
save_results_to_csv(results_df,resultados_etl_CutomerProfileCollection, write_concerns)

In [76]:
registers = loadCollectionDataFrame(CurrentAccountCollection_df,CurrentAccountCollection)
data = registers
results_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
save_results_to_csv(results_df,resultados_etl_CurrentAccountCollection, write_concerns)

In [None]:
registers = loadCollectionDataFrame(PositionKeepingCollection_df,PositionKeepingCollection)
data = registers
results_df = pd.DataFrame(data, columns =['Registros', 'Tiempo', 'CPU','Memoria'])
save_results_to_csv(results_df,resultados_etl_PositionKeepingCollection, write_concerns)

In [None]:
#Cierre de la conexion
connection.close()