In [1]:
import sys
import numpy as np
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

from timeit import default_timer as timer

MONGODB_HOST = "192.168.0.15"
MONGODB_PORT = "27017"
MAX_WORKERS = 4
ITERATIONS = 2

In [2]:
arr_collections_debug = ["100","500","1000","5000","10000","50000","100000","500000","1000000"]
arr_collections = ["100","500","1000","5000","10000","50000","100000","500000","1000000","5000000","8000000"]

df = None
#.config("spark.shuffle.service.enabled", True) \
#.config("spark.dynamicAllocation.enabled", True) \
#.config("spark.dynamicAllocation.minExecutors", MAX_WORKERS) \
spark = SparkSession.builder \
    .appName("io_tests_mongodb") \
    .master("spark://spark:7077") \
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0") \
    .getOrCreate()
sqlContext = SQLContext(spark)
    
def getCollectionReadURL(collection):
    return "mongodb://" + MONGODB_HOST + ":" + MONGODB_PORT + "/yelp_filtered_read." + collection + "?ssl=false"

def getCollectionWriteURL(collection):
    return "mongodb://" + MONGODB_HOST + ":" + MONGODB_PORT + "/yelp_filtered_write." + collection + "?ssl=false"

def readFromCollection(url):
    return spark.read.format("com.mongodb.spark.sql.DefaultSource").option("uri", url).load()
    
def writeToCollection(df, url):
    df.write.format("mongo").mode("append").option("uri", url).save()
    

## `TESTE DE LEITURA DOS ARQUIVOS`

In [3]:
firstRun = True
arr_collection_sizes = {}
arr_collection_read_timings = {}
for collection in arr_collections_debug:
    arr_sizes = []
    arr_timings = []    
    for i in range(30):
        
        url = getCollectionReadURL(collection)
        
        starttime = timer()
        df = readFromCollection(url)
        count = df.count()
        endtime = timer()
        
        if(firstRun):
            print("Ignored: " + str(endtime-starttime))
            firstRun = False
            starttime = timer()
            df = readFromCollection(url)
            count = df.count()
            endtime = timer()
        
        size = sys.getsizeof(df)
        
        arr_sizes.append(size)
        arr_timings.append(endtime-starttime)
        print("["+"{:02d}".format(i+1)+"] "+ str(count) + ": " + str(round(endtime-starttime, 3)) + " segundos")
        
    arr_collection_sizes[collection] = str(np.mean(arr_sizes)) + " kb"
    arr_collection_read_timings[collection] = str(round(np.mean(arr_timings), 3)) + " segundos"

Ignored: 11.437923399999988
[01] 100000: 5.394 segundos
[02] 100000: 4.17 segundos
[03] 100000: 4.281 segundos
[04] 100000: 3.835 segundos
[05] 100000: 3.842 segundos
[06] 100000: 3.979 segundos
[07] 100000: 4.238 segundos
[08] 100000: 4.218 segundos
[09] 100000: 3.815 segundos
[10] 100000: 3.849 segundos
[11] 100000: 3.784 segundos
[12] 100000: 3.725 segundos
[13] 100000: 3.928 segundos
[14] 100000: 3.807 segundos
[15] 100000: 3.731 segundos
[16] 100000: 3.909 segundos
[17] 100000: 3.892 segundos
[18] 100000: 3.796 segundos
[19] 100000: 3.915 segundos
[20] 100000: 3.702 segundos
[21] 100000: 3.845 segundos
[22] 100000: 3.878 segundos
[23] 100000: 3.988 segundos
[24] 100000: 3.879 segundos
[25] 100000: 3.789 segundos
[26] 100000: 3.739 segundos
[27] 100000: 3.8 segundos
[28] 100000: 3.905 segundos
[29] 100000: 4.016 segundos
[30] 100000: 3.827 segundos


## `TESTE DE ESCRITA DOS ARQUIVOS`

In [4]:
firstRun = True
arr_collection_write_timings = {}
for collection in arr_collections_debug:
    urlRead = getCollectionReadURL(collection)
    df = readFromCollection(urlRead)
    count = df.count()
    arr_timings = []    
    for i in range(30):
        url = getCollectionWriteURL(collection)
        
        starttime = timer()
        writeToCollection(df, url)
        endtime = timer()
        
        if(firstRun):
            print("Ignored: " + str(endtime-starttime))
            firstRun = False
            starttime = timer()
            writeToCollection(df, url)
            endtime = timer()
        
        arr_timings.append(endtime-starttime)
        print("["+"{:02d}".format(i+1)+"] "+ str(count) + ": " + str(round(endtime-starttime, 3)) + " segundos")
        
    arr_collection_write_timings[collection] = str(round(np.mean(arr_timings), 3)) + " segundos"

Ignored: 12.791313099999911
[01] 100000: 10.992 segundos
[02] 100000: 10.816 segundos
[03] 100000: 10.351 segundos
[04] 100000: 10.513 segundos
[05] 100000: 10.287 segundos
[06] 100000: 9.778 segundos
[07] 100000: 9.427 segundos
[08] 100000: 9.873 segundos
[09] 100000: 9.115 segundos
[10] 100000: 9.889 segundos
[11] 100000: 8.759 segundos
[12] 100000: 9.894 segundos
[13] 100000: 9.841 segundos
[14] 100000: 9.938 segundos
[15] 100000: 8.861 segundos
[16] 100000: 8.15 segundos
[17] 100000: 9.808 segundos
[18] 100000: 10.002 segundos
[19] 100000: 9.112 segundos
[20] 100000: 10.257 segundos
[21] 100000: 10.279 segundos
[22] 100000: 10.603 segundos
[23] 100000: 10.19 segundos
[24] 100000: 10.111 segundos
[25] 100000: 10.212 segundos
[26] 100000: 9.895 segundos
[27] 100000: 9.136 segundos
[28] 100000: 9.328 segundos
[29] 100000: 9.723 segundos
[30] 100000: 9.907 segundos


## `MÉDIA DO TEMPO DE LEITURA DOS ARQUIVOS`

In [5]:
arr_collection_read_timings

{'100000': '3.949 segundos'}

## `MÉDIA DO TEMPO DE ESCRITA DOS ARQUIVOS`

In [6]:
arr_collection_write_timings

{'100000': '9.835 segundos'}

## `MÉDIA DO TAMANHO DOS ARQUIVOS`

In [7]:
arr_collection_sizes

{'100000': '56.0 kb'}