In [1]:
import sys
import numpy as np
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

from timeit import default_timer as timer

MONGODB_HOST = "192.168.0.15"
MONGODB_PORT = "27017"
MAX_WORKERS = 4
ITERATIONS = 2

In [2]:
arr_collections_debug = ["5000000","8000000"]
arr_collections = ["100","500","1000","5000","10000","50000","100000","500000","1000000","5000000","8000000"]

df = None
#.config("spark.shuffle.service.enabled", True) \
#.config("spark.dynamicAllocation.enabled", True) \
#.config("spark.dynamicAllocation.minExecutors", MAX_WORKERS) \
spark = SparkSession.builder \
    .appName("io_tests_mongodb") \
    .master("spark://spark:7077") \
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0") \
    .getOrCreate()
sqlContext = SQLContext(spark)
    
def getCollectionReadURL(collection):
    return "mongodb://" + MONGODB_HOST + ":" + MONGODB_PORT + "/yelp_filtered_read." + collection + "?ssl=false"

def getCollectionWriteURL(collection):
    return "mongodb://" + MONGODB_HOST + ":" + MONGODB_PORT + "/yelp_filtered_write." + collection + "?ssl=false"

def readFromCollection(url):
    return spark.read.format("com.mongodb.spark.sql.DefaultSource").option("uri", url).load()
    
def writeToCollection(df, url):
    df.write.format("mongo").mode("append").option("uri", url).save()
    

## `TESTE DE LEITURA DOS ARQUIVOS`

In [3]:
firstRun = True
arr_collection_sizes = {}
arr_collection_read_timings = {}
for collection in arr_collections_debug:
    arr_sizes = []
    arr_timings = []    
    for i in range(1):
        
        url = getCollectionReadURL(collection)
        
        starttime = timer()
        df = readFromCollection(url)
        count = df.count()
        endtime = timer()
        
        if(firstRun):
            print("Ignored: " + str(endtime-starttime))
            firstRun = False
            starttime = timer()
            df = readFromCollection(url)
            count = df.count()
            endtime = timer()
        
        size = sys.getsizeof(df)
        
        arr_sizes.append(size)
        arr_timings.append(endtime-starttime)
        print("["+"{:02d}".format(i+1)+"] "+ str(count) + ": " + str(round(endtime-starttime, 3)) + " segundos")
        
    arr_collection_sizes[collection] = str(np.mean(arr_sizes)) + " kb"
    arr_collection_read_timings[collection] = str(round(np.mean(arr_timings), 3)) + " segundos"

Ignored: 220.50992449999995
[01] 5000000: 183.566 segundos
[01] 7991122: 320.446 segundos


## `TESTE DE ESCRITA DOS ARQUIVOS`

In [4]:
firstRun = True
arr_collection_write_timings = {}
for collection in arr_collections_debug:
    urlRead = getCollectionReadURL(collection)
    df = readFromCollection(urlRead)
    count = df.count()
    arr_timings = []    
    for i in range(1):
        url = getCollectionWriteURL(collection)
        
        starttime = timer()
        writeToCollection(df, url)
        endtime = timer()
        
        if(firstRun):
            print("Ignored: " + str(endtime-starttime))
            firstRun = False
            starttime = timer()
            writeToCollection(df, url)
            endtime = timer()
        
        arr_timings.append(endtime-starttime)
        print("["+"{:02d}".format(i+1)+"] "+ str(count) + ": " + str(round(endtime-starttime, 3)) + " segundos")
        
    arr_collection_write_timings[collection] = str(round(np.mean(arr_timings), 3)) + " segundos"

Ignored: 356.838491
[01] 5000000: 299.706 segundos
[01] 7991122: 603.113 segundos


## `MÉDIA DO TEMPO DE LEITURA DOS ARQUIVOS`

In [5]:
arr_collection_read_timings

{'5000000': '183.566 segundos', '8000000': '320.446 segundos'}

## `MÉDIA DO TEMPO DE ESCRITA DOS ARQUIVOS`

In [6]:
arr_collection_write_timings

{'5000000': '299.706 segundos', '8000000': '603.113 segundos'}

## `MÉDIA DO TAMANHO DOS ARQUIVOS`

In [7]:
arr_collection_sizes

{'5000000': '56.0 kb', '8000000': '56.0 kb'}