# Explore how S3 scales

The objective of this notebook is to check how AWS S3 behaves when the same object chunk is downloaded from many AWS Lambdas in parallel.

The data used here can be generated with the `data_generation` notebook.

In [4]:
import os

# Deployment settings.
region_name = "us-east-2"
binary_name = "lambda"
# Name of the profile to use from your .aws/credentials file; it must be
# provided through the AWS_PROFILE environment variable.
aws_profile = os.environ["AWS_PROFILE"]

# Number of bytes in one mebibyte.
MEGA = 1024 ** 2

In [7]:
lambda_name = !docker run \
    --rm \
    -v $HOME/.aws/credentials:/creds:ro \
    -v cloud-reader-tf:/mnt/state_vol \
    cloudfuse/cloud-reader-terraform output lambda_arn
lambda_name = lambda_name[0][1:len(lambda_name[0])-1]
print('lambda_name:', lambda_name)

lambda_name: arn:aws:lambda:us-east-2:615900053518:function:cloud-reader-benchmark


In [8]:
from joblib import Parallel, delayed
import boto3
import json
import base64
import pandas as pd

def q90(x):
    """Return the 90th percentile of ``x`` (anything exposing ``quantile``).

    The function name is what pandas shows as the row label when this is
    passed to ``agg``.
    """
    ninetieth = x.quantile(q=0.9)
    return ninetieth
def q99(x):
    """Return the 99th percentile of ``x`` (anything exposing ``quantile``).

    The function name is what pandas shows as the row label when this is
    passed to ``agg``.
    """
    ninety_ninth = x.quantile(q=0.99)
    return ninety_ninth

def invoke_lambda(index):
    """Invoke the benchmark Lambda synchronously and return its decoded result.

    The Lambda is asked to read the first 500 MiB of the synthetic 1 GiB
    object ``synthetic/pattern-1gb/file<index>`` from the
    ``cloudfuse-taxi-data`` bucket.

    Args:
        index: Number of the synthetic file to read; it is formatted as a
            zero-padded, 3-digit suffix of the object key.

    Returns:
        The JSON payload returned by the Lambda, parsed into Python objects.

    Raises:
        RuntimeError: If the invocation response carries a ``FunctionError``,
            i.e. the function itself failed. The error payload is included
            in the message.
    """
    # A session and client are created on every call; this function is
    # dispatched through joblib, so nothing is shared between invocations.
    session = boto3.Session(profile_name=aws_profile)
    client = session.client('lambda', region_name=region_name)
    input_params = {
        "region": region_name,
        "bucket": "cloudfuse-taxi-data",
        "key": f"synthetic/pattern-1gb/file{index:03}",
        "size": 1024 * MEGA,
        "ranges": [{"start": 0, "length": 500 * MEGA}],
        "max_parallel": 16,
        "initial_permits": 1,
        "release_rate": 2,
    }
    response = client.invoke(
        FunctionName=lambda_name,
        InvocationType='RequestResponse',
        Payload=json.dumps(input_params),
        LogType='None',
    )
    payload = json.load(response['Payload'])
    # With RequestResponse, a failing function still yields a normal response:
    # the failure is only flagged by FunctionError and the payload holds the
    # error details. Surface it here rather than returning it as a result.
    function_error = response.get('FunctionError')
    if function_error:
        raise RuntimeError(
            f"Lambda invocation for file index {index} failed ({function_error}): {payload}"
        )
    return payload

def download_multiple_files(parallelism):
    """Run ``parallelism`` Lambda invocations in parallel, one per file index.

    Invocation ``i`` (for ``i`` in ``range(parallelism)``) reads file ``i``.
    Returns the list of results produced by ``invoke_lambda``.
    """
    jobs = (delayed(invoke_lambda)(file_index) for file_index in range(parallelism))
    runner = Parallel(n_jobs=parallelism)
    return runner(jobs)

def download_single_files(parallelism):
    """Run ``parallelism`` Lambda invocations in parallel, all on file index 0.

    Returns the list of results produced by ``invoke_lambda``.
    """
    jobs = (delayed(invoke_lambda)(0) for _ in range(parallelism))
    runner = Parallel(n_jobs=parallelism)
    return runner(jobs)

def show_bench(results):
    """Aggregate Lambda benchmark results into a small summary table.

    One row is built per entry of each result's ``cache_stats``, holding that
    entry's ``dl_duration`` together with the first and last values of the
    enclosing result's ``range_durations``.

    Args:
        results: Iterable of decoded Lambda payloads, each providing the
            ``cache_stats`` and ``range_durations`` keys.

    Returns:
        A DataFrame with the mean of ``dl_duration`` and the mean, q90 and
        q99 of ``first_read`` and ``last_read``.
    """
    rows = [
        {
            "dl_duration": download['dl_duration'],
            "first_read": result['range_durations'][0],
            "last_read": result['range_durations'][-1],
        }
        for result in results
        for download in result['cache_stats']
    ]

    aggregations = {
        'dl_duration': 'mean',
        'first_read': ['mean', q90, q99],
        'last_read': ['mean', q90, q99],
    }
    frame = pd.DataFrame(rows)
    return frame.agg(aggregations)


In [9]:
# Print the time at which this benchmark run starts.
!date
# Fan out 100 parallel Lambda invocations, each one reading a different file.
res_multiple_files = download_multiple_files(100)
# Last expression of the cell: displays the aggregated timing table.
show_bench(res_multiple_files)

Mon Aug  9 12:09:13 CEST 2021


Unnamed: 0,dl_duration,first_read,last_read
mean,6095.63,6095.74,6095.74
q90,,6559.1,6559.1
q99,,6958.53,6958.53


In [10]:
# Print the time at which this benchmark run starts.
!date
# Fan out 100 parallel Lambda invocations, all of them reading the same file (index 0).
res_single_files = download_single_files(100)
# Last expression of the cell: displays the aggregated timing table.
show_bench(res_single_files)

Mon Aug  9 12:10:01 CEST 2021


Unnamed: 0,dl_duration,first_read,last_read
mean,5865.04,5865.12,5865.12
q90,,6602.3,6602.3
q99,,7034.49,7034.49


Performance is very similar whether we download the same file or different files from our ~100 Lambda containers.