# Synthetic data generation example

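We create synthetic files with the following AWS Lambda function. Each invocation writes one 1 GiB object to S3 as a multipart upload of 64 parts of 16 MiB, where every part contains the same repeating 0–255 byte pattern: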

```Python
import json
import urllib.parse
import boto3

# S3 client created once at module import and shared by every call to
# lambda_handler below.
s3 = boto3.client('s3')

def _build_pattern(size):
    """Return `size` bytes that cycle through the values 0, 1, ..., 255."""
    cycle = bytes(range(256))
    repeats, remainder = divmod(size, len(cycle))
    # Repeating a 256-byte block yields the same bytes as computing
    # `ch % 256` for every offset, without millions of tiny allocations.
    return cycle * repeats + cycle[:remainder]


def lambda_handler(event, context):
    """Write one synthetic 1 GiB object to S3 using a multipart upload.

    The object is made of 64 parts of 16 MiB each; every part holds the same
    repeating 0..255 byte pattern.

    Args:
        event: Mapping with an 'index' entry (a number) that is zero-padded
            to three digits to build the object key.
        context: Lambda context object; unused.

    Returns:
        None.

    Raises:
        Exception: Any error raised by the S3 client is re-raised after the
            multipart upload has been aborted.
    """
    size = 1024 * 1024 * 16
    index = event['index']
    nb_parts = 64
    key = f'synthetic/pattern-1gb/file-{index:03}'
    bucket = 'cloudfuse-taxi-data'

    pattern = _build_pattern(size)

    create_up_resp = s3.create_multipart_upload(
        Bucket=bucket,
        Key=key,
    )
    upload_id = create_up_resp['UploadId']

    try:
        parts = []
        # S3 part numbers start at 1.
        for part_number in range(1, nb_parts + 1):
            up_res = s3.upload_part(
                Body=pattern,
                Bucket=bucket,
                ContentLength=size,
                Key=key,
                PartNumber=part_number,
                UploadId=upload_id,
            )
            parts.append({'ETag': up_res['ETag'], 'PartNumber': part_number})

        s3.complete_multipart_upload(
            Bucket=bucket,
            Key=key,
            UploadId=upload_id,
            MultipartUpload={
                'Parts': parts,
            },
        )
    except Exception:
        # An upload that is neither completed nor aborted keeps its already
        # uploaded parts stored in the bucket, so clean up before re-raising.
        s3.abort_multipart_upload(
            Bucket=bucket,
            Key=key,
            UploadId=upload_id,
        )
        raise

```

We then invoke this function once per file index, running the invocations in parallel, to create the complete dataset of 100 test files:

In [1]:
import boto3
import json
import base64
from joblib import Parallel, delayed
import os

# AWS region used when creating the Lambda client in invoke_function.
region_name="us-east-2"
# NOTE(review): not referenced anywhere in the visible code - confirm whether
# it is still needed.
binary_name="lambda"
# Name of the profile to use from your .aws/credentials file. It is read from
# the AWS_PROFILE environment variable; a KeyError is raised if it is not set.
aws_profile=os.environ["AWS_PROFILE"]

def invoke_function(index, show_logs = False):
    """Synchronously invoke the "synth-file" Lambda function for one index.

    Args:
        index: Value sent to the function as the 'index' field of its JSON
            payload.
        show_logs: When true, request the tail of the execution log and print
            it after the invocation returns.

    Returns:
        The function's response payload, decoded from JSON.

    Raises:
        RuntimeError: If the response carries a 'FunctionError' field, i.e.
            the function itself failed even though the invoke call succeeded.
    """
    # A session and client are created on every call; presumably so that each
    # joblib worker gets its own client - verify before hoisting.
    session = boto3.Session(profile_name=aws_profile)
    client = session.client('lambda', region_name=region_name)
    response = client.invoke(
        FunctionName="synth-file",
        InvocationType='RequestResponse',
        Payload=json.dumps({'index': index}),
        LogType='Tail' if show_logs else 'None',
    )
    if show_logs:
        # The log tail is returned base64-encoded.
        print(base64.b64decode(response['LogResult']).decode("utf-8"))
    result = json.load(response['Payload'])
    # Errors raised inside the function do not fail the invoke call; they are
    # flagged with 'FunctionError' and described in the payload. Surface them
    # instead of silently returning the error description.
    if 'FunctionError' in response:
        raise RuntimeError(
            f"synth-file invocation for index {index} failed: {result}"
        )
    return result


# Number of synthetic files to generate; indices 0 .. nb_file - 1 are used.
nb_file = 100
# Invoke the function once per index through joblib with 50 parallel jobs.
# `res` holds the value returned by invoke_function for each index.
res = Parallel(n_jobs=50)(delayed(invoke_function)(i) for i in range(nb_file))