
# Serverless ETL Pipeline using AWS Lambda, Glue Jobs, Athena

@author: Glad Nayak <br>
@email: gladn94@gmail.com

## 0: Prerequisites

In [None]:
# install required packages in custom environment
!pip install -q boto3 awscli aiohttp yarl log4p pyspark findspark

# initialize spark
from IPython.display import clear_output 
import os
import findspark
import sys

# Point PYSPARK_PYTHON at the interpreter that is running this kernel.
os.environ["PYSPARK_PYTHON"] = sys.executable
# NOTE(review): findspark.init() presumably locates the local Spark install and
# makes pyspark importable — confirm for the target environment.
findspark.init()

In [None]:
# setting environment variables
# NOTE(review): the bucket is spelled "activitiy" here, while the IAM policy,
# the Glue job and the `aws s3` commands further down use "github-activity-gb".
# Confirm which spelling is the real bucket name.
%env GH_BUCKET_NAME=github-activitiy-gb
%env AWS_PROFILE=glad
%env AWS_DEFAULT_REGION=ap-south-1

env: GH_BUCKET_NAME=github-activitiy-gb
env: AWS_PROFILE=glad
env: AWS_DEFAULT_REGION=ap-south-1


In [None]:
# configure cli access to AWS account (interactive prompt for the "glad" profile)
!aws configure --profile glad
# wipe this cell's output — presumably so the values typed at the prompt
# are not saved with the notebook
clear_output()

In [None]:
!aws s3 ls ${GH_BUCKET_NAME} --profile glad

                           PRE landing/


## 1: Extract Data

In [1]:
%%writefile extract_data.py
from aiohttp import ClientSession
import asyncio
import os
import yarl
import boto3

import pandas as pd
from datetime import datetime as dt
from datetime import timedelta as td

async def upload_file(session: ClientSession, s3_client: boto3.client, key: str) -> bool:
    """
    Downloads one hourly file from GH Archive and uploads it to the AWS S3 bucket.
    S3 bucket name is read from the GH_BUCKET_NAME environment variable.
    @session aiohttp.ClientSession object
    @s3_client client to connect with AWS S3
    @key filename (without the .json.gz extension) to download from GH Archive
    @returns True when the file was stored in S3, False on any failure
    """
    BUCKET_NAME = os.environ.get('GH_BUCKET_NAME')

    url = yarl.URL(f'https://data.gharchive.org/{key}.json.gz', encoded=True)

    # The try wraps the request itself so that a connection error on one file
    # is reported as a failed upload instead of aborting every other task
    # awaited by the caller.
    try:
        async with session.get(url, allow_redirects=False) as response:
            print('[extract] downloading data from', response.url)

            # A 404/5xx body is an error page, not an archive: never store it.
            if response.status != 200:
                print(f'[extract] skipping {key}: HTTP status {response.status}')
                return False

            stream_bytes = await response.read()

        print('[extract] loading data to bucket', BUCKET_NAME)

        s3_client.put_object(
            Bucket=BUCKET_NAME,
            Key=f'landing/{key}.json.gz',
            Body=stream_bytes,
        )
    except Exception as e:
        # best effort: report the failure and let the remaining files continue
        print(e)
        return False

    return True

async def main():
    """
    Uploads every hourly GH Archive file between START_DATE_RANGE and
    END_DATE_RANGE (both inclusive, read from the environment) to S3.
    """
    START_DATE_RANGE = os.environ.setdefault('START_DATE_RANGE', '2021-01-01')
    END_DATE_RANGE = os.environ.setdefault('END_DATE_RANGE', '2021-01-03')
    # Honour AWS_PROFILE when it is set; fall back to the original profile name.
    boto3.setup_default_session(profile_name=os.environ.get('AWS_PROFILE', 'glad'))

    # GH Archive names hourly files <YYYY-MM-DD>-<H> with H running 0..23 and
    # no zero padding. The hour is formatted by hand because the '%-H'
    # strftime directive is a glibc extension and fails on other platforms.
    filenames = [
        f"{day.strftime('%Y-%m-%d')}-{hour}"
        for day in pd.date_range(START_DATE_RANGE, END_DATE_RANGE)
        for hour in range(24)
    ]

    s3_client = boto3.client('s3')
    async with ClientSession() as session:
        tasks = [upload_file(session, s3_client, filename) for filename in filenames]
        results = await asyncio.gather(*tasks)
        uploaded = sum(1 for result in results if result is True)
        print(f'[extract] Uploaded {uploaded} files successfully')


# Only start the download when executed as a script, not when imported.
if __name__ == '__main__':
    asyncio.run(main())

Writing extract_data.py


In [None]:
!python extract_data.py

[extract] downloading data from https://data.gharchive.org/2021-01-02-3.json.gz
[extract] downloading data from https://data.gharchive.org/2021-01-02-4.json.gz
[extract] downloading data from https://data.gharchive.org/2021-01-03-4.json.gz
[extract] downloading data from https://data.gharchive.org/2021-01-01-1.json.gz
[extract] downloading data from https://data.gharchive.org/2021-01-03-2.json.gz
[extract] downloading data from https://data.gharchive.org/2021-01-01-12.json.gz
[extract] downloading data from https://data.gharchive.org/2021-01-03-8.json.gz
[extract] downloading data from https://data.gharchive.org/2021-01-02-2.json.gz
[extract] downloading data from https://data.gharchive.org/2021-01-01-14.json.gz
[extract] downloading data from https://data.gharchive.org/2021-01-01-7.json.gz
[extract] downloading data from https://data.gharchive.org/2021-01-01-8.json.gz
[extract] downloading data from https://data.gharchive.org/2021-01-03-9.json.gz
[extract] downloading data from https:

### Incremental Loading of Data into S3 using AWS Lambda

* You need to ensure that all the 3rd party libraries which are supposed to be deployed along with lambda functions are downloaded to a folder. In our case it is libs.

* We need to go to the folder to build the zip file. Make sure the zip file is created in the base directory of the project and update the zip file with source code.

* ```
rm ghactivity-downloader.zip # remove current zip
cd libs
zip -r ../ghactivity-downloader.zip .
cd ..
zip -g ghactivity-downloader.zip lambda_function.py download.py
```

* We can upload the zip file to AWS Lambda console and validate successfully. Make sure to increase memory size appropriately.

In [2]:
!mkdir -p aws_lambda

In [3]:
%%writefile aws_lambda/download.py
import requests

def download_file(file, timeout=60):
  """
  Downloads one file from GH Archive.
  @file file name including extension, e.g. 2021-01-01-0.json.gz
  @timeout seconds to wait for the server before giving up; without it
           requests waits indefinitely and the caller can hang
  @returns requests.Response (the caller inspects status_code / content)
  """
  return requests.get(f'https://data.gharchive.org/{file}', timeout=timeout)

Writing aws_lambda/download.py


In [4]:
%%writefile aws_lambda/upload.py

import boto3
 
def get_client():
  """Returns a boto3 client for the S3 service."""
  s3_client = boto3.client('s3')
  return s3_client
 
 
def upload_s3(body, bucket, file):
  """
  Stores body in the given bucket under the key file.
  @returns the response of the put_object call
  """
  put_kwargs = {'Bucket': bucket, 'Key': file, 'Body': body}
  return get_client().put_object(**put_kwargs)

Writing aws_lambda/upload.py


In [5]:
%%writefile aws_lambda/utils.py
from datetime import datetime as dt
from datetime import timedelta as td
import requests, boto3, os
from botocore.errorfactory import ClientError


def get_client():
    """
    Builds and returns a boto3 client for the S3 service
    """
    s3_client = boto3.client('s3')
    return s3_client


def get_prev_filename(bucket, file_prefix, bookmark_file, baseline_file):
    """
    Returns the name of the last processed file, as recorded in the bookmark
    object stored at <file_prefix>/<bookmark_file> in the bucket.
    Falls back to baseline_file when the bookmark object does not exist;
    any other S3 client error is re-raised.
    """
    client = get_client()
    try:
        response = client.get_object(
            Bucket=bucket,
            Key=f'{file_prefix}/{bookmark_file}'
        )
        return response['Body'].read().decode('utf-8')
    except ClientError as err:
        # only a missing bookmark is expected; everything else is a real failure
        if err.response['Error']['Code'] != 'NoSuchKey':
            raise
        return baseline_file


def get_next_filename(prev_file):
    """
    Get next filename by adding one hour to prev_file date
    @prev_file name such as 2021-01-31-23.json.gz (hour is not zero padded)
    @returns name of the following hourly file, e.g. 2021-02-01-0.json.gz
    """
    # strip() tolerates a trailing newline in the stored bookmark contents
    dt_part = prev_file.strip().split('.')[0]
    # %m is the month; %M (minute) would leave the month at its default of
    # January and produce wrong names whenever the day or month rolls over.
    next_dt = dt.strptime(dt_part, '%Y-%m-%d-%H') + td(hours=1)
    # The hour is formatted by hand because the '%-H' strftime directive is a
    # platform-specific extension.
    next_filename = f"{next_dt.strftime('%Y-%m-%d')}-{next_dt.hour}.json.gz"
    return next_filename


def upload_bookmark(bucket, file_prefix, bookmark_file, bookmark_contents):
    """
    Writes bookmark_contents (UTF-8 encoded) to <file_prefix>/<bookmark_file>
    in the given S3 bucket
    """
    client = get_client()
    bookmark_key = f'{file_prefix}/{bookmark_file}'
    payload = bookmark_contents.encode('utf-8')
    client.put_object(Bucket=bucket, Key=bookmark_key, Body=payload)

Writing aws_lambda/utils.py


In [6]:
%%writefile aws_lambda/lambda_function.py
import os
import boto3
from download import download_file
from upload import upload_s3
from utils import get_prev_filename, get_next_filename, upload_bookmark

 
def lambda_handler(event, context):
    """
    Lambda entry point: downloads every GH Archive file that follows the
    bookmarked one and uploads it to S3, advancing the bookmark after each file.
    Configuration is read from the BUCKET_NAME, FILE_PREFIX, BOOKMARK_FILE and
    BASELINE_FILE environment variables.
    @returns response of the last S3 upload, or None when nothing new was found
    """
    # Run the code in current environment to get appropriate permissions
    environ = os.environ.get('ENVIRON')
    if environ == 'DEV':
        print(f'Running in {environ} environment')
        os.environ.setdefault('AWS_PROFILE', 'glad')

    # Get the environment variables
    bucket = os.environ.get('BUCKET_NAME')
    file_prefix = os.environ.get('FILE_PREFIX')
    bookmark_file = os.environ.get('BOOKMARK_FILE')
    baseline_file = os.environ.get('BASELINE_FILE')

    # Stays None when the very first download is already a 404, so the final
    # return cannot hit an unbound local.
    upload_res = None

    # download and incrementally upload file to S3
    while True:
        prev_filename = get_prev_filename(bucket, file_prefix, bookmark_file, baseline_file)
        filename = get_next_filename(prev_filename)
        download_res = download_file(filename)
        if download_res.status_code == 404:
            print(f'Invalid file name or downloads caught up till {prev_filename}')
            break

        # Any other error status carries an error page rather than an archive:
        # fail the invocation instead of storing it and advancing the bookmark.
        download_res.raise_for_status()

        upload_res = upload_s3(
            download_res.content,
            bucket,
            f'{file_prefix}/{filename}'
        )
        print(f'File {filename} successfully processed')

        # update bookmark
        upload_bookmark(bucket, file_prefix, bookmark_file, filename)

    return upload_res

Writing aws_lambda/lambda_function.py


In [7]:
!pip install requests -t aws_lambda/libs

Collecting requests
  Downloading requests-2.27.1-py2.py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 1.3 MB/s 
[?25hCollecting urllib3<1.27,>=1.21.1
  Downloading urllib3-1.26.9-py2.py3-none-any.whl (138 kB)
[K     |████████████████████████████████| 138 kB 11.3 MB/s 
[?25hCollecting charset-normalizer~=2.0.0
  Downloading charset_normalizer-2.0.12-py3-none-any.whl (39 kB)
Collecting certifi>=2017.4.17
  Downloading certifi-2021.10.8-py2.py3-none-any.whl (149 kB)
[K     |████████████████████████████████| 149 kB 46.3 MB/s 
[?25hCollecting idna<4,>=2.5
  Downloading idna-3.3-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 6.4 MB/s 
[?25hInstalling collected packages: urllib3, idna, charset-normalizer, certifi, requests
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires request

In [8]:
%%writefile aws_lambda/pack_code.sh
#!/bin/sh

# Build the Lambda deployment package: third-party dependencies from libs/
# go to the root of the archive, followed by the handler source files.

# Stop at the first failing command. Without this a failed `cd libs` would
# still run zip and pack the wrong directory into the parent folder.
set -e

# zip required code files along with dependencies
CURRENT_DIR=$(pwd)
cd libs
zip -r ../github-activity-lambda.zip .
# quoted so that a working directory containing spaces still resolves
cd "${CURRENT_DIR}"
zip -g github-activity-lambda.zip lambda_function.py download.py upload.py utils.py

Writing aws_lambda/pack_code.sh


In [None]:
%cd /content/aws_lambda
!sh pack_lambda_code.sh
%cd /content

/content/aws_lambda
  adding: requests/ (stored 0%)
  adding: requests/exceptions.py (deflated 67%)
  adding: requests/_internal_utils.py (deflated 51%)
  adding: requests/structures.py (deflated 62%)
  adding: requests/cookies.py (deflated 73%)
  adding: requests/help.py (deflated 71%)
  adding: requests/status_codes.py (deflated 60%)
  adding: requests/__pycache__/ (stored 0%)
  adding: requests/__pycache__/hooks.cpython-37.pyc (deflated 35%)
  adding: requests/__pycache__/__init__.cpython-37.pyc (deflated 41%)
  adding: requests/__pycache__/status_codes.cpython-37.pyc (deflated 47%)
  adding: requests/__pycache__/cookies.cpython-37.pyc (deflated 60%)
  adding: requests/__pycache__/certs.cpython-37.pyc (deflated 29%)
  adding: requests/__pycache__/models.cpython-37.pyc (deflated 54%)
  adding: requests/__pycache__/__version__.cpython-37.pyc (deflated 26%)
  adding: requests/__pycache__/exceptions.cpython-37.pyc (deflated 65%)
  adding: requests/__pycache__/auth.cpython-37.pyc (deflat

### Automate using EventBridge
EventBridge -> Create rule -> name: GH-hourly -> schedule: every 60 minutes -> target: Lambda function -> select the function name -> Create

Monitor using Cloudwatch

## 2: Preprocessing using Spark

In [9]:
!mkdir -p spark

In [10]:
%%writefile spark/utils.py

from pyspark.sql import SparkSession
 

def get_spark_session(env, app_name):
    """
    Creates (or reuses) a spark session for the given environment
    @env 'DEV' for a local master, 'PROD' for yarn as resource manager
    @app_name application name registered with spark
    @returns spark session, or None when env is neither 'DEV' nor 'PROD'
    """
    if env == 'DEV':
        # local cluster
        master_url = 'local'
    elif env == 'PROD':
        # production cluster with yarn as resource manager
        master_url = 'yarn'
    else:
        return None

    builder = SparkSession.builder.master(master_url).appName(app_name)
    return builder.getOrCreate()

Writing spark/utils.py


In [11]:
%%writefile spark/read.py

def from_files(spark, data_dir, file_pattern, file_format):
    """
    Loads every file matching the pattern inside a directory
    @spark spark session object
    @data_dir directory to read files from
    @file_pattern prefix for files
    @file_format one of csv, json, or parquet
    @returns spark dataframe
    """
    reader = spark.read.format(file_format)
    source_path = f'{data_dir}/{file_pattern}'
    return reader.load(source_path)

Writing spark/read.py


In [12]:
%%writefile spark/transform.py

from pyspark.sql.functions import year, \
    month, dayofmonth
 
 
def transform(df):
    """
    Adds year, month and day columns, each derived from the created_at column
    """
    date_parts = (
        ('year', year),
        ('month', month),
        ('day', dayofmonth),
    )
    for column_name, extract in date_parts:
        df = df.withColumn(column_name, extract('created_at'))
    return df


Writing spark/transform.py


In [13]:
%%writefile spark/write.py

def to_files(df, tgt_dir, file_format):
    """
    Appends the dataframe to the target directory, partitioned by
    year, month and day, after coalescing it to 16 partitions
    @df dataframe to write
    @tgt_dir target directory
    @file_format one of csv, json or parquet
    """
    writer = df.coalesce(16).write
    writer = writer.partitionBy('year', 'month', 'day')
    writer = writer.mode('append').format(file_format)
    writer.save(tgt_dir)


Writing spark/write.py


In [14]:
%%writefile spark/app.py
import os
from utils import get_spark_session
from read import from_files
from transform import transform
from write import to_files
 
 
def main():
    """
    Driver: reads the source files matching <SRC_FILE_PATTERN>-*, adds the
    date columns and writes the result to the target directory.
    All settings are taken from environment variables.
    """
    getenv = os.environ.get

    # runtime configuration
    env = getenv('ENVIRON')
    src_dir = getenv('SRC_DIR')
    file_pattern = f"{getenv('SRC_FILE_PATTERN')}-*"
    src_file_format = getenv('SRC_FILE_FORMAT')
    tgt_dir = getenv('TGT_DIR')
    tgt_file_format = getenv('TGT_FILE_FORMAT')

    spark = get_spark_session(env, 'GitHub Activity - Reading Data')

    # read -> transform -> write
    df_source = from_files(spark, src_dir, file_pattern, src_file_format)
    df_transformed = transform(df_source)
    to_files(df_transformed, tgt_dir, tgt_file_format)

    # quick look at what was written
    df_transformed.printSchema()
    df_transformed.select('repo.*').show()
 
 
if __name__ == '__main__':
    main()

Writing spark/app.py


### Deploy using Spark Client Mode

In [15]:
%%writefile spark/spark-run.sh
#!/bin/sh

# Settings that app.py reads from the environment.
# Double-check SRC_DIR and TGT_DIR before launching.
export ENVIRON=PROD
export SRC_DIR=/user/${USER}/github-activity-gb/landing/ghactivity
export SRC_FILE_FORMAT=json
export TGT_DIR=/user/${USER}/github-activity-gb/raw/ghactivity
export TGT_FILE_FORMAT=parquet

export PYSPARK_PYTHON=python3

# Submit the job once per day (day 1, day 2, day 3); the day is handed to
# app.py through the exported SRC_FILE_PATTERN variable.
for SRC_FILE_PATTERN in 2022-04-01 2022-04-02 2022-04-03; do
    export SRC_FILE_PATTERN

    spark2-submit --master yarn \
        --py-files ghactivity.zip \
        app.py
done



Writing spark/spark-run.sh


In [16]:
%%writefile spark/test.py
import getpass
import os

from pyspark.sql.functions import to_date
from utils import get_spark_session

# Validation script: compares the landing (JSON) data with the partitioned
# parquet output by printing schemas, samples and record counts.

username = getpass.getuser()

# `os` has to be imported for this lookup; ENVIRON selects the spark master
env = os.environ.get('ENVIRON')
spark = get_spark_session(env, 'GitHub Activity - Reading Data')


src_file_path = f'/user/{username}/github-activity/landing/ghactivity'
src_df = spark.read.json(src_file_path)
src_df.printSchema()
src_df.show()
# a bare count() shows nothing when run as a script, so print the result
print(src_df.count())

src_df.groupBy(to_date('created_at').alias('created_at')).count().show()

tgt_file_path = f'/user/{username}/github-activity/raw/ghactivity'
tgt_df = spark.read.parquet(tgt_file_path)
tgt_df.printSchema()
tgt_df.show()
print(tgt_df.count())
tgt_df.groupBy('year', 'month', 'day').count().show()

Writing spark/test.py


### Deploy using Spark Cluster Mode

```
spark-submit \
    --master yarn \
    --deploy-mode cluster \
    --conf "spark.yarn.appMasterEnv.ENVIRON=PROD" \
    --conf "spark.yarn.appMasterEnv.SRC_DIR=/user/hadoop/prod/landing/ghactivity" \
    --conf "spark.yarn.appMasterEnv.SRC_FILE_FORMAT=json" \
    --conf "spark.yarn.appMasterEnv.TGT_DIR=/user/hadoop/prod/raw/ghactivity/" \
    --conf "spark.yarn.appMasterEnv.TGT_FILE_FORMAT=parquet" \
    --conf "spark.yarn.appMasterEnv.SRC_FILE_PATTERN=2021-01-15" \
    --py-files ghactivity.zip\
    app.py

```

### Deploy using AWS EMR Step Executions

## 3: Automate ETL using AWS Glue

1.   First, we create Glue crawler to the S3 landing folder. 

2.   Attach a custom policy to the  AWSGlueServiceRole-GitHub role we created during crawler, with read, write, delete permissions to the bucket where data is stored.
<br> We will also store the Spark logs generated by the GitHub-related Glue jobs in the same bucket. 
```
{
    "Version": "2012-10-17",
        "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "s3:*Object"
            ],
            "Resource": [
                "arn:aws:s3:::github-activity-gb/*"
            ]
        }
        ]
}
```




### Glue Spark Job to Partition data by Date column and Store in Parquet format

### Spark History Server UI

* When we submit Glue Jobs, it uses Spark under the hood. We need to access Spark UI to troubleshoot some of the issues.

* There are several ways to access Spark UI. One of the approaches is to use Docker Container which contains Spark UI Server.

* You can set up a local Docker based Spark UI Server using these instructions from AWS Glue Samples GitHub repository. Make sure to clone the repository before running any docker commands as the image is not available under docker hub.

 

```
git clone https://github.com/aws-samples/aws-glue-samples
cd aws-glue-samples/utilities/Spark_UI
docker build -t glue/sparkui:latest .
```

Once the container is built make sure to add required environment variables as [mentioned in the instructions before](https://github.com/aws-samples/aws-glue-samples/tree/master/utilities/Spark_UI#start-the-spark-history-server) starting the docker container with Spark UI Server.

* Set LOG_DIR by replacing s3a://path_to_eventlog with your event log directory

* Set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY with your valid AWS credentials.

In [18]:
!mkdir -p glue

In [19]:
%%writefile glue/preprocess_job.py
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql.functions import date_format, substring
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
 
## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# Glue / Spark bootstrap
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Source: table registered in the Glue catalog
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database="itvghlandingdb",
    table_name="ghactivitycsv",
    transformation_ctx="datasource0",
)

# The first 10 characters of created_at are the date part; the three
# partition columns are all derived from it.
created_date = substring('created_at', 1, 10)
df = (
    datasource0.toDF()
    .withColumn('year', date_format(created_date, 'yyyy'))
    .withColumn('month', date_format(created_date, 'MM'))
    .withColumn('day', date_format(created_date, 'dd'))
)

dyf = DynamicFrame.fromDF(dataframe=df, glue_ctx=glueContext, name="dyf")

# Sink: snappy-compressed parquet in S3, partitioned by year/month/day
sink_options = {
    "path": "s3://github-activity-gb/raw/ghactivity/",
    "compression": "snappy",
    "partitionKeys": ["year", "month", "day"],
}
datasink4 = glueContext.write_dynamic_frame.from_options(
    frame=dyf,
    connection_type="s3",
    connection_options=sink_options,
    format="glueparquet",
    transformation_ctx="datasink4",
)

job.commit()

Writing glue/preprocess_job.py


### Validate using Athena

 Once the table structure is refreshed, we can validate using Athena by running some standard validation queries.

1. Get the number of records from the table.

 `SELECT count(1) FROM githubdb.ghactivity;`

2. Get the number of new repositories added.

 ```
SELECT count(1), count(distinct repo.id) FROM githubdb.ghactivity
WHERE type = 'CreateEvent'
AND payload.ref_type = 'repository';
```

3. Preview repo related details using repo column of type struct.
 ```
SELECT repo FROM githubdb.ghactivity
WHERE type = 'CreateEvent'
AND payload.ref_type = 'repository'
LIMIT 10;
```

4. Get the number of repositories created for each of the 3 days.
```
SELECT substr(created_at, 1, 10), count(1), count(distinct id)
FROM githubdb.ghactivity
WHERE type = 'CreateEvent'
AND payload.ref_type = 'repository'
GROUP BY substr(created_at, 1, 10);
```

 Here are some of the observations.
 *   As parquet is columnar storage, the performance will be relatively better compared to JSON.
 *   The compression rate with respect to snappy might be lower than compression rate with gzip against JSON.





### Perform Incremental ETL using Glue Bookmark

We can start by cleaning up previous data, and start over.

```
aws s3 ls s3://github-activity-gb/raw/ --profile glad
aws s3 ls s3://github-activity-gb/raw/ghactivity/ --profile glad
aws s3 rm s3://github-activity-gb/raw/ --recursive --profile glad
```

We can enable Glue Bookmark either at Job level or at Run level (which will be active only for that specific run).
After running again, we can validate data using Athena queries again.

* Listing jobs
```
aws glue list-jobs \
    --profile glad \
    --region ap-south-1
```

Get job details
```
aws glue \
    get-job \
        --job-name github_json_to_parquet \
        --profile glad \
        --region ap-south-1
```
Get job run ids. The latest one will be typically at top.
```
aws glue \
    get-job-runs \
        --job-name github_json_to_parquet \
        --profile glad \
        --region ap-south-1
```
Get job run details to verify if job is successful or not.
```
aws glue \
    get-job-run \
        --job-name github_json_to_parquet \
        --run-id jr_a350197ce2d5cc3168160813e28bef293e0edd4fc2fe8f458191885d0bb32f96 \
        --profile glad \
        --region ap-south-1
```
Get job bookmark details. This information will be used to read the data in incremental fashion in subsequent runs. Make sure to keep track of it to compare with subsequent runs.
```
aws glue \
    get-job-bookmark \
        --job-name github_json_to_parquet \
        --profile glad \
        --region ap-south-1
```
We can use reset-job-bookmark to reset (remove) the bookmark. It comes in handy when we want to start the jobs from the beginning. We can also reset to a particular run using its run id.

```
aws glue reset-job-bookmark \
    --job-name github_json_to_parquet \
    --profile glad \
    --region ap-south-1
```


### Validate Data using Athena Queries
We run the same athena queries as earlier and compare the results as well as performance.

When we recrawl the table, we might run into a known issue with Athena. Refer to [this document](https://docs.amazonaws.cn/en_us/athena/latest/ug/updates-and-partitions.html) for details.

We run the below scripts to drop the partitions and add them using Athena.

```
ALTER TABLE githubrawdb.ghactivity
DROP PARTITION (year = '2021', month = '01', day = '16');
 
MSCK REPAIR TABLE githubrawdb.ghactivity;
```

Get the number of records from the table.
```
SELECT count(1) FROM ghactivity;
```

Get the number of new repositories added.
```
SELECT count(1), count(distinct repo.id) FROM ghactivity
WHERE type = 'CreateEvent'
AND payload.ref_type = 'repository';
```

Preview repo related details using repo column of type struct.
```
SELECT repo FROM ghactivity
WHERE type = 'CreateEvent'
AND payload.ref_type = 'repository'
LIMIT 10;
```

Get the number of repositories created for each of the 4 days. Make sure to compare with previous runs to ensure that counts do not change too much.
```
SELECT substr(created_at, 1, 10), count(1), count(distinct id) FROM ghactivity
WHERE type = 'CreateEvent'
AND payload.ref_type = 'repository'
GROUP BY substr(created_at, 1, 10);
```

In [25]:
!!pip install pipreqs > /dev/null
!pipreqs .

INFO: Successfully saved requirements file in ./requirements.txt


In [None]:
!zip -r code2.zip aws_lambda glue spark extract_data.py

In [24]:
!unzip -l code2.zip

Archive:  code2.zip
  Length      Date    Time    Name
---------  ---------- -----   ----
        0  2022-05-10 07:41   aws_lambda/
      221  2022-05-10 07:40   aws_lambda/upload.py
     1374  2022-05-10 07:40   aws_lambda/lambda_function.py
        0  2022-05-10 07:40   aws_lambda/libs/
        0  2022-05-10 07:40   aws_lambda/libs/idna-3.3.dist-info/
        4  2022-05-10 07:40   aws_lambda/libs/idna-3.3.dist-info/INSTALLER
     1523  2022-05-10 07:40   aws_lambda/libs/idna-3.3.dist-info/LICENSE.md
     9765  2022-05-10 07:40   aws_lambda/libs/idna-3.3.dist-info/METADATA
     1457  2022-05-10 07:40   aws_lambda/libs/idna-3.3.dist-info/RECORD
        5  2022-05-10 07:40   aws_lambda/libs/idna-3.3.dist-info/top_level.txt
       92  2022-05-10 07:40   aws_lambda/libs/idna-3.3.dist-info/WHEEL
        0  2022-05-10 07:40   aws_lambda/libs/certifi/
        0  2022-05-10 07:40   aws_lambda/libs/certifi/__pycache__/
     1095  2022-05-10 07:40   aws_lambda/libs/certifi/__pycache__/core.cpyt