In [6]:
import boto3
import sagemaker

# SageMaker session, plus the execution role and default S3 bucket used by
# the rest of the notebook.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()

# Build a single boto3 session and reuse it for both the region lookup and
# the Glue client, so the two are guaranteed to share one configuration.
boto_session = boto3.Session()
region = boto_session.region_name

glue = boto_session.client(service_name='glue', region_name=region)

# Must update the trust policy of the notebook's IAM execution role to include the following statement, so that AWS Glue can assume the role:

```
    {
      "Sid": "",
      "Effect": "Allow",
      "Principal": {
        "Service": "glue.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
```

In [13]:
# Crawler definition, built once so the create and update paths below always
# apply exactly the same settings.
crawler_config = dict(
    Name='amazon_reviews_crawler',
    Role=role,
    DatabaseName='dsoaws',
    Description='Amazon Customer Reviews Dataset Crawler',
    # Catalog target: the `amazon_reviews_tsv` table in the `dsoaws` database.
    Targets={
        'CatalogTargets': [
            {
                'DatabaseName': 'dsoaws',
                'Tables': [
                    'amazon_reviews_tsv',
                ]
            }
        ]
    },
    Schedule='cron(59 23 * * ? *)',  # run every night at 23:59 UTC
    SchemaChangePolicy={
        'DeleteBehavior': 'LOG'  # |'DELETE_FROM_DATABASE'|'DEPRECATE_IN_DATABASE'
    },
    RecrawlPolicy={
        'RecrawlBehavior': 'CRAWL_EVERYTHING'  # 'CRAWL_NEW_FOLDERS_ONLY' for S3 Targets
    },
)

try:
    create_response = glue.create_crawler(**crawler_config)
except glue.exceptions.AlreadyExistsException:
    # The crawler name is fixed, so re-running this cell (e.g. Restart Kernel ->
    # Run All) would otherwise fail. Re-apply the same definition instead so the
    # cell is idempotent and `create_response` is always defined afterwards.
    create_response = glue.update_crawler(**crawler_config)

In [14]:
# Pretty-print the Glue API response stored in `create_response`.
from pprint import pprint
pprint(create_response)

{'ResponseMetadata': {'HTTPHeaders': {'connection': 'keep-alive',
                                      'content-length': '2',
                                      'content-type': 'application/x-amz-json-1.1',
                                      'date': 'Sat, 26 Dec 2020 22:50:31 GMT',
                                      'x-amzn-requestid': '7cb34ad1-6074-4c4e-92f8-74456b68a99e'},
                      'HTTPStatusCode': 200,
                      'RequestId': '7cb34ad1-6074-4c4e-92f8-74456b68a99e',
                      'RetryAttempts': 0}}


# TODO:  Copy a `.tsv.gz` file from s3://amazon-reviews-pds/tsv/ to s3://{bucket}/amazon-reviews-pds/tsv/

# TODO:  Invoke the crawler, wait for the crawler to finish

# TODO:  Query Athena and verify the new data shows up.  
Note:  Make sure this works with new partitions of data.  We might need to run the Athena `MSCK REPAIR TABLE` command to pick up the new partitions (if the crawler isn't doing this already).