## Create Glue Catalog Tables for GMail Messages

Now that the data is ready, let us create a Glue Catalog table using a Glue Crawler. We will also validate whether the table has been created.

* A Glue Crawler can crawl existing files and create or update Glue Catalog tables.
* Create a Glue Crawler using the following information.
    * Crawler: **GMail Crawler**
    * Database: **gmail_db**
    * Table Name: **messages**
* Once the crawler is created, run it using the AWS Web Console.

In [3]:
import boto3

In [4]:
# Create a boto3 client for the AWS Glue service; the cells below use it for
# every Glue API call.
# NOTE: no region or credentials are passed here, so boto3 resolves them from
# its default configuration.
glue_client = boto3.client('glue')

In [5]:
# Echo the raw ListCrawlers response: the crawler names are under
# 'CrawlerNames', alongside the HTTP 'ResponseMetadata'.
glue_client.list_crawlers()

{'CrawlerNames': ['Flights Data Crawler',
  'Flights Parquet Data Crawler',
  'GHActivity Landing Crawler',
  'GHActivity Raw Crawler',
  'GHRepos Landing Crawler',
  'GMail Crawler',
  'ITV Retail DB JSON Crawler',
  'ITV Retail DB Parquet Crawler',
  'Retail Crawler',
  'TR Flights CSV Crawler',
  'TR Flights Parquet Crawler',
  'TR GHActivity Landing Crawler',
  'TR GHActivity Raw Crawler',
  'TR Retail Crawler',
  'Yelp Dataset JSON Crawler',
  'Yelp Predictions Crawler'],
 'ResponseMetadata': {'RequestId': '61ab6dc3-e264-4230-87af-9a3e28c66b7b',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Mon, 16 May 2022 04:23:42 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '435',
   'connection': 'keep-alive',
   'x-amzn-requestid': '61ab6dc3-e264-4230-87af-9a3e28c66b7b'},
  'RetryAttempts': 0}}

In [8]:
# Collect the names of all Glue crawlers.
# ListCrawlers is a paginated API: a single call returns only the first page
# and a 'NextToken' when more names remain, so keep requesting pages until the
# token is no longer present.
crawlers = []
request_args = {}
while True:
    page = glue_client.list_crawlers(**request_args)
    crawlers.extend(page['CrawlerNames'])
    next_token = page.get('NextToken')
    if not next_token:
        break
    request_args['NextToken'] = next_token

# Display the complete list; 'GMail Crawler' should be among the names.
crawlers

['Flights Data Crawler',
 'Flights Parquet Data Crawler',
 'GHActivity Landing Crawler',
 'GHActivity Raw Crawler',
 'GHRepos Landing Crawler',
 'GMail Crawler',
 'ITV Retail DB JSON Crawler',
 'ITV Retail DB Parquet Crawler',
 'Retail Crawler',
 'TR Flights CSV Crawler',
 'TR Flights Parquet Crawler',
 'TR GHActivity Landing Crawler',
 'TR GHActivity Raw Crawler',
 'TR Retail Crawler',
 'Yelp Dataset JSON Crawler',
 'Yelp Predictions Crawler']

In [10]:
# Echo the raw GetDatabases response: each entry of 'DatabaseList' is a dict
# describing one Glue Catalog database (its 'Name', 'CreateTime', etc.).
glue_client.get_databases()

{'DatabaseList': [{'Name': 'bls_db',
   'Description': '',
   'LocationUri': 'dbfs:/user/hive/warehouse/bls_db.db',
   'Parameters': {},
   'CreateTime': datetime.datetime(2021, 4, 11, 22, 47, 39, tzinfo=tzlocal()),
   'CreateTableDefaultPermissions': [{'Principal': {'DataLakePrincipalIdentifier': 'IAM_ALLOWED_PRINCIPALS'},
     'Permissions': ['ALL']}],
   'CatalogId': '582845781536'},
  {'Name': 'default',
   'Description': 'Default Hive database',
   'LocationUri': 'hdfs://ip-172-31-93-65.ec2.internal:8020/user/hive/warehouse',
   'CreateTime': datetime.datetime(2018, 2, 7, 0, 36, 18, tzinfo=tzlocal()),
   'CreateTableDefaultPermissions': [{'Principal': {'DataLakePrincipalIdentifier': 'IAM_ALLOWED_PRINCIPALS'},
     'Permissions': ['ALL']}],
   'CatalogId': '582845781536'},
  {'Name': 'flights-db',
   'CreateTime': datetime.datetime(2021, 1, 23, 12, 9, 32, tzinfo=tzlocal()),
   'CreateTableDefaultPermissions': [{'Principal': {'DataLakePrincipalIdentifier': 'IAM_ALLOWED_PRINCIPALS'},

In [12]:
# Gather the definitions of all Glue Catalog databases.
# GetDatabases is paginated, so a single call can return a truncated list;
# the paginator follows 'NextToken' and yields every page.
database_list = []
for page in glue_client.get_paginator('get_databases').paginate():
    database_list.extend(page['DatabaseList'])

In [13]:
# Keep only the 'Name' of each database definition.
database_names = [
    db['Name']
    for db in database_list
]

In [14]:
# Display the database names; 'gmail_db' should be in the list.
database_names

['bls_db',
 'default',
 'flights-db',
 'genlogsdb',
 'gmail_db',
 'itv_retail_db_json',
 'itvghlandingdb',
 'itvghrawdb',
 'jh_db',
 'myretail',
 'retail_db',
 'sagemaker_data_wrangler',
 'sagemaker_processing',
 'sampledb',
 'tr-flights-db',
 'trghlandingdb',
 'trghrawdb',
 'yelp_json_db',
 'yelp_predictions']

In [None]:
# IPython/Jupyter help syntax (not plain Python): the trailing '?' shows the
# documentation for get_tables instead of calling it.
glue_client.get_tables?

In [16]:
# Echo the raw GetTables response for the 'gmail_db' database: each entry of
# 'TableList' carries the table 'Name' and its 'StorageDescriptor' (columns
# and their types).
glue_client.get_tables(DatabaseName='gmail_db')

{'TableList': [{'Name': 'messages',
   'DatabaseName': 'gmail_db',
   'Owner': 'owner',
   'CreateTime': datetime.datetime(2022, 5, 7, 11, 59, 41, tzinfo=tzlocal()),
   'UpdateTime': datetime.datetime(2022, 5, 7, 11, 59, 41, tzinfo=tzlocal()),
   'LastAccessTime': datetime.datetime(2022, 5, 7, 11, 59, 41, tzinfo=tzlocal()),
   'Retention': 0,
   'StorageDescriptor': {'Columns': [{'Name': 'id', 'Type': 'string'},
     {'Name': 'threadid', 'Type': 'string'},
     {'Name': 'labelids', 'Type': 'array<string>'},
     {'Name': 'snippet', 'Type': 'string'},
     {'Name': 'payload',
      'Type': 'struct<partId:string,mimeType:string,filename:string,headers:array<struct<name:string,value:string>>,body:struct<size:int,data:string>,parts:array<struct<partId:string,mimeType:string,filename:string,headers:array<struct<name:string,value:string>>,body:struct<size:int,data:string>,parts:array<struct<partId:string,mimeType:string,filename:string,headers:array<struct<name:string,value:string>>,body:str

In [19]:
# Gather the definitions of all tables in the 'gmail_db' database.
# GetTables is paginated, so a single call can miss tables beyond the first
# page; the paginator follows 'NextToken' and yields every page.
table_list = []
for page in glue_client.get_paginator('get_tables').paginate(DatabaseName='gmail_db'):
    table_list.extend(page['TableList'])

In [20]:
# Keep only the 'Name' of each table definition.
table_names = [
    tbl['Name']
    for tbl in table_list
]

In [21]:
# Display the table names; 'messages' confirms the crawler created the table.
table_names

['messages']