In [1]:
!aws s3 rm s3://itversitydata/messages --recursive

delete: s3://itversitydata/messages/part-257533cd-d4aa-11ec-9629-7b9a7ee0bd59.json
delete: s3://itversitydata/messages/part-0bbbaaf6-d4b5-11ec-8a45-e79808170361.json
delete: s3://itversitydata/messages/part-2bab2b1d-d4af-11ec-b64e-e79808170361.json
delete: s3://itversitydata/messages/part-0ea44930-d4ae-11ec-a61f-e79808170361.json
delete: s3://itversitydata/messages/part-02dced21-d4b0-11ec-a5b7-e79808170361.json
delete: s3://itversitydata/messages/part-1ff26f5a-d4b1-11ec-88cb-e79808170361.json
delete: s3://itversitydata/messages/part-1d0129a2-d4ac-11ec-a98c-e79808170361.json
delete: s3://itversitydata/messages/part-2ac49af3-d4b6-11ec-b598-e79808170361.json
delete: s3://itversitydata/messages/part-14ce9e2d-d4b3-11ec-afac-e79808170361.json
delete: s3://itversitydata/messages/part-31d38dcf-d4c0-11ec-9475-350bffe5e5eb.json
delete: s3://itversitydata/messages/part-331bba4b-d4b4-11ec-9428-e79808170361.json
delete: s3://itversitydata/messages/part-35dee65e-d4b9-11ec-a6f6-e79808170361.json
dele

In [2]:
import boto3

dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('gmail_jobs')
table.delete_item(Key={'job_id': 'gmail_jobs'})
item = {
    'job_id': 'gmail_ingest',
    'job_description': 'Ingest data from gmail to s3',
    'is_active': 'Y',
    'baseline_days': 45
}
table.put_item(Item=item)

{'ResponseMetadata': {'RequestId': 'MK3AP85BV0DI25SVU5NNQB5R3FVV4KQNSO5AEMVJF66Q9ASUAAJG',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'server': 'Server',
   'date': 'Mon, 16 May 2022 06:35:58 GMT',
   'content-type': 'application/x-amz-json-1.0',
   'content-length': '2',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'MK3AP85BV0DI25SVU5NNQB5R3FVV4KQNSO5AEMVJF66Q9ASUAAJG',
   'x-amz-crc32': '2745614147'},
  'RetryAttempts': 0}}

In [3]:
import boto3
import pickle


def get_creds():
    sm_client = boto3.client(
        'secretsmanager',
        region_name='us-east-1'
    )

    secret_token = sm_client.get_secret_value(SecretId='gmail_token')['SecretBinary']
    creds = pickle.loads(secret_token)
    return creds

In [4]:
from googleapiclient.discovery import build


def get_users():
    creds = get_creds()
    service = build('gmail', 'v1', credentials=creds)
    return service.users()

In [5]:
def get_job_details(job_name):
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table('gmail_jobs')
    job_details = table.get_item(Key={'job_id': job_name})['Item']
    return job_details

In [6]:
import datetime
import time

def get_job_run_time_range(job_details):
    job_start_time = int(time.mktime(datetime.datetime.now().timetuple()))
    if job_details.get('job_run_bookmark_details'):
        job_run_bookmark_details = job_details.get('job_run_bookmark_details')
        last_run_start_time_epoch = int(job_run_bookmark_details['last_run_start_time_epoch'])
        last_run_end_time_epoch = int(job_run_bookmark_details['last_run_end_time_epoch'])
        last_run_diff = datetime.datetime.now().date() - datetime.datetime.fromtimestamp(last_run_end_time_epoch).date()
        if last_run_diff.days > 1:
            start_time_epoch = last_run_end_time_epoch
            end_time = datetime.datetime.fromtimestamp(last_run_end_time_epoch).date() + datetime.timedelta(days=1)
            end_time_epoch = int(time.mktime(end_time.timetuple()))
        else:
            start_time_epoch = last_run_end_time_epoch
            end_time_epoch = int(time.mktime(datetime.datetime.now().timetuple()))  
    else:
        baseline_days = int(job_details['baseline_days'])
        start_time = datetime.datetime.now().date() - datetime.timedelta(days=int(baseline_days))
        end_time = start_time + datetime.timedelta(days=1)
        start_time_epoch = int(time.mktime(start_time.timetuple()))
        end_time_epoch = int(time.mktime(end_time.timetuple()))
    return job_start_time, start_time_epoch, end_time_epoch

In [7]:
def get_message_ids(start_time_epoch, end_time_epoch):
    message_ids = []
    next_page_token = None

    users = get_users()
    while True:
        if next_page_token:
            print(f'Processing in range between {start_time_epoch} and {end_time_epoch} using token {next_page_token}')
            messages = users. \
                messages(). \
                list(
                    userId='me', 
                    q=f'after:{start_time_epoch} before:{end_time_epoch}',
                    pageToken=next_page_token
                ). \
                execute()
            message_ids += messages['messages']
            next_page_token = messages.get('nextPageToken')
        else:
            print(f'Processing in range between {start_time_epoch} and {end_time_epoch}')
            messages = users. \
                messages(). \
                list(
                    userId='me', 
                    q=f'after:{start_time_epoch} before:{end_time_epoch}'
                ). \
                execute()
            message_ids = messages['messages']
            next_page_token = messages.get('nextPageToken')
        if next_page_token == None:
            break
    return message_ids

In [8]:
import pandas as pd

def get_messages(message_ids):
    users = get_users()
    messages = []
    for message_id in message_ids:
        message = users.messages().get(userId='me', id=message_id['id']).execute()
        messages.append(message)
    return pd.DataFrame(messages)

In [16]:
import uuid


def write_messages_to_s3(messages_df, s3_bucket, s3_prefix):
    messages_df.to_json(f's3://{s3_bucket}/{s3_prefix}/part-{uuid.uuid1()}.json', orient='records', lines=True)
    print(f'Successfully saved messages to s3://{s3_bucket}/{s3_prefix}/part-{uuid.uuid1()}.json')

In [17]:
def save_job_run_details(job_details, job_start_time, message_ids, start_time_epoch, end_time_epoch, file_name):
    dynamodb = boto3.resource('dynamodb')
    message_count = len(message_ids)
    max_message_id = max([message_id['id'] for message_id in message_ids])
    job_run_details_item = {
        'job_id': job_details['job_id'],
        'job_run_time': job_start_time,
        'job_run_bookmark_details': {
            'max_message_id': max_message_id,
            'start_time_epoch': start_time_epoch,
            'end_time_epoch': end_time_epoch
        },
        'rows_processed': message_count,
        'file_name': file_name
    }
    job_run_details_table = dynamodb.Table('gmail_job_run_details')
    job_run_details_table.put_item(Item=job_run_details_item)
    
    job_details_table = dynamodb.Table('gmail_jobs')
    job_details['job_run_bookmark_details'] = {
        'last_run_max_message_id': max_message_id,
        'last_run_start_time_epoch': start_time_epoch,
        'last_run_end_time_epoch': end_time_epoch 
    }
    job_details_table.put_item(Item=job_details)

In [18]:
job_details = get_job_details('gmail_ingest')

In [19]:
job_start_time, start_time_epoch, end_time_epoch = get_job_run_time_range(job_details)
(job_start_time, start_time_epoch, end_time_epoch)

(1652683161, 1648751400, 1648837800)

In [20]:
message_ids = get_message_ids(start_time_epoch, end_time_epoch)

Processing in range between 1648751400 and 1648837800


In [21]:
messages = get_messages(message_ids)

In [22]:
file_name = write_messages_to_s3(messages, 'itversitydata', 'messages')

Successfully saved messages to s3://itversitydata/messages/part-059e77fc-d4e3-11ec-8d5b-3e22fbd03f7b.json


In [23]:
save_job_run_details(job_details, job_start_time, message_ids, start_time_epoch, end_time_epoch, file_name)

In [24]:
!aws s3 ls s3://itversitydata/messages/

2022-05-16 12:09:56    4349311 part-fef7d1d2-d4e2-11ec-8d5b-3e22fbd03f7b.json


In [25]:
job_details = get_job_details('gmail_ingest')
job_start_time, start_time_epoch, end_time_epoch = get_job_run_time_range(job_details)
message_ids = get_message_ids(start_time_epoch, end_time_epoch)
messages = get_messages(message_ids)
file_name = write_messages_to_s3(messages, 'itversitydata', 'messages')
save_job_run_details(job_details, job_start_time, message_ids, start_time_epoch, end_time_epoch, file_name)

Processing in range between 1648837800 and 1648924200
Successfully saved messages to s3://itversitydata/messages/part-1a5ee5b4-d4e3-11ec-8d5b-3e22fbd03f7b.json


In [26]:
!aws s3 ls s3://itversitydata/messages/

2022-05-16 12:10:32    3074469 part-1415c2d6-d4e3-11ec-8d5b-3e22fbd03f7b.json
2022-05-16 12:09:56    4349311 part-fef7d1d2-d4e2-11ec-8d5b-3e22fbd03f7b.json
