In [1]:
import boto3
import pickle


def get_creds():
    sm_client = boto3.client(
        'secretsmanager',
        region_name='us-east-1'
    )

    secret_token = sm_client.get_secret_value(SecretId='gmail_token')['SecretBinary']
    creds = pickle.loads(secret_token)
    return creds

In [2]:
from googleapiclient.discovery import build

creds = get_creds()
service = build('gmail', 'v1', credentials=creds)

In [3]:
users = service.users()

In [4]:
dynamodb = boto3.resource('dynamodb')

In [5]:
table = dynamodb.Table('gmail_jobs')

In [6]:
item = {
    'job_id': 'gmail_ingest',
    'job_description': 'Ingest data from gmail to s3',
    'is_active': 'Y',
    'baseline_days': 45
}

In [7]:
table.put_item(Item=item)

{'ResponseMetadata': {'RequestId': 'DJFFKGVR9GONHIOOUHTC3QA74FVV4KQNSO5AEMVJF66Q9ASUAAJG',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'server': 'Server',
   'date': 'Sat, 14 May 2022 08:13:35 GMT',
   'content-type': 'application/x-amz-json-1.0',
   'content-length': '2',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'DJFFKGVR9GONHIOOUHTC3QA74FVV4KQNSO5AEMVJF66Q9ASUAAJG',
   'x-amz-crc32': '2745614147'},
  'RetryAttempts': 0}}

In [8]:
def get_job_details(job_name):
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table('gmail_jobs')
    job_details = table.get_item(Key={'job_id': job_name})['Item']
    return job_details

In [9]:
import datetime
import time

def get_job_run_time_range(job_details):
    job_start_time = int(time.mktime(datetime.datetime.now().timetuple()))
    if job_details.get('job_run_bookmark_details'):
        job_run_bookmark_details = job_details.get('job_run_bookmark_details')
        last_run_start_time_epoch = int(job_run_bookmark_details['last_run_start_time_epoch'])
        last_run_end_time_epoch = int(job_run_bookmark_details['last_run_end_time_epoch'])
        last_run_diff = datetime.datetime.now().date() - datetime.datetime.fromtimestamp(last_run_end_time_epoch).date()
        if last_run_diff.days > 1:
            start_time_epoch = last_run_end_time_epoch
            end_time = datetime.datetime.fromtimestamp(last_run_end_time_epoch).date() + datetime.timedelta(days=1)
            end_time_epoch = int(time.mktime(end_time.timetuple()))
        else:
            start_time_epoch = last_run_end_time_epoch
            end_time_epoch = int(time.mktime(datetime.datetime.now().timetuple()))  
    else:
        baseline_days = int(job_details['baseline_days'])
        start_time = datetime.datetime.now().date() - datetime.timedelta(days=int(baseline_days))
        end_time = start_time + datetime.timedelta(days=1)
        start_time_epoch = int(time.mktime(start_time.timetuple()))
        end_time_epoch = int(time.mktime(end_time.timetuple()))
    return job_start_time, start_time_epoch, end_time_epoch

In [10]:
job_details = get_job_details('gmail_ingest')

In [11]:
job_start_time, start_time_epoch, end_time_epoch = get_job_run_time_range(job_details)
(job_start_time, start_time_epoch, end_time_epoch)

(1652516060, 1648578600, 1648665000)

In [12]:
import boto3

sm_client = boto3.client(
    'secretsmanager',
    region_name='us-east-1'
)

secret_token = sm_client.get_secret_value(SecretId='gmail_token')['SecretBinary']

In [13]:
import pickle

creds = pickle.loads(secret_token)

from googleapiclient.discovery import build

service = build('gmail', 'v1', credentials=creds)

In [18]:
def get_message_ids(start_time_epoch, end_time_epoch):
    message_ids = []
    next_page_token = None

    users = service.users()
    while True:
        if next_page_token:
            print(f'Processing in range between {start_time_epoch} and {end_time_epoch} using token {next_page_token}')
            messages = users. \
                messages(). \
                list(
                    userId='me', 
                    q=f'after:{start_time_epoch} before:{end_time_epoch}',
                    pageToken=next_page_token
                ). \
                execute()
            message_ids += messages['messages']
            next_page_token = messages.get('nextPageToken')
        else:
            print(f'Processing in range between {start_time_epoch} and {end_time_epoch}')
            messages = users. \
                messages(). \
                list(
                    userId='me', 
                    q=f'after:{start_time_epoch} before:{end_time_epoch}'
                ). \
                execute()
            message_ids = messages['messages']
            next_page_token = messages.get('nextPageToken')
        if next_page_token == None:
            break
    return message_ids

In [19]:
import pandas as pd

def get_messages(message_ids):
    messages = []
    for message_id in message_ids:
        message = users.messages().get(userId='me', id=message_id['id']).execute()
        messages.append(message)
    return pd.DataFrame(messages)

In [20]:
import uuid


def write_messages_to_s3(messages_df, s3_bucket, s3_prefix):
    messages_df.to_json(f's3://{s3_bucket}/{s3_prefix}/part-{uuid.uuid1()}.json', orient='records', lines=True)
    print(f'Successfully saved messages to s3://{s3_bucket}/{s3_prefix}/part-{uuid.uuid1()}.json')

In [21]:
message_ids = get_message_ids(start_time_epoch, end_time_epoch)

Processing in range between 1648578600 and 1648665000


In [22]:
messages = get_messages(message_ids)

In [1]:
!aws s3 rm s3://itversitydata/messages --recursive

delete: s3://itversitydata/messages/part-0caec0ea-d440-11ec-b424-3e22fbd03f7b.json
delete: s3://itversitydata/messages/part-2d270ce4-d43e-11ec-b424-3e22fbd03f7b.json
delete: s3://itversitydata/messages/part-06440a4a-d45d-11ec-ae97-27afa2619490.json
delete: s3://itversitydata/messages/part-1d0c068a-d442-11ec-b424-3e22fbd03f7b.json
delete: s3://itversitydata/messages/part-10e0fb32-d441-11ec-b424-3e22fbd03f7b.json
delete: s3://itversitydata/messages/part-26ecd208-d440-11ec-b424-3e22fbd03f7b.json
delete: s3://itversitydata/messages/part-06fa5106-d43f-11ec-b424-3e22fbd03f7b.json
delete: s3://itversitydata/messages/part-3077ac54-d43f-11ec-b424-3e22fbd03f7b.json
delete: s3://itversitydata/messages/part-0c533d08-d43e-11ec-b424-3e22fbd03f7b.json
delete: s3://itversitydata/messages/part-1f8c1af6-d43f-11ec-b424-3e22fbd03f7b.json
delete: s3://itversitydata/messages/part-343f9e9e-d441-11ec-b424-3e22fbd03f7b.json
delete: s3://itversitydata/messages/part-3e4c2adc-d442-11ec-b424-3e22fbd03f7b.json
dele

In [24]:
write_messages_to_s3(messages, 'itversitydata', 'messages')

Successfully saved messages to s3://itversitydata/messages/part-8f762d52-d35e-11ec-a08e-3e22fbd03f7b.json


In [25]:
!aws s3 ls s3://itversitydata/messages/

2022-05-14 13:49:14    3925149 part-892cd202-d35e-11ec-a08e-3e22fbd03f7b.json
