In [1]:
import boto3
import pickle


def get_creds():
    """Load the pickled Gmail credentials object kept in AWS Secrets Manager.

    Reads the binary payload of the ``gmail_token`` secret in ``us-east-1``
    and unpickles it.

    Returns:
        The object produced by unpickling the secret's ``SecretBinary`` payload.
        The notebook passes it as ``credentials`` when building the Gmail client.
    """
    secrets = boto3.client('secretsmanager', region_name='us-east-1')
    response = secrets.get_secret_value(SecretId='gmail_token')
    # NOTE(review): pickle.loads can execute arbitrary code while decoding, so
    # this is only safe as long as write access to the secret is trusted.
    return pickle.loads(response['SecretBinary'])

In [2]:
from googleapiclient.discovery import build

# Load the stored credentials and build a Gmail API v1 client with them.
creds = get_creds()
service = build('gmail', 'v1', credentials=creds)

In [3]:
# Handle to the users() resource of the Gmail client; the per-message fetch loop further down reuses it.
users = service.users()

In [4]:
# DynamoDB resource handle; no region or credentials are passed explicitly here.
dynamodb = boto3.resource('dynamodb')

In [5]:
# Table that the job configuration item below is written to and later read from.
table = dynamodb.Table('gmail_jobs')

In [6]:
# Configuration item for the Gmail ingestion job; a later cell stores it in DynamoDB.
item = {
    'job_id': 'gmail_ingest',  # key that get_job_details uses to look the job up
    'job_description': 'Ingest data from gmail to s3',
    'is_active': 'Y',
    'baseline_days': 45  # days to look back on a run that has no bookmark details yet
}

In [7]:
# Persist the job configuration item; the response dict is echoed as the cell output.
table.put_item(Item=item)

{'ResponseMetadata': {'RequestId': 'NCCBFLM3PE7BJP6ULCCOPN3DDRVV4KQNSO5AEMVJF66Q9ASUAAJG',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'server': 'Server',
   'date': 'Sat, 14 May 2022 08:10:04 GMT',
   'content-type': 'application/x-amz-json-1.0',
   'content-length': '2',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'NCCBFLM3PE7BJP6ULCCOPN3DDRVV4KQNSO5AEMVJF66Q9ASUAAJG',
   'x-amz-crc32': '2745614147'},
  'RetryAttempts': 0}}

In [8]:
def get_job_details(job_name, table=None):
    """Fetch the configuration item of a job from DynamoDB.

    Args:
        job_name: Value of the ``job_id`` key to look up.
        table: Optional object exposing ``get_item(Key=...)``, for example an
            already created ``dynamodb.Table``. When omitted, the ``gmail_jobs``
            table is opened through a new boto3 DynamoDB resource.

    Returns:
        The ``Item`` mapping taken from the ``get_item`` response.

    Raises:
        KeyError: If the response contains no ``Item`` entry, i.e. nothing is
            stored under ``job_name``.
    """
    if table is None:
        table = boto3.resource('dynamodb').Table('gmail_jobs')

    response = table.get_item(Key={'job_id': job_name})
    if 'Item' not in response:
        # Same exception type as the bare ['Item'] lookup would raise, but with
        # a message that names the job instead of just 'Item'.
        raise KeyError(f'No item found for job_id={job_name!r}')
    return response['Item']

In [9]:
import datetime
import time

def get_job_run_time_range(job_details, now=None):
    """Work out the time window, in epoch seconds, that the next run should cover.

    Args:
        job_details: Mapping describing the job. It must contain either
            ``job_run_bookmark_details`` (with ``last_run_end_time_epoch``) or
            ``baseline_days``.
        now: Optional naive ``datetime.datetime`` treated as the current local
            time. Defaults to ``datetime.datetime.now()``.

    Returns:
        A tuple ``(job_start_time, start_time_epoch, end_time_epoch)`` of ints:

        * ``job_start_time`` is ``now`` as epoch seconds.
        * With a bookmark, the window starts where the last run ended. If that
          end lies more than one calendar day back, the window stops at the
          midnight following the last run's end date; otherwise it stops at
          ``now``.
        * Without a bookmark, the window is the single day that begins
          ``baseline_days`` days before today's date.

    Raises:
        KeyError: If the bookmark lacks ``last_run_end_time_epoch``, or there
            is no bookmark and ``baseline_days`` is missing.
    """
    if now is None:
        now = datetime.datetime.now()
    # One clock reading feeds every calculation below, so a call that straddles
    # midnight cannot mix two different values of "today".
    job_start_time = int(time.mktime(now.timetuple()))
    one_day = datetime.timedelta(days=1)

    bookmark = job_details.get('job_run_bookmark_details')
    if bookmark:
        start_time_epoch = int(bookmark['last_run_end_time_epoch'])
        last_run_end_date = datetime.datetime.fromtimestamp(start_time_epoch).date()
        if (now.date() - last_run_end_date).days > 1:
            # Catching up: advance only to the midnight after the last run's end date.
            end_time = last_run_end_date + one_day
            end_time_epoch = int(time.mktime(end_time.timetuple()))
        else:
            end_time_epoch = job_start_time
    else:
        baseline_days = int(job_details['baseline_days'])
        start_time = now.date() - datetime.timedelta(days=baseline_days)
        end_time = start_time + one_day
        start_time_epoch = int(time.mktime(start_time.timetuple()))
        end_time_epoch = int(time.mktime(end_time.timetuple()))
    return job_start_time, start_time_epoch, end_time_epoch

In [10]:
# Read the stored configuration of the ingestion job back from DynamoDB.
job_details = get_job_details('gmail_ingest')

In [11]:
# Compute the window for this run; the bare tuple on the last line is echoed as the cell output.
job_start_time, start_time_epoch, end_time_epoch = get_job_run_time_range(job_details)
(job_start_time, start_time_epoch, end_time_epoch)

(1652515806, 1648578600, 1648665000)

In [12]:
import boto3

# Fetch the binary payload of the gmail_token secret from Secrets Manager.
# NOTE(review): this repeats the first half of get_creds() from the first cell;
# calling get_creds() would avoid the duplication.
sm_client = boto3.client(
    'secretsmanager',
    region_name='us-east-1'
)

secret_token = sm_client.get_secret_value(SecretId='gmail_token')['SecretBinary']

In [13]:
import pickle

# Rebuild the credentials object from the secret bytes fetched in the previous cell.
# NOTE(review): pickle.loads can execute arbitrary code while decoding; this is
# only safe while the contents of the secret are trusted.
creds = pickle.loads(secret_token)

from googleapiclient.discovery import build

# Recreate the Gmail API client; this rebinds the `service` name that get_message_ids reads.
service = build('gmail', 'v1', credentials=creds)

In [14]:
def get_message_ids(start_time_epoch, end_time_epoch, gmail_service=None):
    """List the Gmail messages of the authenticated user within a time window.

    Pages through ``users().messages().list`` with the query
    ``after:<start_time_epoch> before:<end_time_epoch>`` and collects the
    entries of every page. A progress line is printed before each request.

    Args:
        start_time_epoch: Lower bound placed after ``after:`` in the query.
        end_time_epoch: Upper bound placed after ``before:`` in the query.
        gmail_service: Optional Gmail API client to query. Defaults to the
            notebook-level ``service`` object.

    Returns:
        A list with the ``messages`` entries of all pages, in the order
        received. Empty when no page contains a ``messages`` entry.
    """
    if gmail_service is None:
        # Fall back to the client built in an earlier notebook cell.
        gmail_service = service

    messages_api = gmail_service.users().messages()
    query = f'after:{start_time_epoch} before:{end_time_epoch}'

    message_ids = []
    next_page_token = None
    while True:
        list_kwargs = {'userId': 'me', 'q': query}
        if next_page_token:
            print(f'Processing in range between {start_time_epoch} and {end_time_epoch} using token {next_page_token}')
            list_kwargs['pageToken'] = next_page_token
        else:
            print(f'Processing in range between {start_time_epoch} and {end_time_epoch}')

        response = messages_api.list(**list_kwargs).execute()
        # A response may carry no 'messages' entry (e.g. nothing matched the
        # query); treat that as an empty page instead of raising KeyError.
        message_ids.extend(response.get('messages', []))

        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break
    return message_ids

In [15]:
# Collect the message entries for the computed window; the bare name echoes the list as cell output.
message_ids = get_message_ids(start_time_epoch, end_time_epoch)
message_ids

Processing in range between 1648578600 and 1648665000


[{'id': '17fdc0b349836c2f', 'threadId': '17fdc0b349836c2f'},
 {'id': '17fdc0139bb6ef0e', 'threadId': '17fdc0139bb6ef0e'},
 {'id': '17fdbf69c6a845ea', 'threadId': '17fdbf69c6a845ea'},
 {'id': '17fdbd3a5bbc0550', 'threadId': '17fdbd3a5bbc0550'},
 {'id': '17fdbd0a3a385f4c', 'threadId': '17fdbd0a3a385f4c'},
 {'id': '17fdbc81ba38885c', 'threadId': '17fdbc81ba38885c'},
 {'id': '17fdbb8fdc3029e0', 'threadId': '17fdbb8fdc3029e0'},
 {'id': '17fdbb202cec0aec', 'threadId': '17fdbb202cec0aec'},
 {'id': '17fdba6487207633', 'threadId': '17fdba6487207633'},
 {'id': '17fdb9f619d8c36d', 'threadId': '17fdb9f619d8c36d'},
 {'id': '17fdb9ad57f34dbe', 'threadId': '17fdb9ad57f34dbe'},
 {'id': '17fdb8185604211e', 'threadId': '17fdb8185604211e'},
 {'id': '17fdb75066750fce', 'threadId': '17fdb75066750fce'},
 {'id': '17fdb64366fad0c4', 'threadId': '17fdb64366fad0c4'},
 {'id': '17fdb4f1cd3f25a7', 'threadId': '17fdb4f1cd3f25a7'},
 {'id': '17fdb4c53bf4c77b', 'threadId': '17fdb4c53bf4c77b'},
 {'id': '17fdb383418f4b9

In [16]:
# Fetch the full message for every listed id, one API request per message.
# NOTE(review): `users` was created from the `service` built near the top of the
# notebook, not from the one rebuilt in the later cell — confirm that is intended.
messages = []

for message_id in message_ids:
    message = users.messages().get(userId='me', id=message_id['id']).execute()
    messages.append(message)

In [17]:
import pandas as pd

In [18]:
# Build a DataFrame from the list of fetched message dicts.
messages_df = pd.DataFrame(messages)

In [19]:
# Display the frame as the cell output.
messages_df

Unnamed: 0,id,threadId,labelIds,snippet,payload,sizeEstimate,historyId,internalDate
0,17fdc0b349836c2f,17fdc0b349836c2f,"[UNREAD, IMPORTANT, CATEGORY_UPDATES, INBOX]","Hi Durga Gadiraju, Zoom will be updating your ...","{'partId': '', 'mimeType': 'text/html', 'filen...",10318,11776146,1648664195000
1,17fdc0139bb6ef0e,17fdc0139bb6ef0e,"[CATEGORY_PROMOTIONS, UNREAD, INBOX]",[Click here] to see your new student enrollmen...,"{'partId': '', 'mimeType': 'text/html', 'filen...",62673,11776484,1648663540000
2,17fdbf69c6a845ea,17fdbf69c6a845ea,"[CATEGORY_PROMOTIONS, UNREAD, INBOX]",Company Says 300000 Miles of Roads To Be Mappe...,"{'partId': '', 'mimeType': 'multipart/alternat...",221549,11776483,1648662723000
3,17fdbd3a5bbc0550,17fdbd3a5bbc0550,"[UNREAD, CATEGORY_UPDATES, INBOX]","Apply for this job Hello Durga, Hope you&#39;r...","{'partId': '', 'mimeType': 'text/html', 'filen...",9613,11776145,1648651720000
4,17fdbd0a3a385f4c,17fdbd0a3a385f4c,"[CATEGORY_PROMOTIONS, UNREAD, INBOX]",March Newsletter State of Pantheon Fresh Conte...,"{'partId': '', 'mimeType': 'multipart/alternat...",47715,11776482,1648660292000
...,...,...,...,...,...,...,...,...
68,17fd74cbda3a3723,17fd74cbda3a3723,"[UNREAD, CATEGORY_UPDATES, INBOX]",Get the information you need to send money hom...,"{'partId': '', 'mimeType': 'multipart/alternat...",63299,11764727,1648584603000
69,17fd73a656b405ca,17fd73a656b405ca,"[UNREAD, CATEGORY_FORUMS, INBOX]",Meetup Randy Krum (Organizer) sent a message t...,"{'partId': '', 'mimeType': 'multipart/related'...",122751,11764933,1648583400000
70,17fd71827cb08527,17fd71827cb08527,"[CATEGORY_PROMOTIONS, UNREAD, INBOX]",Get Cash Back at these just-added stores! ‌ ‌ ...,"{'partId': '', 'mimeType': 'multipart/mixed', ...",51900,11764700,1648581157000
71,17fd6fbe4c226ff9,17fd6fbe4c226ff9,"[UNREAD, CATEGORY_UPDATES, INBOX]","ITVersity, Inc., you successfully sent a payme...","{'partId': '', 'mimeType': 'text/html', 'filen...",51281,11764726,1648579305000
