In [13]:
import base64
import os
import json
import pandas as pd
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials


SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']


class GmailConnector:
    """Thin read-only wrapper around the Gmail API ``users.messages`` resource.

    The constructor loads authorized-user credentials from ``path`` and builds
    the API client, which is kept on ``self.service``.
    """

    # Headers kept by ``process_headers``. Header names are matched
    # case-insensitively and reported under these spellings.
    _SUMMARY_HEADERS = ('Subject', 'From', 'Date', 'Content-Type')

    def __init__(self, path: str = 'token.json') -> None:
        self.path = path
        self.service = self._gmail_authenticate()

    def _gmail_authenticate(self):
        """Build a Gmail API client from the token file at ``self.path``.

        Raises:
            FileNotFoundError: if the token file does not exist. This is a
                subclass of ``EnvironmentError``/``OSError``, so callers that
                catch ``EnvironmentError`` keep working.
        """
        if not os.path.exists(self.path):
            raise FileNotFoundError(f"Credentials file: {self.path} not found.")

        creds = Credentials.from_authorized_user_file(self.path, SCOPES)
        return build('gmail', 'v1', credentials=creds)

    def _get_message(self, msg_id, user_id, fmt):
        """Fetch a single message resource using the given ``format`` value."""
        request = self.service.users().messages().get(userId=user_id, id=msg_id, format=fmt)
        return request.execute()

    @staticmethod
    def _iter_parts(payload):
        """Yield ``payload`` and all nested MIME parts, depth-first, in order."""
        pending = [payload]
        while pending:
            part = pending.pop()
            yield part
            # Reversed so that children are popped in their original order.
            pending.extend(reversed(part.get('parts') or []))

    @staticmethod
    def _decode_body(data):
        """Decode a base64url body payload into text."""
        if isinstance(data, str):
            data = data.encode('ascii')
        # base64url data may arrive without '=' padding; restore it so the
        # decoder does not reject the input.
        data += b'=' * (-len(data) % 4)
        # The body is not guaranteed to be UTF-8; replace undecodable bytes
        # instead of raising UnicodeDecodeError.
        return base64.urlsafe_b64decode(data).decode('utf-8', errors='replace')

    def list_messages(self, user_id: str = "me", max_results: int = 10):
        """Return up to ``max_results`` message references for ``user_id``.

        Returns:
            The ``messages`` list of the API response (dicts that carry at
            least an ``id`` key), or an empty list when the response has no
            such key.
        """
        response = self.service.users().messages().list(userId=user_id, maxResults=max_results).execute()
        return response.get('messages', [])

    def process_headers(self, msg_id, user_id: str = "me"):
        """Return the Subject, From, Date and Content-Type headers of a message.

        Header names are compared case-insensitively, so a message that spells
        a header ``subject`` is still picked up; the returned keys always use
        the spellings in ``_SUMMARY_HEADERS``. Headers missing from the message
        are simply absent from the result.
        """
        message = self._get_message(msg_id, user_id, 'metadata')
        wanted = {name.lower(): name for name in self._SUMMARY_HEADERS}

        header_info = {}
        for header in message.get('payload', {}).get('headers', []):
            canonical = wanted.get(header['name'].lower())
            if canonical is not None:
                header_info[canonical] = header['value']
        return header_info

    def get_message_header(self, msg_id, user_id: str = "me"):
        """Return all headers of a message as a ``{name: value}`` dict.

        When a header name occurs more than once (e.g. ``Received``), only the
        last occurrence is kept because the result is a plain dict.
        """
        # Only headers are needed, so the lighter 'metadata' format is
        # requested instead of downloading the full body.
        message = self._get_message(msg_id, user_id, 'metadata')
        headers = message.get('payload', {}).get('headers', [])
        return {header['name']: header['value'] for header in headers}

    def get_message_body(self, msg_id, user_id: str = "me"):
        """Return the decoded text of the first body part of a message.

        The MIME tree is walked depth-first starting at the top-level payload,
        and the first non-attachment part carrying inline ``body.data`` is
        decoded. This covers single-part messages (data directly on the
        payload), flat multipart messages (first entry of ``parts``) and
        nested multipart containers.

        Returns:
            The decoded body text, or an empty string when no part carries
            inline data.
        """
        message = self._get_message(msg_id, user_id, 'full')
        for part in self._iter_parts(message.get('payload') or {}):
            if part.get('filename'):
                # A non-empty filename marks an attachment, not body text.
                continue
            data = (part.get('body') or {}).get('data')
            if data:
                return self._decode_body(data)
        return ''
    


# Driver cell: connect, list messages, and inspect the first one.
# NOTE(review): the names bound here (`gmail`, `message`, `header_info`) are
# read by later cells, so they must keep these exact names.
if __name__ == '__main__':
    # Uses the default token path ('token.json'); raises if the file is missing.
    gmail = GmailConnector()
    messages = gmail.list_messages()

    emails = []
    # Only the first listed message is processed. If `messages` is empty the
    # loop body never runs and `header_info` / `message` stay undefined.
    for message in messages[:1]:
        header_info = gmail.get_message_header(message['id'])
        emails.append(header_info)
        # NOTE(review): `body` is fetched but not used anywhere in this cell.
        body = gmail.get_message_body(message['id'])
        print(header_info.keys())

dict_keys(['Delivered-To', 'Received', 'X-Google-Smtp-Source', 'X-Received', 'ARC-Seal', 'ARC-Message-Signature', 'ARC-Authentication-Results', 'Return-Path', 'Received-SPF', 'Authentication-Results', 'DKIM-Signature', 'X-Mailer', 'CFBL-Address', 'To', 'From', 'Subject', 'Date', 'MIME-Version', 'Content-Type', 'Content-Transfer-Encoding', 'X-DKIM-Options', 'X-mid', 'x-virtual-mta', 'Feedback-ID', 'X-250ok-CID', 'List-Unsubscribe', 'List-Unsubscribe-Post', 'Message-ID'])


In [3]:
# Pretty-print the header dict of the message inspected by the driver loop.
# Depends on the `header_info` global set in that loop, so this cell fails on
# a fresh kernel unless the driver cell ran first.
# NOTE(review): this import belongs with the other imports at the top, and the
# rendered output contains real addresses - clear it before sharing.
from pprint import pprint
pprint(header_info)

{'ARC-Authentication-Results': 'i=1; mx.google.com;       dkim=pass '
                               'header.i=@bf57x.hubspotemail.net header.s=hs2 '
                               'header.b=h2ecoCAX;       dkim=pass '
                               'header.i=@acquire.com header.s=hs2-19985776 '
                               'header.b=Kxxyr+z+;       spf=pass (google.com: '
                               'domain of '
                               '1axb4mbutzzpofdbkjjvtgqqirsv0cjdfvxkex-chris+2estudyx=gmail.com@bf57x.hubspotemail.net '
                               'designates 143.244.88.165 as permitted sender) '
                               'smtp.mailfrom="1axb4mbutzzpofdbkjjvtgqqirsv0cjdfvxkex-chris+2Estudyx=gmail.com@bf57x.hubspotemail.net";       '
                               'dmarc=pass (p=NONE sp=NONE dis=NONE) '
                               'header.from=acquire.com',
 'ARC-Message-Signature': 'i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; '
                      

In [16]:
# Display the raw 'full' message resource to explore its structure.
# Depends on the `gmail` and `message` globals left behind by the driver cell.
# NOTE(review): the output includes the complete payload (recipient address,
# routing headers) - clear it before sharing the notebook.
gmail.service.users().messages().get(userId="me", id=message["id"], format='full').execute()

{'id': '18bf887e314b07ca',
 'threadId': '18bf887e314b07ca',
 'labelIds': ['UNREAD', 'IMPORTANT', 'CATEGORY_PERSONAL', 'INBOX'],
 'snippet': 'Your weekly technical digest of top projects, repos, tips and tricks to stay ahead of the curve. \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c',
 'payload': {'partId': '',
  'mimeType': 'multipart/alternative',
  'filename': '',
  'headers': [{'name': 'Delivered-To', 'value': 'chris.studyx@gmail.com'},
   {'name': 'Received',
    'value': 'by 2002:a9a:55ca:0:b0:26f:e434:2e7f with SMTP id z10csp621291lkl;        Wed, 22 Nov 2023 11:35:37 -0800 (PST)'},
   {'name': 'X-Google-Smtp-Source',
    'value': 'AGHT+IF0d6NDOCY4iUkN9UyZCcsvM+gPmRS