<a href="https://colab.research.google.com/github/cyberthreatgurl/GmailPDFExtractor/blob/main/GmailPDFExtractor_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install google-auth-oauthlib==0.4.4

Collecting google-auth-oauthlib==0.4.4
  Downloading google_auth_oauthlib-0.4.4-py2.py3-none-any.whl.metadata (2.4 kB)
Downloading google_auth_oauthlib-0.4.4-py2.py3-none-any.whl (18 kB)
Installing collected packages: google-auth-oauthlib
  Attempting uninstall: google-auth-oauthlib
    Found existing installation: google-auth-oauthlib 1.2.2
    Uninstalling google-auth-oauthlib-1.2.2:
      Successfully uninstalled google-auth-oauthlib-1.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-gbq 0.29.2 requires google-auth-oauthlib>=0.7.0, but you have google-auth-oauthlib 0.4.4 which is incompatible.[0m[31m
[0mSuccessfully installed google-auth-oauthlib-0.4.4


In [2]:
# Import necessary libraries
import os.path
import base64
import io
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from googleapiclient.http import MediaIoBaseUpload

In [3]:
# --- THIS SCOPES VARIABLE IS NOW THE SOURCE OF TRUTH ---
# It explicitly tells Google which permissions to ask for.
SCOPES = [
    "https://www.googleapis.com/auth/gmail.readonly",
    "https://www.googleapis.com/auth/drive.file",
    "https://www.googleapis.com/auth/gmail.modify"
]

# The name of the folder in Google Drive where PDFs will be saved.
DRIVE_FOLDER_NAME = "Gmail PDF Attachments"

In [7]:
def explicit_authenticate():
    """
    Performs an explicit authentication flow using credentials.json,
    perfect for forcing the correct scopes in Colab.
    """
    creds = None
    if os.path.exists("token.json"):
        creds = Credentials.from_authorized_user_file("token.json", SCOPES)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            # This uses your uploaded credentials.json
            flow = InstalledAppFlow.from_client_secrets_file("credentials.json", SCOPES)
            # This runs an auth flow designed for command-line/notebook environments
            creds = flow.run_console()
        with open("token.json", "w") as token:
            token.write(creds.to_json())
    return creds

# The rest of the script is the same, but it will now use the credentials
# we generated above, which are guaranteed to have the correct permissions.

def get_or_create_folder(drive_service, folder_name):
    """Checks if a folder exists in Google Drive and returns its ID. Creates it if not found."""
    try:
        query = f"mimeType='application/vnd.google-apps.folder' and name='{folder_name}' and trashed=false"
        response = drive_service.files().list(q=query, spaces='drive', fields='files(id, name)').execute()
        files = response.get('files', [])
        if files:
            print(f"Folder '{folder_name}' already exists. ID: {files[0].get('id')}")
            return files[0].get('id')
        else:
            print(f"Folder '{folder_name}' not found. Creating it...")
            file_metadata = {'name': folder_name, 'mimeType': 'application/vnd.google-apps.folder'}
            folder = drive_service.files().create(body=file_metadata, fields='id').execute()
            print(f"Folder created successfully. ID: {folder.get('id')}")
            return folder.get('id')
    except HttpError as error:
        print(f"An error occurred: {error}")
        return None

def run_script():
    """Main function to run the process."""
    # Use our new explicit authentication function
    creds = explicit_authenticate()
    try:
        gmail_service = build("gmail", "v1", credentials=creds)
        drive_service = build("drive", "v3", credentials=creds)

        folder_id = get_or_create_folder(drive_service, DRIVE_FOLDER_NAME)
        if not folder_id:
            print("Could not find or create the Google Drive folder. Exiting.")
            return

        # ... (The rest of the email processing logic is identical)
        query = "has:attachment filename:pdf"
        result = gmail_service.users().messages().list(userId="me", q=query).execute()
        messages = result.get("messages", [])

        if not messages:
            print("No emails with PDF attachments found. ✅")
            return

        print(f"Found {len(messages)} email(s) with PDF attachments. Processing...")
        # ... (rest of the code is the same)
        for msg in messages:
            msg_id = msg['id']
            message = gmail_service.users().messages().get(userId="me", id=msg_id).execute()
            # the following three lines need to be un-commented out to being
            # extracting only new messages.
            #if 'UNREAD' not in message['labelIds']:
            #    print(f"Skipping already processed email ID {msg_id}.")
            #    continue
            payload = message.get('payload', {})
            parts = payload.get('parts', [])
            for part in parts:
                filename = part.get('filename')
                if filename and filename.lower().endswith('.pdf'):
                    attachment_id = part['body'].get('attachmentId')
                    attachment = gmail_service.users().messages().attachments().get(userId="me", messageId=msg_id, id=attachment_id).execute()
                    file_data = base64.urlsafe_b64decode(attachment['data'].encode('UTF-8'))
                    media_body = io.BytesIO(file_data)
                    file_metadata = {'name': filename, 'parents': [folder_id]}
                    print(f"  -> Uploading '{filename}' to Google Drive...")
                    drive_service.files().create(body=file_metadata, media_body=MediaIoBaseUpload(media_body, mimetype='application/pdf')).execute()
                    print(f"  -> Successfully uploaded '{filename}'.")
            gmail_service.users().messages().modify(userId="me", id=msg_id, body={'removeLabelIds': ['UNREAD']}).execute()
            print(f"-> Marked email ID {msg_id} as read to prevent re-processing.\n")

    except HttpError as error:
        print(f"An error occurred: {error}")

In [None]:
# --- Execute the script ---
run_script()

Folder 'Gmail PDF Attachments' already exists. ID: 1kD6rE0SdYVkz2ZRonpy9xFeuGbVkGj0F
Found 100 email(s) with PDF attachments. Processing...
  -> Uploading 'home-assistant-cloud-invoice-2025-08-20.pdf' to Google Drive...
  -> Successfully uploaded 'home-assistant-cloud-invoice-2025-08-20.pdf'.
-> Marked email ID 198c5449477eb688 as read to prevent re-processing.

  -> Uploading 'INV0001705300.pdf' to Google Drive...
  -> Successfully uploaded 'INV0001705300.pdf'.
-> Marked email ID 198bf3604e0487fe as read to prevent re-processing.

  -> Uploading 'adrian-shaw-resume-22-July-2025-cybersecurity-manager.pdf' to Google Drive...
  -> Successfully uploaded 'adrian-shaw-resume-22-July-2025-cybersecurity-manager.pdf'.
-> Marked email ID 198baee1b6985136 as read to prevent re-processing.

  -> Uploading 'ri92-19.pdf' to Google Drive...
  -> Successfully uploaded 'ri92-19.pdf'.
  -> Uploading 'ri92-19a.pdf' to Google Drive...
  -> Successfully uploaded 'ri92-19a.pdf'.
  -> Uploading 'SHAW- ADRIA