<a href="https://colab.research.google.com/github/cyberthreatgurl/GmailPDFExtractor/blob/main/GmailPDFExtractor_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries for Colab authentication and Google APIs
from google.colab import auth
import google.auth
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from googleapiclient.http import MediaIoBaseUpload
import base64
import io

In [2]:
# --- SETTINGS ---
# The name of the folder in Google Drive where PDFs will be saved.
# run_script() passes this to get_or_create_folder(), which looks the folder up
# by exact name among non-trashed folders and creates it when none is found.
DRIVE_FOLDER_NAME = "Gmail PDF Attachments"


In [4]:
def get_or_create_folder(drive_service, folder_name):
    """Return the ID of the Drive folder named ``folder_name``, creating it if absent.

    Args:
        drive_service: Drive v3 service object exposing ``files().list(...)`` and
            ``files().create(...)``.
        folder_name: Exact name of the folder to look up. Apostrophes and
            backslashes are escaped before the name is placed in the search query.

    Returns:
        The folder ID, or ``None`` when the Drive API raises ``HttpError``.
        When several non-trashed folders share the name, the first one listed wins.
    """
    # Drive search queries delimit string values with single quotes, so a name such
    # as "Bob's PDFs" must be escaped or the query is malformed. Backslashes are
    # escaped first so the backslash added for the quote is not doubled.
    escaped_name = folder_name.replace("\\", "\\\\").replace("'", "\\'")

    try:
        # Search for the folder
        query = (
            "mimeType='application/vnd.google-apps.folder' "
            f"and name='{escaped_name}' and trashed=false"
        )
        response = drive_service.files().list(
            q=query, spaces='drive', fields='files(id, name)'
        ).execute()
        files = response.get('files', [])

        if files:
            # Folder exists, return the ID of the first one found
            folder_id = files[0].get('id')
            print(f"Folder '{folder_name}' already exists. ID: {folder_id}")
            return folder_id

        # Folder does not exist, create it (metadata carries the raw, unescaped name)
        print(f"Folder '{folder_name}' not found. Creating it...")
        file_metadata = {
            'name': folder_name,
            'mimeType': 'application/vnd.google-apps.folder'
        }
        folder = drive_service.files().create(body=file_metadata, fields='id').execute()
        print(f"Folder created successfully. ID: {folder.get('id')}")
        return folder.get('id')
    except HttpError as error:
        print(f"An error occurred: {error}")
        return None

def _list_matching_messages(gmail_service, query):
    """Return every message reference matching ``query``, following all result pages."""
    messages = []
    page_token = None
    while True:
        params = {"userId": "me", "q": query}
        if page_token:
            params["pageToken"] = page_token
        result = gmail_service.users().messages().list(**params).execute()
        messages.extend(result.get("messages", []))
        page_token = result.get("nextPageToken")
        if not page_token:
            return messages


def _iter_pdf_parts(payload):
    """Yield each part at or below ``payload`` whose filename ends in '.pdf', in document order."""
    pending = [payload]
    while pending:
        part = pending.pop()
        filename = part.get('filename')
        if filename and filename.lower().endswith('.pdf'):
            yield part
        # Attachments can sit inside nested multipart containers; push children
        # reversed so they are popped (visited) in their original order.
        pending.extend(reversed(part.get('parts') or []))


def run_script():
    """
    Main function to connect to Gmail, find emails with PDF attachments,
    and upload them to a specified Google Drive folder.

    Steps:
      1. Authenticate in Colab and build the Gmail and Drive service objects.
      2. Resolve (or create) the Drive folder named by ``DRIVE_FOLDER_NAME``.
      3. List every message matching ``has:attachment filename:pdf`` across all
         result pages.
      4. For each message still labelled ``UNREAD``, upload every PDF part
         (including parts nested in multipart containers) to the folder, then
         remove the ``UNREAD`` label so the message is skipped on later runs.

    Returns:
        None. Progress is printed; an ``HttpError`` from either API is printed
        and ends the run.
    """
    # Step 1: Authenticate the user in the Colab environment.
    # NOTE(review): the output recorded with this notebook shows HTTP 401 on the
    # first Drive request; presumably these default credentials are not accepted
    # for the Drive/Gmail APIs in that runtime — confirm the auth setup.
    auth.authenticate_user()
    creds, project = google.auth.default()

    try:
        # Build the service objects for Gmail and Drive using the Colab credentials
        gmail_service = build("gmail", "v1", credentials=creds)
        drive_service = build("drive", "v3", credentials=creds)

        # Get the ID of the destination folder in Google Drive
        folder_id = get_or_create_folder(drive_service, DRIVE_FOLDER_NAME)
        if not folder_id:
            print("Could not find or create the Google Drive folder. Exiting.")
            return

        # Search for ALL emails with PDF attachments (every page, not just the first)
        query = "has:attachment filename:pdf"
        messages = _list_matching_messages(gmail_service, query)

        if not messages:
            print("No emails with PDF attachments found. ✅")
            return

        print(f"Found {len(messages)} email(s) with PDF attachments. Processing...")

        for msg in messages:
            msg_id = msg['id']
            message = gmail_service.users().messages().get(userId="me", id=msg_id).execute()

            # Skip if the email has already been processed (by checking for the UNREAD label).
            # A message without any labels has no 'labelIds' key, hence .get().
            if 'UNREAD' not in message.get('labelIds', []):
                print(f"Skipping already processed email ID {msg_id}.")
                continue

            for part in _iter_pdf_parts(message.get('payload', {})):
                filename = part['filename']
                body = part.get('body', {})
                attachment_id = body.get('attachmentId')

                if attachment_id:
                    attachment = gmail_service.users().messages().attachments().get(
                        userId="me", messageId=msg_id, id=attachment_id
                    ).execute()
                    encoded_data = attachment.get('data')
                else:
                    # Without an attachmentId, use any data carried inline in the part body.
                    encoded_data = body.get('data')

                if not encoded_data:
                    print(f"  -> Skipping '{filename}': no attachment data available.")
                    continue

                file_data = base64.urlsafe_b64decode(encoded_data.encode('UTF-8'))

                media_body = io.BytesIO(file_data)
                file_metadata = {
                    'name': filename,
                    'parents': [folder_id]
                }

                print(f"  -> Uploading '{filename}' to Google Drive...")
                drive_service.files().create(
                    body=file_metadata,
                    media_body=MediaIoBaseUpload(media_body, mimetype='application/pdf')
                ).execute()
                print(f"  -> Successfully uploaded '{filename}'.")

            # Mark the email as read by removing the 'UNREAD' label
            gmail_service.users().messages().modify(
                userId="me", id=msg_id, body={'removeLabelIds': ['UNREAD']}
            ).execute()
            print(f"-> Marked email ID {msg_id} as read to prevent re-processing.\n")

    except HttpError as error:
        print(f"An error occurred: {error}")



In [5]:
# --- Execute the script ---
# Runs the whole pipeline: authenticates in Colab, resolves the Drive folder,
# uploads PDF attachments from unread matching emails, then marks those emails
# as read. Progress and any API error are printed as cell output.
run_script()

An error occurred: <HttpError 401 when requesting https://www.googleapis.com/drive/v3/files?q=mimeType%3D%27application%2Fvnd.google-apps.folder%27+and+name%3D%27Gmail+PDF+Attachments%27+and+trashed%3Dfalse&spaces=drive&fields=files%28id%2C+name%29&alt=json returned "Request had invalid authentication credentials. Expected OAuth 2 access token, login cookie or other valid authentication credential. See https://developers.google.com/identity/sign-in/web/devconsole-project.". Details: "[{'message': 'Invalid Credentials', 'domain': 'global', 'reason': 'authError', 'location': 'Authorization', 'locationType': 'header'}]">
Could not find or create the Google Drive folder. Exiting.
