### Imports

In [None]:
import io
from pathlib import Path
from typing import Optional, Any, List, Dict
import pandas as pd
import numpy as np
import unicodedata, re
from collections import defaultdict

from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build, Resource
from googleapiclient.errors import HttpError
from googleapiclient.http import MediaIoBaseDownload, MediaIoBaseUpload
from PIL import Image, ImageOps


### Constants

In [None]:
# --- Configuration ---
# OAuth scopes: this tells the script what permissions we need.
SCOPES = ["https://www.googleapis.com/auth/drive"]

# Project root, taken as the parent of the current working directory.
ROOT_DIR = Path.cwd().parent
# This should be the path to your service account JSON file.
SERVICE_ACCOUNT_FILE = ROOT_DIR / "secrets" / "secrets.json"
# Drive folder whose subfolders are listed and processed by main().
FOLDER_ID = "1uE_WUe82XYWubwBDqUf1avD_qdpbMInZ"
PAGE_SIZE = 100  # Number of results per page
# Keys = final file name : Values = keywords used to identify files.
# Order matters: decide_rename() returns the first key with a matching keyword.
# NOTE(review): canon() lowercases names and turns '-'/'_' into spaces, so
# keywords containing uppercase letters, accents, '-' or '_' can only match
# names that were NOT canonicalised first -- confirm which form is passed in.
REQUIRED_PDFS = {
    'miembrecia': ['miembrecia', 'membresia', 'solicitud', 'info', 'miembro', 'fyi'],
    'salud': ['salud'],
    'identificacion': ['identificacion', 'identificación', 'licencia'],
    'penales': ['penales', 'antecedentes', 'conducta', 'cap'],
    'osha': ['osha'],
    # 'rebibo_p5' was a typo for 'recibo_p5'; the misspelled variant is kept so
    # that any name matched before the fix still matches.
    'recibo-p5': ['recibo_p5', 'rebibo_p5'],
    'dopaje': ['dopaje', 'dopage'],
    'banco': ['banco', 'bancaria'],
    'formulario-499': ['499'],
    'formulario-I9': ['I9', 'I-9'],
}
# Alternative names for the P5 receipt.
# NOTE(review): not referenced by any function in this notebook -- presumably
# meant to feed the 'recibo-p5' keywords above; confirm.
RECIBO_P5 = ['recibo-dopaje', 'recibo de dopaje', 'factura-dopaje', 'recibo-p5', 'factura-p5']


### Build functions

In [None]:
def authenticate(service_account_file: Path = SERVICE_ACCOUNT_FILE, scopes: list = SCOPES):
    """Load service-account credentials for the Google Drive API.

    Args:
        service_account_file: Path to the service account JSON key file.
        scopes: OAuth scopes requested for the credentials.

    Returns:
        The object returned by ``Credentials.from_service_account_file``.
    """
    credentials = Credentials.from_service_account_file(
        service_account_file,
        scopes=scopes,
    )
    return credentials

def build_service(creds: Credentials, folder_id: str) -> object:
    """Create a Google Drive v3 API client.

    Args:
        creds: Credentials used to authorise the client.
        folder_id: Only used in the progress message printed on success.

    Returns:
        The Drive service object, or ``None`` when building it failed (the
        error is printed instead of raised).
    """
    try:
        # Open the connection (service) to v3 of the Google Drive API.
        drive = build("drive", "v3", credentials=creds)
    except HttpError as error:
        print(f"An error occurred: {error}")
        return None
    except FileNotFoundError:
        # NOTE(review): credentials are already loaded by the time this runs,
        # so it is unclear what would raise FileNotFoundError here -- confirm.
        print(f"ERROR: The service account file was not found.")
        print(f"Please make sure the file '{SERVICE_ACCOUNT_FILE}' is in the correct directory.")
        return None
    else:
        print(f"Looking for folders in folder: {folder_id}\n")
        return drive

def get_results(
    service: Resource,
    folder_id: str,
    page_size: int = 100,
    fields: str = "nextPageToken, files(id, name, mimeType)",
    mimeType: Optional[str] = None
) -> List[Dict[str, Any]]:
    """List the non-trashed files directly inside a Drive folder.

    Args:
        service: Drive API client exposing ``files().list(...)``.
        folder_id: ID of the parent folder to search.
        page_size: Number of results requested per page.
        fields: Partial-response field selector sent with every request.
        mimeType: When given, only files with this MIME type are returned.

    Returns:
        The file records from all pages, in the order received. If the API
        raises ``HttpError`` the error is printed and an empty list is
        returned (records from earlier pages are discarded).
    """
    # Assemble the search query; the mimeType clause is optional.
    clauses = [f"'{folder_id}' in parents", "trashed = false"]
    if mimeType:
        clauses.append(f"mimeType = '{mimeType}'")
    query = " and ".join(clauses)

    collected: List[Dict[str, Any]] = []
    next_token: Optional[str] = None

    try:
        while True:
            request = service.files().list(
                q=query,
                pageSize=page_size,
                fields=fields,
                pageToken=next_token,
            )
            page = request.execute()
            collected.extend(page.get("files", []))
            # The token is the bookmark for the next page; absent on the last one.
            next_token = page.get("nextPageToken")
            if not next_token:
                return collected
    except HttpError as e:
        print(f'Drive API error: {e}')
        return []

def remove_diacritics(text: str) -> str:
    """Strip accents and other combining marks from ``text``.

    The text is decomposed (Unicode NFD) so each accent becomes a separate
    combining character, and those combining characters are then dropped.
    """
    decomposed = unicodedata.normalize("NFD", text)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)

def canon(name: str) -> str:
    """Return a canonical form of a filename for keyword matching.

    Accents are removed, the text is lowercased, runs of dashes/underscores
    become a single space, whitespace runs collapse to one space, and
    leading/trailing whitespace is stripped.
    """
    text = remove_diacritics(name).lower()
    # Treat '-' and '_' as word separators.
    text = re.sub(r"[-_]+", " ", text)
    # Collapse whitespace, then trim the ends.
    return re.sub(r"\s+", " ", text).strip()

def decide_rename(name: str, d: Optional[dict] = None) -> Optional[str]:
    """Pick the canonical file name whose keywords match ``name``.

    Args:
        name: File name to classify. Matching is a plain, case-sensitive
            substring test against each keyword.
        d: Mapping of canonical name -> list of keywords. Defaults to
            ``REQUIRED_PDFS`` (looked up at call time).

    Returns:
        The first key, in mapping order, that has at least one keyword
        contained in ``name``; ``None`` when no keyword matches.
    """
    rules = REQUIRED_PDFS if d is None else d
    for new_file_name, keywords in rules.items():
        if any(keyword in name for keyword in keywords):
            return new_file_name
    # Explicit: callers must handle names that match no rule.
    return None

def build_inventory(files: Any, d: Optional[dict] = None) -> Dict[str, bool]:
    """Report which required file names are present in ``files``.

    Args:
        files: The file names found in a folder: either a dict keyed by file
            name (only the keys are used) or any iterable of name strings.
        d: Mapping whose keys are the required file names. Defaults to
            ``REQUIRED_PDFS`` (looked up at call time).

    Returns:
        A dict mapping each required file name to ``True`` if it is present
        in ``files`` and ``False`` otherwise, in the order of ``d``.
    """
    required = REQUIRED_PDFS if d is None else d
    # set() of a dict yields its keys, so both accepted input shapes work.
    present = set(files)
    return {req_name: req_name in present for req_name in required}
    
def rename_file(service: Resource, file_id: str, new_name: str):
    """Rename a Google Drive file by ID, leaving everything else unchanged.

    Args:
        service: Drive API client exposing ``files().update(...)``.
        file_id: ID of the file to rename.
        new_name: Name to give the file.

    Returns:
        The API response (requested with fields ``id`` and ``name``), or
        ``None`` if any exception occurred; the error is printed.
    """
    try:
        request = service.files().update(
            fileId=file_id,
            body={"name": new_name},
            fields="id, name",
        )
        result = request.execute()
        print(f"Renamed file {file_id} to {result['name']}")
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    return result

def get_jpg(service: Resource, file_id: str) -> io.BytesIO:
    """Download a Drive file's content into an in-memory buffer.

    Args:
        service: Drive API client exposing ``files().get_media(...)``.
        file_id: ID of the file to download.

    Returns:
        The buffer holding the downloaded bytes. It is not rewound here, so
        callers should ``seek(0)`` before reading.
    """
    buffer = io.BytesIO()
    downloader = MediaIoBaseDownload(buffer, service.files().get_media(fileId=file_id))

    # Pull chunks until the downloader reports completion.
    finished = False
    while not finished:
        _progress, finished = downloader.next_chunk()

    return buffer

def convert_to_pdf(fh: io.BytesIO) -> io.BytesIO:
    """Convert an in-memory image to a single-image PDF using Pillow.

    Args:
        fh: Buffer containing the image bytes; it is rewound before reading.

    Returns:
        A new buffer containing the PDF, positioned at its start.
    """
    fh.seek(0)  # Read the image from the beginning of the buffer
    with Image.open(fh) as source:
        # Apply the EXIF orientation so the page is the right way up.
        page = ImageOps.exif_transpose(source)
        if page.mode in ("RGBA", "P"):  # Convert to RGB if necessary
            page = page.convert("RGB")

    pdf_bytes = io.BytesIO()
    page.save(pdf_bytes, format="PDF")
    pdf_bytes.seek(0)
    return pdf_bytes

def upload_to_gdrive(
        service: Resource,
        fh: io.BytesIO,
        file_name: str,
        ext: str,
        mime_type: str,
        subfolder_id: str
) -> None:
    """Upload an in-memory file to a Google Drive folder.

    Args:
        service: Drive API client exposing ``files().create(...)``.
        fh: Buffer with the file content; it is rewound before uploading.
        file_name: Name for the new file, without extension.
        ext: Extension appended verbatim to ``file_name`` (e.g. ``".pdf"``).
        mime_type: MIME type of the uploaded content.
        subfolder_id: ID of the Drive folder that becomes the file's parent.
    """
    fh.seek(0)  # Go to the beginning of the in-memory file
    upload_name = file_name + ext
    # The keyword is ``mimetype`` (all lowercase); ``mimeType`` raises TypeError.
    media = MediaIoBaseUpload(fh, mimetype=mime_type)

    (
        service.files()
        .create(
            body={"name": upload_name, "parents": [subfolder_id]},
            media_body=media,
            fields="id",
        )
        .execute()
    )

def load_to_downloads(bytesObj: io.BytesIO, ext: str) -> None:
    """Write a buffer to ``~/Downloads/myfile<ext>`` so a sample can be inspected.

    Args:
        bytesObj: Buffer to save; it is rewound and read in full.
        ext: File suffix passed to ``Path.with_suffix`` (e.g. ``".jpg"``).
    """
    bytesObj.seek(0)

    # Target is always named "myfile" with the requested suffix.
    target = (Path.home() / "Downloads" / "myfile").with_suffix(ext)

    with target.open("wb") as out:
        out.write(bytesObj.read())

    print(f"File saved to: {target}")

def main():
    """List the subfolders of ``FOLDER_ID`` using the helpers in this cell.

    The original stub ended in a dangling ``subfolders =`` assignment, which
    is a SyntaxError and prevents every function in this cell from being
    defined. The assignment is completed with the folder listing it appears
    to have been heading towards.

    NOTE(review): a later cell redefines ``main``; confirm which one is meant
    to survive.

    Returns:
        The list of subfolder records, or an empty list when the Drive
        service could not be built.
    """
    creds = authenticate()
    service = build_service(creds, FOLDER_ID)
    if service is None:
        # build_service already printed the reason for the failure.
        return []

    subfolders = get_results(
        service,
        FOLDER_ID,
        page_size=PAGE_SIZE,
        mimeType="application/vnd.google-apps.folder",
    )
    # TODO: process each subfolder (rename / convert / inventory).
    return subfolders

In [None]:


def main():
    """Convert the JPEGs in the subfolders of ``FOLDER_ID`` to PDFs.

    Steps: authenticate, list the subfolders of ``FOLDER_ID``, and for every
    JPEG in a subfolder download it into memory, convert it to PDF with
    Pillow, and upload the PDF to the same subfolder as
    ``<original name>.pdf``.

    Drive API errors and a missing service account file are reported with
    ``print`` rather than raised; a Drive API error aborts the whole run.

    NOTE(review): the recorded run of this cell failed at the upload step with
    HTTP 403 ``storageQuotaExceeded`` ("Service Accounts do not have storage
    quota"), so uploading presumably needs a shared drive or OAuth
    delegation -- confirm before relying on this function.
    """
    try:
        # Authenticate inside the ``try`` so that a missing key file is
        # actually reported by the FileNotFoundError handler below.
        creds = authenticate()

        # build the connection (service) to the API for v3 of Google Drive
        service = build("drive", "v3", credentials=creds)

        print(f"Looking for folders in folder: {FOLDER_ID}\n")

        # Query to find all folders in the specified folder
        query = f"'{FOLDER_ID}' in parents and mimeType='application/vnd.google-apps.folder'"

        # Get a dictionary with key 'files' and a list of dicts with 'id' and 'name' keys
        results = (
            service.files().list(q=query, pageSize=PAGE_SIZE, fields="files(id, name)").execute()
        )

        # Get the list of dicts with 'id' and 'name' keys
        subfolders = results.get("files", [])

        if not subfolders:
            print(f"No google folder found in {FOLDER_ID}.")
            print("Please check:")
            print("1. The folder ID is correct.")
            print(f"2. The service account email has been shared on that folder.")
            print("3. There are actually folders in that folder.")
            # Nothing to process.
            return

        print(f"Found {len(subfolders)} folders:")

        # for test. remove later
        subfolders = [subfolder for subfolder in subfolders if subfolder['id'] == '16lD8ghXV0SQg8_TfK1GZrGCoEw9jPtzW']

        for subfolder in subfolders:
            print(f"Processing folder: {subfolder['name']} (ID: {subfolder['id']})")
            subfolder_id = subfolder['id']

            # Query to find all jpgs in the specified folder
            jpg_query = f"'{subfolder_id}' in parents and mimeType='image/jpeg'"

            # Get a dictionary with key 'files' and a list of dicts with 'id' and 'name' keys
            results = (
                service.files().list(q=jpg_query, pageSize=PAGE_SIZE, fields="files(id, name)").execute()
            )

            # Get the list of dicts with 'id' and 'name' keys for each jpg
            jpgs_in_subfolder = results.get("files", [])
            if not jpgs_in_subfolder:
                print(f"  --> {subfolder['name']} does not contain jpgs. Skipping.")
                continue

            for jpg in jpgs_in_subfolder:
                file_name = jpg["name"]
                file_id = jpg["id"]

                # --- Optional: Skip files that don't match our filter ---
                # NOTE(review): STRINGS_IN_FILE_NAMES is not defined in the
                # cells shown here -- confirm where it comes from.
                if STRINGS_IN_FILE_NAMES and not any(s in file_name for s in STRINGS_IN_FILE_NAMES):
                    print(f"  - Skipping '{file_name}' as it does not match filter.")
                    continue

                print(f"  - Processing '{file_name}'...")

                # 1. Download the JPG file into memory
                request = service.files().get_media(fileId=file_id)
                # file-like object that receives the downloaded content
                fh = io.BytesIO()
                downloader = MediaIoBaseDownload(fh, request)
                done = False
                while done is False:
                    _status, done = downloader.next_chunk()

                # 2. Convert the JPG image to a PDF using Pillow
                fh.seek(0)  # Go to the beginning of the in-memory file
                with Image.open(fh) as img:
                    img = ImageOps.exif_transpose(img)  # Correct orientation using EXIF data
                    if img.mode in ("RGBA", "P"):  # Convert to RGB if necessary
                        img = img.convert("RGB")
                # create a BytesIO object to save the PDF
                pdf_bytes = io.BytesIO()
                # convert and save as PDF
                img.save(pdf_bytes, format="PDF")

                # 3. Upload the converted PDF back to the same folder
                pdf_bytes.seek(0)  # Go to the beginning of the in-memory file
                # The original extension is kept, e.g. "photo.jpeg.pdf".
                pdf_filename = file_name + ".pdf"
                media = MediaIoBaseUpload(pdf_bytes, mimetype="application/pdf")

                (
                    service.files()
                    .create(
                        body={"name": pdf_filename, "parents": [subfolder_id]},
                        media_body=media,
                        fields="id",
                    )
                    .execute()
                )
                print(f"    ...Success! Uploaded as '{pdf_filename}'.")

    except HttpError as error:
        print(f"An error occurred: {error}")
    except FileNotFoundError:
        print(f"ERROR: The service account file was not found.")
        print(f"Please make sure the file '{SERVICE_ACCOUNT_FILE}' is in the correct directory.")




In [78]:
# Run the JPEG-to-PDF conversion defined in the cell above.
main()

Looking for folders in folder: 1dLMsDpWrn3mr3UPNX1Pp4Mem9iEf9W__

Found 41 folders:
Processing folder: Angela Matos Pino (ID: 16lD8ghXV0SQg8_TfK1GZrGCoEw9jPtzW)
  - Processing 'miembrecia.jpeg'...
An error occurred: <HttpError 403 when requesting https://www.googleapis.com/upload/drive/v3/files?fields=id&alt=json&uploadType=multipart returned "Service Accounts do not have storage quota. Leverage shared drives (https://developers.google.com/workspace/drive/api/guides/about-shareddrives), or use OAuth delegation (http://support.google.com/a/answer/7281227) instead.". Details: "[{'message': 'Service Accounts do not have storage quota. Leverage shared drives (https://developers.google.com/workspace/drive/api/guides/about-shareddrives), or use OAuth delegation (http://support.google.com/a/answer/7281227) instead.', 'domain': 'usageLimits', 'reason': 'storageQuotaExceeded'}]">
