Provide `gdrive_path` per the following examples

Examples:
* `//My Drive/` instructs the script to copy everything in your Google Drive
* `//DeptName/` instructs the script to copy everything in the DeptName Shared Google Drive
* `//My Drive/path/to/files` instructs the script to copy everything in the `/path/to/files` directory of your Google Drive

In [None]:
# Basic Google Drive info
# Format: '//<drive name>/<optional/path/inside/the/drive>'.
# A value that does not start with '//' is treated as a path inside "My Drive".
gdrive_path = '//My Drive/'

A few things to consider when deciding which Bucket to copy files into:
* It is recommended that a separate Bucket be used for each Google Drive
* If deemed appropriate, you may use one Bucket for shared Google Drives


In [None]:
# Basic Bucket information
# project_id is exported as GOOGLE_CLOUD_PROJECT and used to create the Cloud Storage client.
# bucket_name is the destination bucket that files are uploaded into.
project_id = 'YOUR_GCP_PROJECT_ID' # e.g. bborie-sandbox
bucket_name = 'YOUR_BUCKET_NAME_HERE' # e.g. bborie-sandbox

In [None]:
# Basic Script info
# num_workers: number of worker threads started to copy files.
# dry_run: when True, each file is only logged; nothing is downloaded or uploaded.
num_workers = 8
dry_run = False

# Do not edit anything after this text block. Just execute the remaining cells in order.

In [None]:
from collections import deque
import io
import logging
import os
import os.path as osp
import queue
import tempfile
import threading
import time
from types import SimpleNamespace

from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
import google.auth
from google.cloud import storage
from google.colab import auth, drive
import requests

In [None]:
# Authenticate into GCP to access your Bucket
# NOTE(review): the worker threads later call google.auth.default(); presumably
# that picks up the credentials established here - confirm.
auth.authenticate_user()

In [None]:
# Expose the chosen project through the environment for the Google client libraries.
os.environ['GOOGLE_CLOUD_PROJECT'] = project_id

# (connect, read) timeout pair, both in seconds.
# NOTE(review): presumably meant for the `timeout=` argument of HTTP calls - confirm.
CONNECT_TIMEOUT = 10
READ_TIMEOUT = 15 * 60
TIMEOUT = (CONNECT_TIMEOUT, READ_TIMEOUT)

# 10 MiB; used both when streaming exported files and as the upload chunk size.
CHUNK_SIZE = 10 * 1024 ** 2

# Drive reports folders with this MIME type.
FOLDER_MIME_TYPE = 'application/vnd.google-apps.folder'

# Native Google formats cannot be downloaded as-is; each one listed here is
# exported to the MIME type it maps to. Unlisted Google formats are rejected.
MIME_TYPE_MAP = {
    'application/vnd.google-apps.document':
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'application/vnd.google-apps.drawing':
        'image/png',
    'application/vnd.google-apps.presentation':
        'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    'application/vnd.google-apps.spreadsheet':
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    'application/vnd.google-apps.script':
        'application/vnd.google-apps.script+json',
}

# Work queue shared between the directory walker and the copy workers.
file_queue = queue.Queue()
# Serialises per-thread client construction.
LOCK = threading.Lock()

# Root logger at INFO so progress messages are shown.
logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [None]:
class UnsupportedMimeType(Exception):
    """Raised when a Google-native file type has no export mapping."""

def clean_value(v):
    """Escape a string for use inside a single-quoted Drive query literal.

    Backslashes are escaped first so that the backslash inserted in front of
    each single quote is not doubled again. Without this, a value containing
    (or ending in) a backslash would corrupt the surrounding quoted literal.

    Args:
        v: Raw string (e.g. a file name or an ID).

    Returns:
        The string with every backslash doubled and every single quote
        prefixed by a backslash.
    """
    return v.replace('\\', '\\\\').replace("'", "\\'")

def iterfiles(service, name=None, is_folder=None, parent=None, drive_id=None, order_by='folder,name,createdTime'):
    """Yield every non-trashed Drive file matching the given filters.

    Args:
        service: Drive API client; `service.files().list(...).execute()` is called on it.
        name: If given, only files with exactly this name.
        is_folder: True for folders only, False for non-folders only, None for both.
        parent: If given, only direct children of this folder ID.
        drive_id: If truthy, restrict the listing to that shared drive.
        order_by: Value passed through as the `orderBy` request parameter.

    Yields:
        The file dicts from each page of the listing, following
        `nextPageToken` until the last page.
    """
    clauses = ['trashed=false']
    if name is not None:
        clauses.append(f"name='{clean_value(name)}'")
    if is_folder is not None:
        operator = '=' if is_folder else '!='
        clauses.append(f"mimeType {operator} '{FOLDER_MIME_TYPE}'")
    if parent is not None:
        clauses.append(f"'{clean_value(parent)}' in parents")

    request_kwargs = {
        'pageToken': None,
        'orderBy': order_by,
        'q': ' and '.join(clauses),
    }
    if drive_id:
        request_kwargs.update(
            driveId=drive_id,
            corpora='drive',
            includeItemsFromAllDrives=True,
            supportsAllDrives=True,
        )

    while True:
        page = service.files().list(**request_kwargs).execute()
        yield from page['files']
        if 'nextPageToken' not in page:
            return
        request_kwargs['pageToken'] = page['nextPageToken']

def walk(service, top='root', by_name=False, drive_id=None):
    """Walk a Drive folder tree depth-first, in the spirit of `os.walk`.

    Args:
        service: Drive API client.
        top: Folder ID to start from, or a folder name when `by_name` is True.
        by_name: Look `top` up by name; exactly one folder must match,
            otherwise the tuple unpacking raises ValueError.
        drive_id: Shared drive ID, forwarded to the listing calls.

    Yields:
        `(path, folder, dirs, files)` where `path` is a tuple of folder names
        from the start folder down, `folder` is the folder's file dict, and
        `dirs` / `files` are lists of child file dicts.

    Raises:
        ValueError: If `top` (looked up by ID) is not a folder.
    """
    if by_name:
        matches = iterfiles(service, name=top, is_folder=True, drive_id=drive_id)
        (top,) = matches
    else:
        get_kwargs = {'fileId': top}
        if drive_id:
            get_kwargs['supportsAllDrives'] = True
        top = service.files().get(**get_kwargs).execute()
        if top['mimeType'] != FOLDER_MIME_TYPE:
            raise ValueError(f'not a folder: {top!r}')

    pending = [((top['name'],), top)]
    while pending:
        path, folder = pending.pop()

        dirs = []
        files = []
        for entry in iterfiles(service, parent=folder['id'], drive_id=drive_id):
            if entry['mimeType'] == FOLDER_MIME_TYPE:
                dirs.append(entry)
            else:
                files.append(entry)

        yield path, folder, dirs, files

        # Pushed in reverse so that the first sub-folder is popped (visited) next.
        for sub in reversed(dirs):
            pending.append((path + (sub['name'],), sub))

def get_drive_service():
    """Build and return a Drive v3 API client with discovery caching disabled."""
    drive_service = build('drive', version='v3', cache_discovery=False)
    return drive_service

def get_storage_client(project):
    """Return a Cloud Storage client created for the given project ID."""
    storage_client = storage.Client(project=project)
    return storage_client

def init_workers(num_workers=8, thread_kwargs=None):
    """Create and start the daemon threads that run `copy_file`.

    Args:
        num_workers: How many threads to start.
        thread_kwargs: Keyword arguments passed to `copy_file` in every
            thread; a falsy value means no keyword arguments.

    Returns:
        The list of started `threading.Thread` objects.
    """
    worker_kwargs = thread_kwargs if thread_kwargs else {}

    file_workers = []
    for _ in range(num_workers):
        worker = threading.Thread(target=copy_file, daemon=True, kwargs=worker_kwargs)
        file_workers.append(worker)

    # All threads are constructed before any of them is started.
    for worker in file_workers:
        worker.start()

    return file_workers

def copy_file(project, bucket, dry_run=False):
    """Worker loop: take files off `file_queue` and copy each into the bucket.

    Each thread builds its own credentials, Drive client and bucket handle
    (under `LOCK`). The loop ends when a `None` sentinel is dequeued. A
    failure on one file is logged and the loop carries on with the next.

    Args:
        project: GCP project ID for the storage client.
        bucket: Name of the destination bucket.
        dry_run: Forwarded to `_copy_file`.
    """
    with LOCK:
        credentials, _ = google.auth.default()
        drive_service = get_drive_service()
        storage_client = get_storage_client(project)
        gcs_bucket = storage_client.bucket(bucket)

    while True:
        try:
            item = file_queue.get()
            if item is None:
                break

            _copy_file(item, drive_service, credentials, gcs_bucket, dry_run=dry_run)
        except Exception as exc:
            log('error', f'Error copying: {item.drivePath} => gs://{gcs_bucket.name}/{item.bucketPath}')
            log('error', exc)
        finally:
            # Runs for the sentinel too, so `file_queue.join()` can return.
            file_queue.task_done()

def _copy_file(f, service, creds, bucket, dry_run=False):
    """Download one Drive file into a temporary file and upload it to the bucket.

    Google-native files (MIME type containing `.google-apps.`) are exported
    to the format given by `MIME_TYPE_MAP` via their export link; every other
    file is downloaded as-is.

    Args:
        f: File record with `id`, `mimeType`, `drivePath` and `bucketPath`
            attributes. `mimeType` is overwritten with the export MIME type
            when the file is exported.
        service: Drive API client.
        creds: google-auth credentials used to authorise the export download.
        bucket: Destination Cloud Storage bucket object.
        dry_run: When True, only log the copy; nothing is transferred.

    Raises:
        UnsupportedMimeType: If a Google-native file has no export mapping.
    """
    if dry_run:
        # Nothing is transferred, so no temporary file is needed.
        log('info', f'Copied: {f.id} {f.drivePath} => gs://{bucket.name}/{f.bucketPath}')
        return

    with tempfile.NamedTemporaryFile() as fh:
        #
        # download
        #

        if '.google-apps.' in f.mimeType:
            mime_type = MIME_TYPE_MAP.get(f.mimeType)
            if not mime_type:
                raise UnsupportedMimeType(f.mimeType)

            f.mimeType = mime_type

            # The export link is fetched outside the API client, so the bearer
            # token has to be attached by hand; make sure it exists and is fresh.
            if not creds.valid:
                from google.auth.transport.requests import Request as AuthRequest
                creds.refresh(AuthRequest())

            metadata = service.files().get(
                fileId=f.id,
                supportsAllDrives=True,
                fields='exportLinks',
            ).execute()
            with requests.get(
                metadata['exportLinks'][mime_type],
                headers={
                    'Authorization': f'Bearer {creds.token}'
                },
                stream=True,
                timeout=TIMEOUT,
            ) as response:
                response.raise_for_status()
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    fh.write(chunk)
        else:
            # supportsAllDrives is needed for files that live in a shared drive.
            req = service.files().get_media(fileId=f.id, supportsAllDrives=True)
            downloader = MediaIoBaseDownload(fh, req)
            done = False
            while not done:
                _, done = downloader.next_chunk()

        # The upload re-opens the file by name, so buffered bytes must be on disk.
        fh.flush()

        #
        # upload
        #

        blob = bucket.blob(f.bucketPath, chunk_size=CHUNK_SIZE)
        blob.upload_from_filename(fh.name)
        blob.content_type = f.mimeType
        blob.patch()

        log('info', f'Copied: {f.id} {f.drivePath} => gs://{bucket.name}/{f.bucketPath}')

def log(level, msg):
    """Log `msg` through the logger method named `level`, prefixed with the thread ID.

    Args:
        level: Name of a logger method, e.g. 'info' or 'error'.
        msg: Message (or object) to log.
    """
    emit = getattr(logger, level)
    emit(f'({threading.get_ident()}) {msg}')

def copy_from_gdrive_to_bucket(project, drive, bucket, workers, dry_run):
    """Copy every file under a Google Drive path into a Cloud Storage bucket.

    The Drive path is resolved to a folder first; only then are the worker
    threads started, so an invalid path fails fast without leaving idle
    threads behind. The folder tree is walked and every file is queued for
    the workers.

    Args:
        project: GCP project ID.
        drive: Drive path of the form `//<drive name>/<optional/path>`. A
            value without the leading `//` is taken as a path in "My Drive".
        bucket: Name of the destination bucket.
        workers: Number of worker threads.
        dry_run: When True, files are only logged, not copied.

    Raises:
        ValueError: If the shared drive or any folder of the path is not found.
    """
    if not drive.startswith('//'):
        drive = '//' + osp.join('My Drive', drive.lstrip('/'))
        logger.info(f'Assuming "My Drive": {drive}')

    # partition() tolerates a missing trailing slash (e.g. '//My Drive').
    drive_name, _, drive_prefix = drive[2:].partition('/')
    drive_parts = [part for part in drive_prefix.split('/') if part]

    service = get_drive_service()

    drive_id = None
    drive_parent = 'root'
    if drive_name.strip().lower() != 'my drive':
        # go look up driveId
        response = service.drives().list(q=f"name='{clean_value(drive_name)}'").execute()
        shared_drives = response.get('drives', [])
        if not shared_drives:
            raise ValueError(f'shared drive not found: {drive_name!r}')
        drive_id = shared_drives[0]['id']
        # A shared drive's ID is also the ID of its root folder.
        drive_parent = drive_id

    # Descend one path component at a time; when several folders share a
    # name, the last one listed wins.
    for part in drive_parts:
        folder_id = None
        for f in iterfiles(service, name=part, parent=drive_parent, is_folder=True, drive_id=drive_id):
            folder_id = f['id']
        if folder_id is None:
            raise ValueError(f'folder not found in {drive!r}: {part!r}')
        drive_parent = folder_id

    file_workers = init_workers(
        workers,
        thread_kwargs={
            'project': project,
            'bucket': bucket,
            'dry_run': dry_run,
        }
    )

    # Recently used bucket paths; Drive allows several files with the same
    # name in one folder, so repeats get a ' (n)' suffix.
    dq = deque(maxlen=1000)
    for path, _folder, _dirs, files in walk(service, top=drive_parent, by_name=False, drive_id=drive_id):
        for f in files:
            f = SimpleNamespace(**f)
            f.drivePath = osp.join(drive, *path[1:], f.name)
            bucket_path = f.bucketPath = osp.join(drive[2:], *path[1:], f.name)
            if bucket_path in dq:
                count = dq.count(bucket_path)
                f.bucketPath += f' ({count})'
            dq.append(bucket_path)
            file_queue.put(f)

    # Report progress until the queue is nearly drained.
    while True:
        qsize = file_queue.qsize()
        logger.info(f'Estimate to process: {qsize}')
        if qsize < 5:
            break
        time.sleep(10)

    # gracefully shut down workers: one sentinel per thread
    for _ in file_workers:
        file_queue.put(None)

    file_queue.join()

    logger.info('All done! Bye...')

In [None]:
# Run the copy using the values configured in the cells at the top.
copy_from_gdrive_to_bucket(project_id, gdrive_path, bucket_name, num_workers, dry_run)