In [5]:
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from google.cloud import storage
import os
import io
import pickle


def authenticate(
    token_path="token.pickle",
    client_secrets_file="/workspaces/BRIDGE/client_secret_oauth2_bridge.json",
    port=34817,
    scopes=None,
):
    """Return Google OAuth user credentials, reusing a cached token when possible.

    The credentials are looked up in this order:

    1. A still-valid token unpickled from ``token_path`` is returned as-is.
    2. An expired token that carries a refresh token is refreshed in place.
    3. Otherwise (no cache, unreadable cache, or a failed refresh) the
       installed-app consent flow is run on a local server.

    Credentials obtained through step 2 or 3 are written back to
    ``token_path`` for the next run.

    Args:
        token_path: File used to cache the pickled credentials.
        client_secrets_file: OAuth client-secrets JSON used for the consent flow.
        port: Local port the consent flow's redirect server listens on.
        scopes: OAuth scopes to request in the consent flow; ``None`` means
            the module-level ``SCOPES`` list.

    Returns:
        The credentials object (cached, refreshed, or newly authorised).
    """
    creds = None
    if os.path.exists(token_path):
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only point token_path at a file this function wrote itself.
        try:
            with open(token_path, "rb") as token:
                creds = pickle.load(token)
        except (pickle.UnpicklingError, EOFError):
            # Empty or truncated cache (e.g. an interrupted earlier write):
            # treat it as missing and re-authenticate instead of crashing.
            creds = None

    # Fast path: the cached token is still usable, nothing to rewrite.
    if creds and creds.valid:
        return creds

    refreshed = False
    if creds and creds.expired and creds.refresh_token:
        # Imported here so the cell's import list does not need to change.
        from google.auth.exceptions import RefreshError

        try:
            creds.refresh(Request())
            refreshed = True
        except RefreshError:
            # The refresh token was revoked or has expired; fall back to the
            # interactive flow below rather than failing outright.
            refreshed = False

    if not refreshed:
        # Resolve the default lazily so SCOPES may be defined after this function.
        requested_scopes = SCOPES if scopes is None else scopes
        flow = InstalledAppFlow.from_client_secrets_file(
            client_secrets_file, requested_scopes
        )
        creds = flow.run_local_server(port=port)

    # Save the credentials for future runs
    with open(token_path, "wb") as token:
        pickle.dump(creds, token)

    return creds


# OAuth scopes that authenticate() hands to InstalledAppFlow; read-only Drive
# access is the only scope this notebook requests. authenticate() reuses a
# cached token.pickle without comparing its scopes to this list, so delete
# that file after editing SCOPES to force a fresh consent flow.
SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]

In [12]:
# Constants
DRIVE_FOLDER_ID = "1lUo6mgTbdDhHZs8gOODXERycOm2mEVrn"
BUCKET_NAME = "bridge-ml-training"
SERVICE_ACCOUNT_FILE = "/workspaces/BRIDGE/bridge-service-account.json"


def shard_index(filename):
    """Map a two-letter shard name to its 1-based index.

    The part of ``filename`` before the first "." is read as a base-26
    number over a-z: "aa" -> 1, "ab" -> 2, ..., "az" -> 26, "ba" -> 27.

    Returns:
        The index, or ``None`` when that part is not exactly two lowercase
        ASCII letters. Such names would otherwise raise ``IndexError`` or
        yield an index that collides with a real shard.
    """
    stem = filename.split(".")[0]
    if len(stem) != 2 or not all("a" <= ch <= "z" for ch in stem):
        return None
    return (ord(stem[0]) - ord("a")) * 26 + (ord(stem[1]) - ord("a") + 1)


def list_folder_files(drive_service, folder_id):
    """Return the ``{"id", "name"}`` entries of every non-trashed file in a folder.

    Follows ``nextPageToken`` until the listing is exhausted, so entries
    beyond the first page of results are included.
    """
    entries = []
    page_token = None
    while True:
        response = (
            drive_service.files()
            .list(
                # Trashed files stay attached to their parent, so exclude them.
                q=f"'{folder_id}' in parents and trashed = false",
                fields="nextPageToken, files(id, name)",
                pageToken=page_token,
            )
            .execute()
        )
        entries.extend(response.get("files", []))
        page_token = response.get("nextPageToken")
        if not page_token:
            return entries


def download_drive_file(drive_service, file_id):
    """Download a Drive file into memory and return a BytesIO rewound to offset 0."""
    request = drive_service.files().get_media(fileId=file_id)
    buffer = io.BytesIO()
    downloader = MediaIoBaseDownload(buffer, request)
    done = False
    while not done:
        _, done = downloader.next_chunk()
    # Rewind so the caller's upload reads from the start of the data.
    buffer.seek(0)
    return buffer


# Setup clients
creds = authenticate()
drive = build("drive", "v3", credentials=creds)
gcs = storage.Client.from_service_account_json(SERVICE_ACCOUNT_FILE)
bucket = gcs.bucket(BUCKET_NAME)

# List files in folder
files = list_folder_files(drive, DRIVE_FOLDER_ID)

for file in files:
    print(f"Found file: {file['name']} ({file['id']})")
    if not file["name"].endswith(".csv"):
        continue

    # Convert filename (aa -> 1, ab -> 2, etc.)
    index = shard_index(file["name"])
    if index is None:
        print(f"Skipping {file['name']}: expected a two-letter name such as 'aa.csv'")
        continue
    print(f"Index: {index}")

    # Download from Drive
    content = download_drive_file(drive, file["id"])

    # Upload to GCS
    blob = bucket.blob(f"pretraining/{index}/data_{index}.csv")
    blob.upload_from_file(content)
    print(
        f"Uploaded {file['name']} → gs://{BUCKET_NAME}/pretraining/{index}/data_{index}.csv"
    )

Found file: bx.csv (15Qbck_oNrZ3Mo-kx1gJW_LsMWekFuc37)
Index: 50
Uploaded bx.csv → gs://bridge-ml-training/pretraining/50/data_50.csv
Found file: bq.csv (15i6OWL6tHzKQ3TOXI96hdCeNPRsDh5nW)
Index: 43
Uploaded bq.csv → gs://bridge-ml-training/pretraining/43/data_43.csv
Found file: bt.csv (1AOI6OCXWMsJ5a-ubwF2Z7H9UJapywVJj)
Index: 46
Uploaded bt.csv → gs://bridge-ml-training/pretraining/46/data_46.csv
Found file: bp.csv (1-4PzxSGDENDMxGsf74kmQu1v6PGoZNh-)
Index: 42
Uploaded bp.csv → gs://bridge-ml-training/pretraining/42/data_42.csv
Found file: br.csv (1R6MFynfg4IVBTAIQLOkGDlLfYI5EY45m)
Index: 44
Uploaded br.csv → gs://bridge-ml-training/pretraining/44/data_44.csv
Found file: bu.csv (1gfsCcgTQLZ9loyWgqu1Qt77sQltOwtDH)
Index: 47
Uploaded bu.csv → gs://bridge-ml-training/pretraining/47/data_47.csv
Found file: bw.csv (1sodxLiaQM8NN4V8d8_m3u9wT8MUirdil)
Index: 49
Uploaded bw.csv → gs://bridge-ml-training/pretraining/49/data_49.csv
Found file: bs.csv (1jadO_zTDwCZsYUwQyM7YdMDXDZqjvPlp)
Index: 