In [4]:
from google.oauth2.service_account import Credentials
from googleapiclient.errors import HttpError
from google_auth_httplib2 import AuthorizedHttp
from googleapiclient.discovery import build
from collections import Counter
import numpy as np
import httplib2
import time
import sys
import os

# Directory from which the notebook builds its relative paths (e.g. the data folder)
root = "."

# This is a modified version of the function list_files in src.py - it returns a list instead of a dict
def list_files(service_file, folder_id, max_attempts=10, retry_delay=5):
    """
    List the names of the files in a Google Drive folder.

    The listing is paginated (1000 entries per request) and covers items from
    shared drives as well. If a request fails, the whole listing is restarted
    from the first page, up to `max_attempts` times.

    Parameters
    ----------
    service_file : str
        Path to the service account JSON file.
    folder_id : str
        ID of the Google Drive folder.
    max_attempts : int, optional
        Maximum number of times the full listing is attempted (default 10).
    retry_delay : float, optional
        Seconds to wait between two attempts (default 5).

    Returns
    -------
    list[str]
        Names of the non-trashed files whose parent is `folder_id`, in the
        order the API returned them. Duplicate names are preserved.

    Raises
    ------
    ValueError
        If `max_attempts` is smaller than 1.
    HttpError, OSError
        Re-raised when the last attempt fails.
    """
    # Validate first: with zero attempts the loop below would never run and
    # the function would silently return None.
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    creds = Credentials.from_service_account_file(service_file, scopes=["https://www.googleapis.com/auth/drive.readonly"])
    query = f"'{folder_id}' in parents and trashed=false"

    for attempt in range(1, max_attempts + 1):
        try:
            # A new HTTP client and service object are built on every attempt
            # (kept from the original; presumably so a retry starts from a
            # fresh connection).
            http = httplib2.Http(timeout=1800)
            authed_http = AuthorizedHttp(creds, http=http)
            service = build("drive", "v3", http=authed_http)

            # Reset the accumulator on every attempt so a failure in the
            # middle of the pagination cannot leave partial results behind.
            names, token = [], None
            while True:
                results = service.files().list(
                    q=query,
                    fields="nextPageToken, files(name)",
                    pageSize=1000,
                    pageToken=token,
                    supportsAllDrives=True,
                    includeItemsFromAllDrives=True
                ).execute()
                names.extend(f["name"] for f in results.get("files", []))
                token = results.get("nextPageToken")
                if not token:
                    break
            return names

        except (HttpError, OSError, TimeoutError):
            # Out of attempts: surface the last error to the caller.
            if attempt == max_attempts:
                raise
            time.sleep(retry_delay)

# Expected content of 'libraries': one zipped TSV per chunk of each Enamine REAL library,
# with the chunk index zero-padded to three digits
files_in_libraries = [
    f"Enamine_REAL_{library}_{chunk:03d}.tsv.zip"
    for library, n_chunks in (
        ("NaturalProducts", 52),  # 517,797,846 compounds
        ("Sample", 104),  # 1,035,352,518 compounds
        ("LeadLike", 838),  # 8,371,778,942 compounds
    )
    for chunk in range(n_chunks)
]

# Expected content of 'ecfps': for every library file, first all the "_SMILES_IDs.tsv.zip"
# files, then all the "_X.npz" files
files_in_ecfps = [
    library_file.replace(".tsv.zip", suffix)
    for suffix in ("_SMILES_IDs.tsv.zip", "_X.npz")
    for library_file in files_in_libraries
]

In [5]:
# Google Drive folder IDs for the raw libraries and for the computed ECFPs
FOLDER_ID_LIBRARY = "1bWrCvi5FXodxQ2S88nYLecHDjk5Jer8Y"
FOLDER_ID_ECFP = "1FBELagBf9hlKVgvkaZ8YF60jKRAmsHPo"

# Location of the service account key, inside the data directory
data_dir = os.path.join(root, "..", "data")
PATH_TO_SERVICE = os.path.join(data_dir, "service.json")

# File names currently present in each Drive folder (libraries first, then ECFPs)
FILES_IN_LIBRARIES = list_files(service_file=PATH_TO_SERVICE, folder_id=FOLDER_ID_LIBRARY)
FILES_IN_ECFP = list_files(service_file=PATH_TO_SERVICE, folder_id=FOLDER_ID_ECFP)

In [6]:
# LIBRARIES
# The Drive folder must hold exactly the expected file names. The checks are
# the same as before (same length, every expected name found, every found name
# expected) but use sets for the membership tests and report the offending
# names on failure.
expected_libraries = set(files_in_libraries)
found_libraries = set(FILES_IN_LIBRARIES)
assert expected_libraries <= found_libraries, (
    f"Missing from 'libraries': {sorted(expected_libraries - found_libraries)}"
)
assert found_libraries <= expected_libraries, (
    f"Unexpected in 'libraries': {sorted(found_libraries - expected_libraries)}"
)
# Same names but a different count means some name is listed more than once
assert len(FILES_IN_LIBRARIES) == len(files_in_libraries), (
    f"Expected {len(files_in_libraries)} files in 'libraries', found {len(FILES_IN_LIBRARIES)}"
)

In [7]:
# ECFP6s
# The Drive folder must hold exactly the expected file names. The checks are
# the same as before (same length, every expected name found, every found name
# expected) but use sets for the membership tests and report the offending
# names on failure.
expected_ecfps = set(files_in_ecfps)
found_ecfps = set(FILES_IN_ECFP)
assert expected_ecfps <= found_ecfps, (
    f"Missing from 'ecfps': {sorted(expected_ecfps - found_ecfps)}"
)
assert found_ecfps <= expected_ecfps, (
    f"Unexpected in 'ecfps': {sorted(found_ecfps - expected_ecfps)}"
)
# Same names but a different count means some name is listed more than once
assert len(FILES_IN_ECFP) == len(files_in_ecfps), (
    f"Expected {len(files_in_ecfps)} files in 'ecfps', found {len(FILES_IN_ECFP)}"
)

In [8]:
# Number of files found in the Drive 'ecfps' folder vs. number of files expected there
len(FILES_IN_ECFP) , len(files_in_ecfps)

(1988, 1988)

In [9]:
# Library files for which at least one expected ECFP output is not on Drive.
# Each expected ECFP name starts with the four "_"-separated fields of its library
# file, so those fields plus ".tsv.zip" give back the library file name.
# Names are kept in reverse sorted order.
ecfps_on_drive = set(FILES_IN_ECFP)  # set: constant-time membership tests
pending_libraries = sorted(
    {
        "_".join(ecfp_file.split("_")[:4]) + ".tsv.zip"
        for ecfp_file in files_in_ecfps
        if ecfp_file not in ecfps_on_drive
    },
    reverse=True,
)

# Translate each pending library file into its position in files_in_libraries.
# The lookup array is built once instead of once per pending file.
library_names = np.array(files_in_libraries)
TO_RUN = [np.where(library_names == library_file)[0][0] for library_file in pending_libraries]

# Space-separated indices (an empty line when nothing is pending)
print(" ".join(str(index) for index in TO_RUN))


