In [1]:
import os
import google
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload

# API scope and credentials file for the Google Drive client.
# NOTE(review): SCOPES is not referenced by any later cell shown in this
# notebook; the credentials cell passes its own scope list - confirm intent.
SCOPES = ['https://www.googleapis.com/auth/drive.file']

# Ask for this credentials.json; which file you need depends on the service account you are using.
# NOTE(review): CREDS_FILE is not referenced by any later cell shown in this
# notebook; the credentials cell loads 'credentials_2.json' - confirm intent.
CREDS_FILE = 'credentials.json'



In [7]:
# The service account must have editor access to this parent folder.
# Placeholder value: replace it with the ID of the Drive folder that will
# receive the uploads before running the upload cell.
PARENT_FOLDER_ID = "FILL ME "

# Load the service-account credentials that are later passed to GoogleDriveUploader.
# NOTE(review): the file name and scope are hard-coded here instead of using
# CREDS_FILE / SCOPES from the configuration cell, and they differ from those
# values - confirm which pair is intended.
creds = service_account.Credentials.from_service_account_file(
    'credentials_2.json',
    scopes=['https://www.googleapis.com/auth/drive'] 
)

In [12]:
class GoogleDriveUploader:
    """Upload a local directory tree to Google Drive through the Drive v3 API.

    Instance state:
      * ``service``         - Drive API client created by ``instantiate_service``.
      * ``created_folders`` - maps a local folder path to the ID of the Drive
        folder created for it by this instance.
      * ``uploaded_files``  - local paths whose upload succeeded.
      * ``failed_files``    - local paths whose upload raised an exception.
    """

    def __init__(self, credentials):
        """Build the Drive client from ``credentials`` and reset the bookkeeping.

        Args:
            credentials: credentials object forwarded to
                ``googleapiclient.discovery.build``.

        Raises:
            Exception: whatever building the client raises; it is logged and
                re-raised unchanged.
        """
        try:
            # Use the argument, not a notebook-level global, so the class works
            # with whatever credentials the caller passes in.
            self.instantiate_service(credentials)
            self.created_folders = {}
            self.uploaded_files = []
            self.failed_files = []
        except Exception as e:
            print(f"Error building drive service {e}")
            raise

    def instantiate_service(self, credentials):
        """Create the Drive v3 client and store it on ``self.service``."""
        self.service = build("drive", "v3", credentials=credentials)

    def set_uploaded_files(self, new_uploaded_files):
        """Replace the list of already-uploaded local paths."""
        self.uploaded_files = new_uploaded_files

    def set_failed_files(self, new_failed_files):
        """Replace the list of local paths whose upload failed."""
        self.failed_files = new_failed_files

    def set_created_folders(self, new_created_folders):
        """Replace the local-path -> Drive-folder-ID mapping."""
        self.created_folders = new_created_folders

    def search_files(self, folder_id):
        """List the direct children of a Drive folder, following pagination.

        Args:
            folder_id: ID of the Drive folder to list.

        Returns:
            A ``(files, names)`` tuple: ``files`` is a list of
            ``{"id": ..., "name": ...}`` dicts and ``names`` the matching list
            of names. If a request fails, the error is printed and whatever was
            collected up to that point is returned.
        """
        # Initialised before the try block so the return below is always valid.
        files, names = [], []
        page_token = None
        page_number = 1
        try:
            while True:
                # pylint: disable=maybe-no-member
                print(f"Next page {page_number}")
                response = (
                    self.service.files()
                    .list(
                        q=f"'{folder_id}' in parents",
                        spaces="drive",
                        fields="nextPageToken, files(id, name)",
                        pageToken=page_token,
                    )
                    .execute()
                )
                page_files = [
                    {"id": item.get("id"), "name": item.get("name")}
                    for item in response.get("files", [])
                ]
                files.extend(page_files)
                names.extend(item["name"] for item in page_files)

                # No token in the response means this was the last page.
                page_token = response.get("nextPageToken")
                if page_token is None:
                    print("Finished searching files")
                    break
                page_number += 1
        except Exception as error:
            # Best effort: report the failure and return the partial listing.
            print(f"An error occurred: {error}")

        return files, names

    def create_folder_if_not_exists(self, parent_folder_id, directory, folder_name):
        """Return the Drive folder ID for a local folder, creating it if needed.

        Only the in-memory ``created_folders`` cache is consulted; Drive itself
        is not queried for an existing folder with the same name.

        Args:
            parent_folder_id: ID of the Drive folder that will contain the new one.
            directory: local directory that contains ``folder_name``.
            folder_name: name of the folder, used both locally and on Drive.

        Returns:
            The ID of the cached or newly created Drive folder.
        """
        # os.path.join yields the same key as plain concatenation when
        # ``directory`` already ends with a separator, and a correct one otherwise.
        cache_key = os.path.join(directory, folder_name)
        created_folder_id = self.created_folders.get(cache_key)
        if created_folder_id is not None:
            return created_folder_id

        # The folder is not cached yet, so create it on Drive.
        file_metadata = {
            'name': folder_name,
            'parents': [parent_folder_id],
            'mimeType': 'application/vnd.google-apps.folder'
        }
        folder = self.service.files().create(body=file_metadata, fields='id').execute()
        print(f'Carpeta creada con ID: {folder["id"]}')
        self.created_folders[cache_key] = folder['id']
        return folder['id']

    def upload_file(self, complete_name, file_name, parent_folder_id):
        """Upload one local file into a Drive folder, skipping known uploads.

        Args:
            complete_name: local path of the file to upload.
            file_name: name given to the file on Drive.
            parent_folder_id: ID of the destination Drive folder.

        Returns:
            None. On success ``complete_name`` is appended to
            ``uploaded_files``; on any exception it is appended to
            ``failed_files`` and the error is printed instead of raised.
        """
        # Membership test on a list is a linear scan; the attribute stays a list
        # because callers read it and replace it through set_uploaded_files.
        if complete_name in self.uploaded_files:
            print(f"already uploaded - skipping file {complete_name}")
            return

        try:
            file_metadata = {
                "name": file_name,
                "parents": [parent_folder_id]
            }
            media = MediaFileUpload(complete_name)

            new_file = (
                self.service.files()
                .create(body=file_metadata, media_body=media, fields="id")
                .execute()
            )
            print(f'file_name {file_name} - File ID: {new_file.get("id")}')
            self.uploaded_files.append(complete_name)
        except Exception as error:
            self.failed_files.append(complete_name)
            print(f"An error occurred: {error} with file_name {file_name}")

    def upload_folder(self, directory, file_names, parent_folder_id):
        """Recursively upload the given entries of ``directory`` to Drive.

        Args:
            directory: local directory containing ``file_names``; a trailing
                separator is optional.
            file_names: names of the entries inside ``directory`` to upload.
            parent_folder_id: ID of the Drive folder that receives them.

        Returns:
            ``self.uploaded_files``, the cumulative list of uploaded local paths.
        """
        # Iterate over the entries in a deterministic (sorted) order.
        for file_name in sorted(file_names):
            complete_name = os.path.join(directory, file_name)
            if os.path.isdir(complete_name):
                new_parent_folder_id = self.create_folder_if_not_exists(parent_folder_id, directory, file_name)
                self.upload_folder(complete_name + "/", os.listdir(complete_name), new_parent_folder_id)
            else:
                self.upload_file(complete_name, file_name, parent_folder_id)

        # A file that failed earlier but has since been uploaded is no longer a failure.
        self.failed_files = list(set(self.failed_files).difference(set(self.uploaded_files)))
        return self.uploaded_files

In [13]:
# Create the uploader with the service-account credentials loaded earlier.
gdu = GoogleDriveUploader(creds)

In [22]:
# Progress check: (number of uploaded paths, number of failed paths) so far.
len(gdu.uploaded_files), len(gdu.failed_files)

(56304, 5)

In [None]:
# Local root whose entries are uploaded into PARENT_FOLDER_ID.
directory = "/home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/"

# os.listdir already returns a list, so no comprehension is needed (the old
# one also reused the builtin name `dir` as its loop variable).
file_names = os.listdir(directory)
print(f"About to upload {len(file_names)} files")
print(file_names)
uploaded = gdu.upload_folder(directory, file_names, PARENT_FOLDER_ID)

About to upload 10 files
['124-30-720x1280', '329', '149', '115-30-1280x720', 'video72', '76-30-640x280', '146', '147', '102', '28-30-1280x720-1']
already uploaded - skipping file /home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/102/00001.jpg
already uploaded - skipping file /home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/102/00002.jpg
already uploaded - skipping file /home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/102/00003.jpg
already uploaded - skipping file /home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/102/00004.jpg
already uploaded - skipping file /home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/102/00005.jpg
already uploaded - skipping file /home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/102/00006.jpg
already uploaded - skipping file /home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/102/00007.jpg
already uploaded - skipping file /home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/102/00008.j

In [29]:
# Same check after the upload run: (uploaded count, failed count).
len(gdu.uploaded_files), len(gdu.failed_files)

(66803, 0)

In [30]:
# Show the local paths whose upload is still recorded as failed.
gdu.failed_files

[]

## Save the list of uploaded files

In [31]:
import pandas as pd

In [32]:
# One row per uploaded local path; the single column keeps pandas' default label 0.
df = pd.DataFrame(gdu.uploaded_files)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66803 entries, 0 to 66802
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       66803 non-null  object
dtypes: object(1)
memory usage: 522.0+ KB


In [33]:
# Persist the uploaded paths; the row index is written too because `index` is left at its default.
df.to_csv('uploaded_file_names_AFFWILD2.csv', sep=',')

In [34]:
# Display the list returned by upload_folder (the whole list is echoed as cell output).
uploaded

['/home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/102/00001.jpg',
 '/home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/102/00002.jpg',
 '/home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/102/00003.jpg',
 '/home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/102/00004.jpg',
 '/home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/102/00005.jpg',
 '/home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/102/00006.jpg',
 '/home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/102/00007.jpg',
 '/home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/102/00008.jpg',
 '/home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/102/00009.jpg',
 '/home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/102/00010.jpg',
 '/home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/102/00011.jpg',
 '/home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/102/00012.jpg',
 '/home/nicolas/trabajo-profesional/datasets/Aff-Wild2/frames/10

### Now we list all local file names

In [23]:
# List the entries directly inside `directory` (os.listdir is not recursive).
# NOTE(review): `directory` comes from an earlier cell, and the count recorded
# below does not match the 10 entries printed for the Aff-Wild2 path, so it
# presumably pointed at a different folder when this cell ran - confirm.
local_file_names = os.listdir(directory)
print(f"We have {len(local_file_names)} local files")

We have 68096 local files


### Diff

In [24]:
# Names present in the local listing but absent from `names`.
# NOTE(review): `names` is not defined in any cell shown in this notebook -
# presumably the second value returned by GoogleDriveUploader.search_files();
# confirm and assign it explicitly so this cell runs on a fresh kernel.
local_names = set(local_file_names)
uploaded_names = set(names)
missing = local_names.difference(uploaded_names)
missing

{'amazed_man_9.jpg',
 'angry_husband_28.jpg',
 'angry_woman_224.jpg',
 'astound_mother_221.jpg',
 'awe_american_770.jpg',
 'awe_people_25.jpg',
 'crying_boy_504.jpg',
 'crying_lady_331.jpg',
 'distaste_actor_117.jpg',
 'heartbroken_boss_780.jpg'}

## Upload only missing

In [26]:
# Re-display the `missing` set computed in the Diff cell.
missing

{'amazed_man_9.jpg',
 'angry_husband_28.jpg',
 'angry_woman_224.jpg',
 'astound_mother_221.jpg',
 'awe_american_770.jpg',
 'awe_people_25.jpg',
 'crying_boy_504.jpg',
 'crying_lady_331.jpg',
 'distaste_actor_117.jpg',
 'heartbroken_boss_780.jpg'}

### Search files in a directory...
Use gdu.search_files()