In [87]:
from google.cloud.storage import Client, transfer_manager

def download_bucket_with_transfer_manager(bucket_name, destination_directory="", workers=8, max_results=1000):
    """Download the blobs of a Cloud Storage bucket concurrently and report each outcome.

    Args:
        bucket_name: Name of the bucket to download from.
        destination_directory: Local directory the blobs are written under.
            The transfer manager joins it with each blob name, so blob names
            containing "/" end up in sub-directories.
        workers: Maximum number of concurrent download workers.
        max_results: Upper bound on the number of blobs listed (and therefore
            downloaded). Blobs beyond this limit are silently not downloaded,
            so pass a value at least as large as the bucket.

    Returns:
        None. One line is printed per blob: either the local path it was
        downloaded to or the exception that made the download fail.
    """
    # Local import keeps the function self-contained regardless of the order
    # in which notebook cells were executed.
    import os

    storage_client = Client()
    bucket = storage_client.bucket(bucket_name)

    blob_names = [blob.name for blob in bucket.list_blobs(max_results=max_results)]

    results = transfer_manager.download_many_to_path(
        bucket, blob_names, destination_directory=destination_directory, max_workers=workers
    )

    for name, result in zip(blob_names, results):
        # The results list is either `None` or an exception for each blob in
        # the input list, in order.
        if isinstance(result, Exception):
            print("Failed to download {} due to exception: {}".format(name, result))
        else:
            # Join with a path separator; plain string concatenation reported
            # paths such as ".../cloudCOVID-19_Radiography_Dataset/...".
            print("Downloaded {} to {}.".format(name, os.path.join(destination_directory, name)))

In [35]:
# Root directory of the project on the local machine; the other cells derive
# the raw_data/cloud paths from it.
# NOTE(review): the recorded output of this cell shows
# /home/remo/code/victor-ocn/covid19-project, not the value assigned here, so
# the cell was apparently edited after its last run — confirm which path is
# intended before re-running the notebook.
project_path='/home/remo/code/victor-ocn/covid19-project/covid19/Covid19'
print(project_path)

/home/remo/code/victor-ocn/covid19-project


In [27]:
# Make cloud folder:
import os

# Pure-Python equivalent of `mkdir -p`: creates missing parents, is a no-op
# when the folder already exists, and does not pass the path through a shell
# (so spaces or special characters in project_path are safe).
os.makedirs(os.path.join(project_path, "raw_data", "cloud"), exist_ok=True)

In [7]:
# Download the 'covid19_lewagon' bucket into the local cloud folder,
# listing at most 25500 blobs and using 8 download workers.
download_bucket_with_transfer_manager(
    bucket_name='covid19_lewagon',
    destination_directory=f"{project_path}/raw_data/cloud",
    workers=8,
    max_results=25500,
)

Downloaded COVID-19_Radiography_Dataset/COVID/COVID-1.png to /home/remo/code/victor-ocn/covid19-project/raw_data/cloudCOVID-19_Radiography_Dataset/COVID/COVID-1.png.
Downloaded COVID-19_Radiography_Dataset/COVID/COVID-10.png to /home/remo/code/victor-ocn/covid19-project/raw_data/cloudCOVID-19_Radiography_Dataset/COVID/COVID-10.png.
Downloaded COVID-19_Radiography_Dataset/COVID/COVID-100.png to /home/remo/code/victor-ocn/covid19-project/raw_data/cloudCOVID-19_Radiography_Dataset/COVID/COVID-100.png.
Downloaded COVID-19_Radiography_Dataset/COVID/COVID-1000.png to /home/remo/code/victor-ocn/covid19-project/raw_data/cloudCOVID-19_Radiography_Dataset/COVID/COVID-1000.png.
Downloaded COVID-19_Radiography_Dataset/COVID/COVID-1001.png to /home/remo/code/victor-ocn/covid19-project/raw_data/cloudCOVID-19_Radiography_Dataset/COVID/COVID-1001.png.
Downloaded COVID-19_Radiography_Dataset/COVID/COVID-1002.png to /home/remo/code/victor-ocn/covid19-project/raw_data/cloudCOVID-19_Radiography_Dataset/CO

In [41]:
# Transfer xray-dataset-covid-pneumonia dataset to Kaggle folder:
# Each command copies the *.jpg files of one source folder into the matching
# class folder of COVID-19_Radiography_Dataset.
# NOTE(review): the recorded output reports "no matches found" for the
# 'Covid19 (2)'/*.jpg glob, so that command copied nothing in the recorded
# run — check whether that folder holds files with a different extension.
# covid
!cp -r {project_path}/raw_data/cloud/xray-dataset-covid-pneumonia/'Covid19 (1)'/*.jpg  {project_path}/raw_data/cloud/COVID-19_Radiography_Dataset/COVID/
!cp -r {project_path}/raw_data/cloud/xray-dataset-covid-pneumonia/'Covid19 (2)'/*.jpg {project_path}/raw_data/cloud/COVID-19_Radiography_Dataset/COVID/
# pneumonia
!cp -r {project_path}/raw_data/cloud/xray-dataset-covid-pneumonia/Pneumonia/*.jpg {project_path}/raw_data/cloud/COVID-19_Radiography_Dataset/'Viral Pneumonia'/

zsh:1: no matches found: /home/remo/code/victor-ocn/covid19-project/raw_data/cloud/xray-dataset-covid-pneumonia/Covid19 (2)/*.jpg


In [43]:
import os

## Make variables with paths
# One source folder per class inside the downloaded radiography dataset.
main_dir = f"{project_path}/raw_data/cloud/COVID-19_Radiography_Dataset"
covid = os.path.join(main_dir, "COVID")
normal = os.path.join(main_dir, "Normal")
pneumonia = os.path.join(main_dir, "Viral Pneumonia")
opacity = os.path.join(main_dir, "Lung_Opacity")

## List file names:
# os.listdir returns entries in arbitrary order. Sorting makes the listings
# deterministic, so the seeded train/test split built from these lists
# selects the same files on every run and on every machine.
covid_names = sorted(os.listdir(covid))
normal_names = sorted(os.listdir(normal))
pneumonia_names = sorted(os.listdir(pneumonia))
opacity_names = sorted(os.listdir(opacity))


In [44]:
## Check folders:
# Report how many files each class holds and its share of the whole dataset.
total_names = sum(map(len, (covid_names, normal_names, pneumonia_names, opacity_names)))

for _label, _files in (
    ("covid_names", covid_names),
    ("normal_names", normal_names),
    ("pneumonia_names", pneumonia_names),
    ("opacity_names", opacity_names),
):
    _share = round(100*len(_files)/total_names, 1)
    print(f"{_label}: {len(_files)} - {_share}%")

print('')
print(f'Total: {total_names}')

covid_names: 4319 - 17.1%
normal_names: 10192 - 40.3%
pneumonia_names: 4763 - 18.8%
opacity_names: 6012 - 23.8%

Total: 25286


In [47]:
## Make class folders in train & test
cloud_dir = os.path.join(project_path,'raw_data','cloud')

# Parent folder that holds the train/test split.
data = os.path.join(cloud_dir, 'train_test')

# Create new directories for train and test, with one sub-folder per class:
train = os.path.join(data, 'train')
test = os.path.join(data, 'test')
#validation = os.path.join(data, 'validation')

folders = ['COVID19', 'NORMAL', 'OPACITY', 'PNEUMONIA']

# os.makedirs(..., exist_ok=True) creates every missing parent (including
# `data`) and is a no-op for folders that already exist. Unlike guarding on
# the existence of `train`/`test`, this also restores a class folder that is
# missing from an already existing split folder, and it needs no shell.
# To enable a validation split, uncomment `validation` above and add it to
# the tuple below.
for split_dir in (train, test):
    for folder in folders:
        os.makedirs(os.path.join(split_dir, folder), exist_ok=True)

In [86]:
## Check architecture:
# Print the directory tree (directories only) under the cloud folder.
!tree -d {cloud_dir}

[01;34m/home/remo/code/victor-ocn/covid19-project/raw_data/cloud[00m
├── [01;34mCOVID-19_Radiography_Dataset[00m
│   ├── [01;34mCOVID[00m
│   ├── [01;34mLung_Opacity[00m
│   ├── [01;34mNormal[00m
│   └── [01;34mViral Pneumonia[00m
├── [01;34mtrain_test[00m
│   ├── [01;34mtest[00m
│   │   ├── [01;34mCOVID19[00m
│   │   ├── [01;34mNORMAL[00m
│   │   ├── [01;34mOPACITY[00m
│   │   └── [01;34mPNEUMONIA[00m
│   └── [01;34mtrain[00m
│       ├── [01;34mCOVID19[00m
│       ├── [01;34mNORMAL[00m
│       ├── [01;34mOPACITY[00m
│       └── [01;34mPNEUMONIA[00m
└── [01;34mxray-dataset-covid-pneumonia[00m
    ├── [01;34mCovid19 (1)[00m
    ├── [01;34mCovid19 (2)[00m
    └── [01;34mPneumonia[00m

20 directories


In [59]:
## Organize dataset to model:
import os
import shutil

# NOTE(review): train_test_split is not imported in any visible cell —
# presumably sklearn.model_selection.train_test_split; confirm and import it
# explicitly so the notebook survives Restart & Run All.

# The three lists are aligned by position: source folder, destination class
# folder name, and the file names found in the source folder.
# The loop variables are deliberately not called `input`/`output`, which
# would shadow the `input` builtin for the rest of the kernel session.
for src_dir, class_folder, names in zip([covid, normal, opacity, pneumonia], folders, [covid_names, normal_names, opacity_names, pneumonia_names]):
    ## Split data in train & test (list of names):
    train_names, test_names = train_test_split(names, test_size=0.20, random_state=42)
    ## Copy each subset into its class folder under train / test:
    for split_dir, split_names in ((train, train_names), (test, test_names)):
        for image in split_names:
            shutil.copyfile(
                os.path.join(src_dir, image),
                os.path.join(split_dir, class_folder, image),
            )


In [84]:
## Check folders:
# Print the number of files per class folder, first for train, then for test.
for header, split_dir in (('# Train:', train), ('# Test:', test)):
    print(header)
    for folder in folders:
        n_files = len(os.listdir(os.path.join(split_dir, folder)))
        print(f"{folder}: {n_files}")

#len(os.listdir(covid))

# Train:
COVID19: 3455
NORMAL: 8153
OPACITY: 4809
PNEUMONIA: 3810
# Test:
COVID19: 864
NORMAL: 2039
OPACITY: 1203
PNEUMONIA: 953
