# Build a CSV manifest of the chest X-ray training images stored in Google Cloud Storage

In [1]:
import os
import numpy as np
import pandas as pd
from google.cloud import storage

In [2]:
# Create the Cloud Storage client that the later cells use to reach the bucket.
# NOTE(review): no project or credentials are passed here, so they presumably
# come from the environment — confirm before running on a new machine.
storage_client = storage.Client()

In [3]:
# Name of the GCS bucket that is listed and later written to: change this to your own
bucket_name = 'pneumonia-chest-x-ray'

In [5]:
# Fetch the bucket object for `bucket_name`; the listing and the final upload
# both go through this handle.
bucket = storage_client.get_bucket(bucket_name)

In [50]:
# List the objects whose names start with the training-set prefix.
# NOTE(review): the returned listing looks like a one-shot iterator, so this
# cell presumably has to be re-run before the next cell is re-run — confirm.
blobs = bucket.list_blobs(prefix='chest_xray/chest_xray/train/')

In [51]:
# Collect the object name of every listed blob into a plain Python list
blob_list = [entry.name for entry in blobs]

In [52]:
# Number of object names collected under the training prefix (before any filtering)
len(blob_list)

5219

In [53]:
# Peek at the first two collected object names
for position in (0, 1):
    print(blob_list[position])

chest_xray/chest_xray/train/.DS_Store
chest_xray/chest_xray/train/NORMAL/.DS_Store


In [54]:
# Drop the '.DS_Store' entries from the listing and turn every remaining
# object name into a full 'gs://<bucket>/<object>' URI.
#
# The URI prefix is built from `bucket_name` so that it always matches the
# bucket that was actually listed, even after `bucket_name` has been changed.
# Entries that already start with the prefix are kept as they are, so
# re-running this cell on the same kernel does not prefix them a second time.
#
# For larger datasets, it may be better to simply delete the '.DS_Store'
# objects from Google Cloud Storage instead of filtering them out here.

bucket_prefix = 'gs://{}/'.format(bucket_name)

# A single pass builds the cleaned list; the original order is preserved.
blob_list = [
    name if name.startswith(bucket_prefix) else bucket_prefix + name
    for name in blob_list
    if '.DS_Store' not in name
]

In [55]:
# Peek at the first two entries after filtering and prefixing
for position in (0, 1):
    print(blob_list[position])

gs://pneumonia-chest-x-ray/chest_xray/chest_xray/train/NORMAL/IM-0115-0001.jpeg
gs://pneumonia-chest-x-ray/chest_xray/chest_xray/train/NORMAL/IM-0117-0001.jpeg


In [56]:
# Wrap the list in a NumPy array, then build a single-column Pandas DataFrame from it
uri_array = np.array(blob_list)
data_pd = pd.DataFrame(uri_array)

In [57]:
# Write the frame to a local data.csv with no header row and no index column.
# `header` and `index` are documented as boolean options, so pass False
# explicitly instead of relying on None being treated as falsy.
data_pd.to_csv("data.csv", header=False, index=False)

In [58]:
# Create a handle for the destination object 'data.csv' in the bucket;
# the upload itself is performed by the following cell.
output_blob = bucket.blob('data.csv')

In [59]:
# Upload the local file data.csv to Google Cloud Storage through `output_blob`
output_blob.upload_from_filename('data.csv')

# Finish