In [None]:
import matplotlib.pyplot as plt
import os
import cv2
import random
import numpy as np
from PIL import Image

In [None]:
# Authenticate the Colab session with the user's Google account.
# Presumably required so the gcloud / gsutil cells further down can reach the
# project's storage bucket -- confirm if running outside Colab.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Google Cloud project used by this notebook, and the Cloud Storage bucket that
# the `gsutil cp` cells upload the zipped image sets to.
project_id = 'mammography-198911'
bucket_name = 'pneumonia'

# Make project_id the active project for subsequent gcloud commands.
!gcloud config set project {project_id}

In [None]:
def sample_files(root_dir, num_files=3):
  """Display a few random images from every image folder under ``root_dir``.

  Each entry of ``root_dir`` that is a directory and has no "." in its name is
  treated as an image folder. Up to ``num_files`` files are drawn at random
  from each folder and shown with matplotlib, titled with the shape of the
  array returned by ``plt.imread``. Once all folders have been visited, the
  combined number of files they contain is printed.

  Args:
    root_dir: Directory whose immediate sub-directories hold the images.
    num_files: Maximum number of images to display per sub-directory.

  Returns:
    None. Output is the displayed figures and the printed total.
  """
  total_files = 0

  # List the folders here; the function must not depend on a `dirs` variable
  # existing in some outer scope.
  for dir_ in os.listdir(root_dir):
    dir_path = os.path.join(root_dir, dir_)
    # Skip loose files (anything with an extension, or any non-directory).
    if "." in dir_ or not os.path.isdir(dir_path):
      continue

    files = os.listdir(dir_path)
    total_files += len(files)

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request for sparsely populated (or empty) folders.
    sample_size = min(num_files, len(files))
    for file_name in random.sample(files, sample_size):
      image = plt.imread(os.path.join(dir_path, file_name))
      plt.imshow(image)
      plt.title(str(image.shape))
      plt.show()

  print("Total Files:", total_files)
  
def _should_discard(image):
  """Return True when a decoded image fails any of clean_files' checks."""
  # Black-and-white images decode to an array without a channel axis.
  if len(image.shape) != 3:
    return True

  height, width = image.shape[0], image.shape[1]
  # Too small to be useful.
  if height < 190 or width < 160:
    return True
  # Oddly shaped: at least 1.5x taller than wide.
  if height >= (width * 1.5):
    return True
  # Wrong proportions: wider than tall.
  if width > height:
    return True
  # Black-and-white stored as colour: any two channels are identical.
  if (np.array_equal(image[:, :, 0], image[:, :, 1])
      or np.array_equal(image[:, :, 0], image[:, :, 2])
      or np.array_equal(image[:, :, 1], image[:, :, 2])):
    return True
  # Smeared border: row 0 repeats at row 10, or column 0 repeats at column 10.
  # Indexing 10 is safe because the size check above guarantees >= 160 px.
  if np.array_equal(image[0, :, :], image[10, :, :]):
    return True
  return bool(np.array_equal(image[:, 0, :], image[:, 10, :]))


def clean_files(root_dir, resize=True):
  """Delete unusable images under ``root_dir`` and optionally shrink the rest.

  Each entry of ``root_dir`` that is a directory and has no "." in its name is
  treated as an image folder. Every file in it is read with ``plt.imread`` and
  deleted from disk if it is black-and-white, smaller than 190x160, at least
  1.5x taller than wide, wider than tall, or has a repeated border row/column.
  Deletion is permanent; the files are modified in place.

  Args:
    root_dir: Directory whose immediate sub-directories hold the images.
    resize: When true (the default), surviving images taller than 300 px are
      rescaled to a height of 220 px (width scaled proportionally) and saved
      over the original file. When false, surviving files are left untouched.

  Returns:
    None.
  """
  for dir_ in os.listdir(root_dir):
    dir_path = os.path.join(root_dir, dir_)
    # Skip loose files (anything with an extension, or any non-directory).
    if "." in dir_ or not os.path.isdir(dir_path):
      continue

    for file in os.listdir(dir_path):
      path = os.path.join(dir_path, file)
      image = plt.imread(path)

      if _should_discard(image):
        os.remove(path)
        continue

      # Shrink large survivors to save space, but only when the caller asked.
      # NOTE(review): Image.fromarray expects an integer array; plt.imread is
      # documented to return floats for PNG input -- confirm inputs are JPEG.
      if resize and image.shape[0] > 300:
        new_height = 220
        new_width = int((new_height / image.shape[0]) * image.shape[1])
        resized = Image.fromarray(image).resize((new_width, new_height))
        resized.save(path)

In [None]:
# Download wiki_crop.tar from the IMDB-WIKI dataset server and unpack it into
# the current working directory (the next cell expects a wiki_crop/ folder).
!wget https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/static/wiki_crop.tar
!tar -xf wiki_crop.tar

In [None]:
# Filter the extracted Wiki images in place (clean_files deletes rejected files
# from disk), then show a few random remaining images per sub-directory.
clean_files("wiki_crop")
sample_files("wiki_crop")

In [None]:
# Bundle the wiki_crop/ tree into wiki_images.zip (-r recursive, -q quiet),
# then list the working directory so the new archive shows up in the output.
!zip -rq wiki_images.zip wiki_crop/
!ls -al

In [None]:
# Upload the archive to the bucket. Note the destination object is named
# wiki_images2.zip, not wiki_images.zip like the local file.
!gsutil cp wiki_images.zip gs://{bucket_name}/wiki_images2.zip

In [None]:
# Download imdb_crop.tar from the IMDB-WIKI dataset server and unpack it into
# the current working directory (the next cell expects an imdb_crop/ folder).
!wget https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/static/imdb_crop.tar
!tar -xf imdb_crop.tar

In [None]:
# Filter the extracted IMDB images in place (clean_files deletes rejected files
# from disk), then show a few random remaining images per sub-directory.
clean_files("imdb_crop")
sample_files("imdb_crop")

In [None]:
# Bundle the imdb_crop/ tree into imdb_images.zip (-r recursive, -q quiet),
# then list the working directory so the new archive shows up in the output.
!zip -rq imdb_images.zip imdb_crop/
!ls -al

In [None]:
# Upload the archive to the bucket. Note the destination object is named
# imdb_images3.zip, not imdb_images.zip like the local file.
!gsutil cp imdb_images.zip gs://{bucket_name}/imdb_images3.zip