# Get images from Kaggle

## Install Kaggle API client

In [None]:
%%bash
# Install the Kaggle command-line client; the later cells in this notebook
# invoke it as the `kaggle` command.
pip install kaggle

## Move API key to where Kaggle expects it and change file permissions

In [None]:
%%bash
# Make sure you have your "kaggle.json" saved in /home/ec2-user/SageMaker/,
# which is where this cell moves it from.
# NOTE: the %%bash cell magic has to be the very first line of the cell,
# so this note lives below it rather than above.

# -p keeps the cell re-runnable when the directory already exists.
mkdir -p /home/ec2-user/.kaggle/

# Only move the key if it is still in the upload location, so re-running the
# cell after a successful first run does not fail.
if [ -f /home/ec2-user/SageMaker/kaggle.json ]; then
    mv /home/ec2-user/SageMaker/kaggle.json /home/ec2-user/.kaggle/
fi

# Restrict the key to owner read/write only. This fails loudly if the key is
# missing from both locations.
chmod 600 /home/ec2-user/.kaggle/kaggle.json

In [None]:
%%bash

# List the datasets published by the Kaggle user "paultimothymooney".
# NOTE(review): --min-size presumably filters by dataset size in bytes — confirm
# against the Kaggle CLI help.
kaggle datasets list --user paultimothymooney --min-size 1

## Download the breast cancer detection dataset

In [None]:
%%bash

# Download the dataset archive from Kaggle. The unzip cell that follows expects
# to find breast-histopathology-images.zip in the working directory.
kaggle datasets download paultimothymooney/breast-histopathology-images

## Unzip the dataset to the images directory and remove the original ZIP file

In [None]:
%%bash

# Extract the archive into images/. -q suppresses the per-file listing, which
# would otherwise flood the cell output with one line per image.
# The archive is deleted only when extraction succeeded (&&), so a failed or
# interrupted unzip does not throw away the download.
unzip -q breast-histopathology-images.zip -d images && rm -f breast-histopathology-images.zip

## Remove duplicated images

In [None]:
%%bash
# Delete the IDC_* entries under images/ (the duplicated images named in the
# section heading). Run through an explicit %%bash cell, like the other shell
# steps, instead of relying on IPython's automagic `rm` alias.
rm -rf images/IDC_*

## Reorganize files in the images directory into two subdirectories, corresponding to the two classes we have (0 and 1). This will be useful when converting images to RecordIO format

In [None]:
%%bash

# Create one destination directory per class. -p makes the cell safe to re-run
# when the directories already exist.
mkdir -p images/0 images/1

In [None]:
import os

In [None]:
def sort_images_by_class(root='images'):
    """Move every class-labelled PNG found under ``root`` into ``root/0`` or ``root/1``.

    A file is assigned to class 0 when its name ends with ``class0.png`` and to
    class 1 when its name ends with ``class1.png``. Any other file is left where
    it is, so stray non-image files are never mislabelled as class 1.

    Args:
        root: Directory to scan recursively; the class directories are created
            directly inside it if they do not exist yet.

    Returns:
        The number of files that were moved.
    """
    suffix_to_class = {'class0.png': '0', 'class1.png': '1'}
    class_dirs = set(suffix_to_class.values())
    for class_dir in class_dirs:
        os.makedirs(os.path.join(root, class_dir), exist_ok=True)

    moved = 0
    for path, subdirs, files in os.walk(root):
        if path == root:
            # Prune only the top-level destination directories so files that
            # were already sorted are not walked again. Nested directories that
            # happen to be called "0" or "1" are still visited.
            subdirs[:] = [d for d in subdirs if d not in class_dirs]
        for name in files:
            destination_class = next(
                (cls for suffix, cls in suffix_to_class.items() if name.endswith(suffix)),
                None,
            )
            if destination_class is None:
                # Not a class-labelled image: leave it in place.
                continue
            os.rename(os.path.join(path, name),
                      os.path.join(root, destination_class, name))
            moved += 1
    return moved


sort_images_by_class('images')

In [None]:
%%bash
# Delete everything inside images/ except the two class directories.

# extglob enables the !(pattern) "everything except" glob used by rm below.
shopt -s extglob

# Stop right here if images/ is missing; otherwise the rm below would run in
# the current working directory and delete unrelated files.
cd images || exit 1

rm -rf !("0"|"1")

# Explore the images

## Count the number of images for each class

In [None]:
%%bash
# Count the entries in the class-0 directory: ls emits one name per line when
# its output is piped, and wc -l tallies those lines.
cd images/0
ls | wc -l

In [None]:
%%bash
# Count the entries in the class-1 directory: ls emits one name per line when
# its output is piped, and wc -l tallies those lines.
cd images/1
ls | wc -l

## Show an image with no cancer (class 0)

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [None]:
# Read one class-0 sample from disk and render it inline.
sample_path = 'images/0/8975_idx5_x2851_y1201_class0.png'
img = mpimg.imread(sample_path)
imgplot = plt.imshow(img)
plt.show()

## Show an image with cancer (class 1)

In [None]:
# Read one class-1 sample from disk and render it inline.
sample_path = 'images/1/9075_idx5_x801_y801_class1.png'
img = mpimg.imread(sample_path)
imgplot = plt.imshow(img)
plt.show()

# Convert images to the RecordIO format and upload to S3

## Get the im2rec script from the Apache MXNet GitHub repository

In [None]:
%%bash
# Abort the cell on the first failing command so a failed download is not
# followed by a chmod on a missing or partial file.
set -e

# -O writes to a fixed file name and overwrites it on re-run. Without it, a
# second run saves im2rec.py.1 and the stale im2rec.py keeps being used.
# NOTE(review): the script is fetched from the moving `master` branch, so its
# contents may change between runs — consider pinning a release tag.
wget -O im2rec.py https://raw.githubusercontent.com/apache/incubator-mxnet/master/tools/im2rec.py
chmod +x im2rec.py

## Create the .lst files needed for creating the .rec files for train and test datasets

In [None]:
%%bash
# Build the .lst files for a 70/30 train/test split. Output names use the
# "images" prefix (images_train.lst, images_test.lst), which the packing cell
# that follows reads.
python im2rec.py --list --recursive \
    --train-ratio 0.7 --test-ratio 0.3 \
    images images/

## Create the .rec files for train and test datasets

In [None]:
%%bash
# Pack each split's .lst file into a RecordIO file, train first and then test,
# using the same flags for both.
for split in train test; do
    python im2rec.py --num-thread 4 --pass-through "images_${split}.lst" images
done

## Upload the train and test .rec files to S3

In [None]:
# Name of the S3 bucket that the `aws s3 cp` commands below upload the .rec files to.
bucket = "sagemaker-projects-demo"    # Update with your bucket name

In [None]:
!aws s3 cp images_train.rec s3://{bucket}/breast-cancer-detection/input/recordio/train/
!aws s3 cp images_test.rec s3://{bucket}/breast-cancer-detection/input/recordio/test/