# Using "numpy.memmap" to map an array directly to a file on disk instead of holding the whole array in memory.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import glob
import cv2
import random
import os.path
from datetime import datetime
import pandas as pd

# Seed NumPy's and the standard library's random generators with the same
# fixed value so that runs are repeatable.
SEED = 2016
np.random.seed(SEED)
random.seed(SEED)


## Define image resize

In [None]:
# Run configuration.
# 'image_shape': size every image is resized to before being fed to the CNN.
# Larger images generally help, but the CNN has to grow with them.
conf = {'image_shape': (32, 32)}


## File paths to training and additional samples

In [None]:
# Directories to scan for images: the three training classes (Type_1..Type_3)
# followed by the test directory. A timestamp is printed before and after.
print(str(datetime.now()))

filepaths = [
    '../input/train/Type_1/',
    '../input/train/Type_2/',
    '../input/train/Type_3/',
    '../input/test/',
]

print(str(datetime.now()))


## Get file listing

In [None]:
# Collect every .jpg under each directory in `filepaths`, keeping the
# directory order. A timestamp is printed before and after the scan.
print(str(datetime.now()))

allFiles = []

for filepath in filepaths:
    # glob returns matches in file-system order, which can differ between
    # machines and runs; sorting keeps the row order of anything built from
    # this list (including an on-disk cache) reproducible.
    files = sorted(glob.glob(filepath + '*.jpg'))
    allFiles.extend(files)

print(str(datetime.now()))


## Example of how to use memmap

In [None]:
# Build (or reopen) a disk-backed matrix of flattened images with numpy.memmap.
#   X : memmap of shape (row, col), one flattened resized image per row
#   y : numpy array of class labels (1, 2 or 3) for the labelled files, in row order
# Shuffling stays disabled: `y` only lines up with the leading rows of `X`
# while the labelled (Type_N) files come before the unlabelled ones in allFiles.
#random.shuffle(allFiles)
print(str(datetime.now()))

# Read one sample only to learn how many values a flattened, resized image has.
sample = cv2.imread(allFiles[0])
if sample is None:
    # cv2.imread signals an unreadable file by returning None instead of raising.
    raise IOError('Could not read image: ' + allFiles[0])
sample = cv2.resize(sample, conf['image_shape'])

#row = len(allFiles)
# limiting to 10 rows so the kaggle kernel doesn't time out on us
row = 10
col = sample.size

# NOTE: despite the .npy suffix, np.memmap writes a raw buffer without the
# .npy header, so f_image must be reopened with np.memmap, not np.load.
f_image='./images.npy'
f_targets='./targets.npy'

# Class label for each training sub-directory. Files whose parent directory is
# not listed here (the test images) have no label.
labels_by_dir = {'Type_1': 1, 'Type_2': 2, 'Type_3': 3}

# NOTE(review): cv2.imread normally yields 8-bit pixels, so dtype='uint8' would
# make the file much smaller; dtype='int' is kept so that a cache written by an
# earlier run is still interpreted correctly.
if os.path.isfile(f_image) and os.path.isfile(f_targets):
    # Both cache files exist: map the image matrix read-only straight from disk
    # instead of loading it into memory, and load the saved labels.
    X = np.memmap(f_image, dtype='int', mode='r', shape=(row, col))
    y = np.load(f_targets)
else:
    # (Re)create the disk-backed matrix; rows are written through to the file
    # rather than being accumulated in memory.
    X = np.memmap(f_image, dtype='int', mode='w+', shape=(row, col))
    y = []

    # limiting to `row` files so the kaggle kernel doesn't time out on us
    for i, path in enumerate(allFiles[:row]):
        image = cv2.imread(path)
        if image is None:
            raise IOError('Could not read image: ' + path)
        image = cv2.resize(image, conf['image_shape'])

        X[i] = image.flatten()

        # Take the label from the parent directory name rather than from a
        # fixed character offset or a row-count threshold, so it stays correct
        # for any path prefix and any value of `row`.
        label = labels_by_dir.get(os.path.basename(os.path.dirname(path)))
        if label is not None:
            y.append(label)

    # Make sure every row has reached the file before anything reopens it.
    X.flush()

    y = np.array(y)
    np.save(f_targets, y)

print(str(datetime.now()))
