# CNN

## Setup

In [183]:
# https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly

from numpy.random import seed

import os, datetime
import numpy as np
import pandas as pd
import json
import re 

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix

from keras.utils import to_categorical
from keras.models import Sequential, Model
from keras import models
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, GlobalAveragePooling2D, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model, Model
from keras.applications.vgg16 import VGG16
from keras.applications.vgg19 import VGG19
from keras.applications.inception_v3 import preprocess_input

from tensorflow.keras.utils import plot_model
import tensorflow.keras as K

import logging, os 
import random
import tensorflow as tf
import matplotlib.pyplot as plt

import config as cf

# Set seeds. Note that using a GPU can still introduce randomness.
# (also not taking into account tensorflow randomness)
seed(42)

ModuleNotFoundError: No module named 'my_classes'

## Functions

In [204]:
SURVEY_NAME = 'DHS'
SATELLITE = 'l8'
BAND = 'B1'
TARGET_VAR = 'wealth_index'

In [205]:
# Load survey data
survey_df = pd.read_csv(os.path.join(cf.DROPBOX_DIRECTORY, 'Data', SURVEY_NAME, 'FinalData', 'Individual Datasets', 'survey_socioeconomic.csv'))

# Remove if target variable is NA
survey_df = survey_df.dropna(axis=0, subset=[TARGET_VAR])

# To integer
target_list = np.round(survey_df[TARGET_VAR]).tolist()
survey_df[TARGET_VAR] = [int(x) for x in target_list]
survey_df[TARGET_VAR].value_counts()

2    12935
3    11016
4     9534
5     4166
1     4057
Name: wealth_index, dtype: int64

In [206]:
# List of npy files
NPY_PATH = os.path.join(cf.GOOGLEDRIVE_DIRECTORY, 
             'Data', 
             SURVEY_NAME, 
             'FinalData', 
             'Individual Datasets',
            'cnn_' + SATELLITE,
             'npy')

NPY_FILES = os.listdir(NPY_PATH)
reg = re.compile(r'^' + BAND + '_')                  
NPY_FILES = list(filter(reg.search, NPY_FILES))  

In [207]:
# Subset survey
uids = [file.replace('.npy', '').replace(BAND + '_', '') for file in NPY_FILES]

survey_df = survey_df[survey_df['uid'].isin(uids)]
survey_df['band_uid'] = BAND + '_' + survey_df['uid']

In [208]:
# Survey to train/test

survey_df['traintest'] = np.random.choice(a = ['train', 'test'], 
                                      p = [0.8, 0.2],
                                      size = survey_df.shape[0])

In [209]:
# Partition Dictionary
train_uids = survey_df[survey_df.traintest == 'train']['band_uid'].tolist()
test_test = survey_df[survey_df.traintest == 'test']['band_uid'].tolist()

partition = {'train': train_uids, 
             'test': test_test}

In [210]:
survey_df_train = survey_df[survey_df.traintest == 'train']
labels = [{key: val} for key, val in zip(survey_df.band_uid, survey_df.wealth_index)]

In [214]:
np_example = np.load(os.path.join(NPY_PATH, 'BRGB_PK201700000001.npy'))
np_example.shape

(224, 224, 1)

## CNN Model

In [194]:
# https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly

class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, list_IDs, labels, batch_size=32, dim=(32,32,32), n_channels=1,
                 n_classes=10, shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        y = np.empty((self.batch_size), dtype=int)

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Store sample
            X[i,] = np.load(os.path.join(NPY_PATH, ID + '.npy'))

            # Store class
            y[i] = self.labels[ID]

        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

In [195]:
height = 224
width = 224
num_classes = 5

# Parameters
params = {'dim': (32,32,32),
          'batch_size': 64,
          'n_classes': 6,
          'n_channels': 1,
          'shuffle': True}

# Generators
training_generator = DataGenerator(partition['train'], labels, **params)
validation_generator = DataGenerator(partition['test'], labels, **params)

# Design model
#model = Sequential()
#[...] # Architecture
#x = Dense(100, activation='relu', name='fc1')(x)
#model.compile()

#### Base model
input_shape = (height, width, 3)
base_model = VGG16(weights='imagenet', include_top=False, input_shape=input_shape, pooling = "max")

for layer in base_model.layers:
    layer.trainable = False

#### Model Customization
# We take the last layer of our the model and add it to our classifier
last = base_model.layers[-1].output
x = Flatten()(last)
x = Dense(100, activation='relu', name='fc1')(x)
x = Dropout(0.3)(x)
x = Dense(num_classes, activation='softmax', name='predictions')(x)
model = Model(base_model.input, x)
# We compile the model
model.compile()

# Train model on dataset
model.fit_generator(generator=training_generator,
                    validation_data=validation_generator,
                    use_multiprocessing=True,
                    workers=6)


ValueError: could not broadcast input array from shape (224,224,9) into shape (32,32,32,1)