## Two independed models for energy and class prediction

Learn two models separately and make prediction

In [44]:
#!pip install keras==2.0.8

In [2]:
#!pip install tensorflow==1.13.1

In [3]:
import warnings
warnings.filterwarnings("ignore")
import os, sys
from IPython.display import display
from IPython.display import Image as _Imgdis
from PIL import Image, ImageFilter
import numpy as np
from time import time, sleep

import numpy as np 
import pandas as pd

import random
import tensorflow as tf
from scipy import ndimage
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img

from random import shuffle
import cv2


from keras.optimizers import SGD
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

from sklearn.model_selection import train_test_split

from math import sqrt

from keras import backend as K




Using TensorFlow backend.


In [4]:
tf.random.set_random_seed(1)
np.random.seed(37)
random.seed(1254)

os.environ['PYTHONHASHSEED']=str(42)


### Get list of files

In [5]:
folder1 = "./data/idao_dataset/train/ER"
folder2 = "./data/idao_dataset/train/NR"

imagefiles = [f for f in os.listdir(folder1) if os.path.isfile(os.path.join(folder1, f))]
imagefiles_2 = [f for f in os.listdir(folder2) if os.path.isfile(os.path.join(folder2, f))]


imagefiles.extend(imagefiles_2)


In [6]:
def get_point_and_int(filename):
    
    """
    This function finds the brightest area of images for cropping (it was an experiment).For train data
    """
    
    if "ER" in filename:
        boolfolder = True
    else:
        boolfolder = False

    if boolfolder:
        image = cv2.imread(folder1 + "/" + filename)
    else:
        image = cv2.imread(folder2 + "/" + filename)
    orig = image.copy()
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    (minVal, maxVal, minLoc, maxLoc) = cv2.minMaxLoc(gray)


    gray = cv2.GaussianBlur(gray, (31, 31), 0)
    (minVal, maxVal, minLoc, maxLoc) = cv2.minMaxLoc(gray)
    
    return maxVal, maxLoc


def get_point_and_int_inf(filename, link):
    
    """
    This function finds the brightest area of images for cropping (it was an experiment).For test data/inference
    """
    
    image = cv2.imread(link + "/" + filename)

    orig = image.copy()
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    (minVal, maxVal, minLoc, maxLoc) = cv2.minMaxLoc(gray)

    gray = cv2.GaussianBlur(gray, (31, 31), 0)
    (minVal, maxVal, minLoc, maxLoc) = cv2.minMaxLoc(gray)
    
    return maxVal, maxLoc


def sly_round(x):
    """
    Round to the nearest energy class
    """
    values = [1, 3, 6, 10, 20, 30]
    
    delta_values = [abs(x - i) for i in values]
    return values[delta_values.index(min(delta_values))]
    
def math_round(x):
    return sly_round(x)

### From imgs to numpy array

In [7]:
train_files = []
y_train = []
y_train_class = []
i = 0

for _file in imagefiles:
    train_files.append(_file)
    label_in_file = _file.replace("__", "_").replace("_He", "").split("_")
    y_train.append(int(label_in_file[5]))
    y_train_class.extend([0 if label_in_file[4] =="NR" else 1 ])

print("Files in train_files: {0}".format(len(train_files)))


image_width = 110
image_height = 110

channels = 1

dataset = np.ndarray(shape=(len(train_files), image_height, image_width, channels),
                     dtype=np.float32)

i = 0

for _file in train_files:
    
    if "ER" in _file:
        img = load_img(folder1 + "/" + _file) 
    elif "NR" in _file:
        img = load_img(folder2 + "/" + _file) 
        
    # Get the brightest area of image for cropping (not used now, just one of experiments)
    #inten, (w, h) = get_point_and_int(_file)

    
    #load central part of image
    img = img.crop((160, 160, 416, 416))
    img.thumbnail((110, 110))

    # To numpy array
    
    x = img_to_array(img)[:,:,0]  
    
    x =np.expand_dims(x, axis=2)

    
    x = (x - 128.0) / 128.0
    dataset[i] = x
    i += 1
    if i % 1000 == 0:
        print("{0} images in array".format(i))
print("All images to array!")

Files in train_files: 13536
1000 images in array
2000 images in array
3000 images in array
4000 images in array
5000 images in array
6000 images in array
7000 images in array
8000 images in array
9000 images in array
10000 images in array
11000 images in array
12000 images in array
13000 images in array
All images to array!


### Splitting into training, validation and test

In [8]:
X_train, X_test, y_train, y_test = train_test_split(dataset, y_train, test_size=0.25, random_state=33)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.95, random_state=33)

print("Train set size: {0}, Val set size: {1}, Test set size: {2}".format(len(X_train), len(X_val), len(X_test)))

Train set size: 10152, Val set size: 3215, Test set size: 169


In [9]:
y_train = np.array(y_train)
y_val = np.array(y_val)
y_test = np.array(y_test)

In [10]:
datagen = ImageDataGenerator(
        rotation_range=90,
        shear_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest')

datagen.fit(X_train)


val_datagen = ImageDataGenerator(
        rotation_range=90,
        shear_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest')

val_datagen.fit(X_val)

### The First step CNN (Regresion)

In [11]:
def get_regression_model():
    model = Sequential()
    model.add(Conv2D(32, (3, 3), input_shape=(110, 110, 1), activation='relu', data_format='channels_last'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Conv2D(32, (3, 3), activation='relu' ))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Flatten())
    model.add(Dense(64))
    model.add(Activation('linear'))
    model.add(Dropout(0.15))
    model.add(Dense(1))
    
    return model

In [12]:
model  = get_regression_model()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [15]:
# stop if no progress more

early_stop = EarlyStopping(monitor='val_loss', patience=7, verbose=1, mode='auto')

#save checkpoints

mcp_save = ModelCheckpoint('regr_model_best_st2_aug.h5', save_best_only=True, monitor='val_loss', mode='min')

#sheduler for decreasing lr
reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1, epsilon=1e-4, mode='min')

In [16]:
model.compile(loss='mae', optimizer='adam')

model.fit_generator(datagen.flow(X_train, y_train, batch_size=64), 
                    samples_per_epoch=len(X_train),  
                    nb_epoch= 50, 
                    validation_data=val_datagen.flow(X_val, y_val, batch_size=16),
                    nb_val_samples=len(X_val),
                    verbose=1,
                    callbacks=[early_stop, mcp_save, reduce_lr_loss])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 00020: reducing learning rate to 0.0005000000237487257.
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 00033: reducing learning rate to 0.0002500000118743628.
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 00041: reducing learning rate to 0.0001250000059371814.
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 00047: reducing learning rate to 6.25000029685907e-05.
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fd9ac0f3f90>

### Test prediction and generation a part of submission.

In [17]:
model.load_weights("regr_model_best_st2_aug.h5")

In [18]:
predictions = model.predict(X_test)

print("\n Mean absolute error on test set: {0:.2f} ".format(model.evaluate(X_test, y_test)))

 Mean absolute error on test set: 0.47 


In [19]:
def get_data(link):
    
    """
    Loading data from test
    """
    
    imagefiles = [f for f in os.listdir(link) if os.path.isfile(os.path.join(link, f))]

    print("Files in directory: {}".format(len(imagefiles)))

    image_width = 110
    image_height = 110

    channels = 1

    dataset = np.ndarray(shape=(len(imagefiles), image_height, image_width, channels),
                         dtype=np.float32)

    i = 0
    for _file in imagefiles:

        img = load_img(link + "/" + _file) 

        #inten, (w, h) = get_point_and_int_inf(_file, link)

        img = img.crop((160, 160, 416, 416))

        img.thumbnail((110, 110))

        x = img_to_array(img)[:,:,0]  


        x =np.expand_dims(x, axis=2)

        # Normalize
        x = (x - 128.0) / 128.0
        dataset[i] = x
        i += 1
        if i % 1000 == 0:
            print("{0} images to array".format(i))
    print("All images to array!")
    
    return imagefiles, dataset

In [20]:
link_private = "./data/idao_dataset/private_test"
link_public = "./data/idao_dataset/public_test"


list_files_private, dataset_private = get_data(link_private)
list_files_public, dataset_public = get_data(link_public)
    

Files in directory: 15058
1000 images to array
2000 images to array
3000 images to array
4000 images to array
5000 images to array
6000 images to array
7000 images to array
8000 images to array
9000 images to array
10000 images to array
11000 images to array
12000 images to array
13000 images to array
14000 images to array
15000 images to array
All images to array!
Files in directory: 1502
1000 images to array
All images to array!


In [21]:
predictions_private = model.predict(dataset_private)
predictions_public = model.predict(dataset_public)

In [22]:
data_pri = {'id': [i.replace(".png", "") for i in list_files_private],
        'classification_predictions': 1, 
        'regression_predictions': [math_round(i[0]) for i in predictions_private ],
        'is_public': 0,
        'orig':[i[0] for i in predictions_private ]
        
       } 
df_pri = pd.DataFrame.from_dict(data_pri)


data_pub = {'id': [i.replace(".png", "") for i in list_files_public],
        'classification_predictions': 1, 
        'regression_predictions': [math_round(i[0]) for i in predictions_public ],
        'is_public': 1,
        'orig':[i[0] for i in predictions_public ] 
        
       }

df_pub = pd.DataFrame.from_dict(data_pub)

In [23]:
df_pub.head(n=5)

Unnamed: 0,id,classification_predictions,regression_predictions,is_public,orig
0,aa9c74d71c591eedc4ce370c3b144eb9a23c8ba0,1,1,1,1.014202
1,0a02a0f55e306cf2a181fea35451611075a6f729,1,20,1,20.636044
2,488caca71da221550bddeeb9b1cbbf5de348d537,1,20,1,19.560398
3,a01b6a560e0bdc54394289952a70058b9d849a82,1,20,1,19.981997
4,7b17af34c2fd0825819234e0bfc69dccd934809b,1,3,1,3.074516


In [24]:
df_pub.append(df_pri).drop(['is_public', 'orig'], axis=1).to_csv("sum1stage.csv", index=None)

## Classification

In [25]:
X_train, X_test, y_train, y_test = train_test_split(dataset, y_train_class, test_size=0.23, random_state=33)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.8, random_state=33)

print("Train set size: {0}, Val set size: {1}, Test set size: {2}".format(len(X_train), len(X_val), len(X_test)))

Train set size: 10422, Val set size: 2492, Test set size: 622


In [26]:
y_train = np.array(y_train)
y_val = np.array(y_val)
y_test = np.array(y_test)

In [27]:
datagen = ImageDataGenerator(
        rotation_range=90,
        shear_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest')

datagen.fit(X_train)

val_datagen = ImageDataGenerator(
        rotation_range=90,
        shear_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest')

val_datagen.fit(X_val)

In [28]:
def auc(y_true, y_pred):
    
    """
    AUC for Keras
    """
    
    auc = tf.metrics.auc(y_true, y_pred)[1]
    K.get_session().run(tf.local_variables_initializer())
    return auc

In [29]:
def get_classification_model():
    model = Sequential()
    model.add(Conv2D(32, (3, 3), input_shape=(110, 110, 1), activation='relu', data_format='channels_last'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Conv2D(32, (3, 3), activation='relu' ))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Flatten()) 
    model.add(Dense(64))
    model.add(Activation('linear'))
    model.add(Dropout(0.15))
    model.add(Dense(1, activation='sigmoid'))    
    return model

In [30]:
model = get_classification_model()

In [31]:
early_stop = EarlyStopping(monitor='val_loss', patience=7, verbose=1, mode='auto')

mcp_save = ModelCheckpoint('classif_model_best_st2_aug.h5', save_best_only=True, monitor='val_loss', mode='min')
reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1, epsilon=1e-4, mode='min')


In [32]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[auc])

model.fit_generator(datagen.flow(X_train, y_train, batch_size=64), 
                    samples_per_epoch=len(X_train),  
                    nb_epoch=50, 
                    validation_data=val_datagen.flow(X_val, y_val, batch_size=16),
                    nb_val_samples=len(X_val),
                    verbose=1,
                    callbacks=[early_stop, mcp_save, reduce_lr_loss])

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 00009: reducing learning rate to 0.0005000000237487257.
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 00017: reducing learning rate to 0.0002500000118743628.
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 00024: reducing learning rate to 0.0001250000059371814.
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 00029: reducing learning rate to 6.25000029685907e-05.
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 00032: reducing learning rate to 3.125000148429535e-05.
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 00037: reducing learning rate to 1.5625000742147677e-05.
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 4

<keras.callbacks.History at 0x7fd9f9e8d050>

In [33]:
predictions = model.predict(X_test)

print("AUC: {0:.2f}".format(model.evaluate(X_test, y_test)[1]))



In [34]:
list_files_private, dataset_private = get_data(link_private)
list_files_public, dataset_public = get_data(link_public)

Files in directory: 15058
1000 images to array
2000 images to array
3000 images to array
4000 images to array
5000 images to array
6000 images to array
7000 images to array
8000 images to array
9000 images to array
10000 images to array
11000 images to array
12000 images to array
13000 images to array
14000 images to array
15000 images to array
All images to array!
Files in directory: 1502
1000 images to array
All images to array!


In [35]:
model.load_weights("classif_model_best_st2_aug.h5")

In [36]:
predictions_private = model.predict(dataset_private)
predictions_public = model.predict(dataset_public)

In [37]:
data_pri_class = {'id': [i.replace(".png", "") for i in list_files_private],
        'classification_predictions': [i[0] for i in  predictions_private], #[1 if i[0] >THRESHOLD   else 0 for i in predictions_private], 
        'regression_predictions': 1, 
                  'is_public': 0
        
       } 
df_pri_class = pd.DataFrame.from_dict(data_pri_class)


data_pub_class = {'id': [i.replace(".png", "") for i in list_files_public],
        'classification_predictions': [i[0] for i in predictions_public],#[1 if i[0] >THRESHOLD  else 0 for i in predictions_public],  
        'regression_predictions': 1,
                  'is_public': 1
        
       }


df_pub_class = pd.DataFrame.from_dict(data_pub_class)

In [38]:
df_stage_2 = df_pri_class.append(df_pub_class)

In [39]:
df_stage_2.to_csv("sum2stage.csv", index=None)

## Merge

In [40]:
df_stage_1 = pd.read_csv("sum1stage.csv")
df_stage_2 = pd.read_csv("sum2stage.csv")


In [41]:
df_final = pd.merge(
         df_stage_2[['id', 'classification_predictions']], df_stage_1[['id', 'regression_predictions']], how="left", 
         left_on="id", right_on ='id')

In [42]:
df_final.head()

Unnamed: 0,id,classification_predictions,regression_predictions
0,69de1a8b9c376ed67adc8913e3d79140480e2b60,1.0,6
1,07d6bc4eaf59ecdb54a77ac3ae7cc3e70523503b,0.999954,3
2,bc4f06d7e104d67b81346f9a15f40e75decfe129,0.998291,3
3,8e1b23a7ba7f13aa0a2e0adaaed4232bea9ee13d,0.02024,1
4,06bb5a1992e5c2bbd54530e15d8b4f6abf00746e,0.719759,10


In [43]:
df_final.to_csv("sumallstage.csv", index=None)