In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import keras,cv2,os
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, BatchNormalization, Activation
from keras.layers import Conv2D, MaxPool2D

from tqdm import tqdm_notebook,trange
from glob import glob 

In [None]:
# Root directory of the competition data.
# The trailing separator is required: the directories below (and the label /
# test paths built in later cells) are formed by plain string concatenation,
# so without it 'train/' would be glued onto the last path component.
path = "/home/kagglecomp/hcd/"
traindata = path + 'train/'
testdata = path + 'test/'

# File name of the submission CSV written at the end of the notebook.
submitname = "Submission_009.csv"

In [None]:
# Index the training images: one row per .tif file found in <path>/train.
# os.path.join is used so the result is correct whether or not `path` ends
# with a separator.
df = pd.DataFrame({'path': glob(os.path.join(path, 'train', '*.tif'))})
# The label table is joined on "id", so that key has to exist on the image
# side as well. Assumes every image is named "<id>.tif" with <id> matching
# the id column of train_labels.csv -- TODO confirm against the data set.
df['id'] = df['path'].map(lambda p: os.path.basename(p).split(".")[0])
labels = pd.read_csv(os.path.join(path, "train_labels.csv"))
df = df.merge(labels, on = "id")
df.head(3)

def load_data(N,df):
    """Read the first ``N`` images listed in ``df`` and return them with labels.

    Parameters
    ----------
    N : int
        Maximum number of samples to load. Values larger than ``len(df)`` are
        capped so that images and labels always have the same length.
    df : pandas.DataFrame
        Must provide a ``path`` column (image file paths) and a ``label``
        column (targets).

    Returns
    -------
    X : numpy.ndarray of shape (N, 96, 96, 3), dtype uint8
        The images as returned by ``cv2.imread`` (OpenCV decodes colour
        images in B, G, R channel order).
    y : numpy.ndarray of length N
        The ``label`` values of the same rows, in the same order.

    Raises
    ------
    IOError
        If ``cv2.imread`` cannot read one of the files.
    """
    N = min(N, len(df))
    # Work on positional slices so the result does not depend on the
    # DataFrame's index labels, and so X[i] / y[i] always belong together.
    paths = df['path'].values[:N]
    y = df['label'].values[:N]

    X = np.zeros([N,96,96,3],dtype=np.uint8)
    for i, img_path in enumerate(tqdm_notebook(paths, total=N)):
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise IOError("Could not read image: " + str(img_path))
        X[i] = img

    return X,y

# Load a subset of the training data for visual exploration.
N=20000
X,y = load_data(N=N,df=df)

# Show eight randomly chosen images together with their labels.
fig = plt.figure(figsize=(10, 4), dpi=150)
np.random.seed(100)  # fixed seed so the same eight samples are shown on every run
# Sample among the rows that were actually loaded.
for plotNr,idx in enumerate(np.random.randint(0,X.shape[0],8)):
    ax = fig.add_subplot(2, 8//2, plotNr+1, xticks=[], yticks=[])
    # The images come from cv2.imread (B, G, R channel order) while matplotlib
    # interprets the last axis as R, G, B: reverse it for display only.
    ax.imshow(X[idx][..., ::-1])
    ax.set_title('Label: ' + str(y[idx]))


# Class balance of the loaded samples: one bar per label value.
fig = plt.figure(figsize=(4, 2), dpi=150)
n_negative = (y == 0).sum()
n_positive = (y == 1).sum()
bar_positions = [1, 0]  # negatives are drawn at x=1, positives at x=0
ax = fig.gca()
ax.bar(bar_positions, [n_negative, n_positive])
ax.set_xticks(bar_positions)
ax.set_xticklabels(["Negative (N={})".format(n_negative),
                    "Positive (N={})".format(n_positive)])
ax.set_ylabel("# of smps");

# Split the loaded samples by class for the per-channel intensity histograms.
positive_smps = X[y == 1]
negative_smps = X[y == 0]

nr_of_bins = 256
fig,axs = plt.subplots(4,2,sharey=True,figsize=(8,8),dpi=150)

# Rows 0-2 show one colour channel each, row 3 all channels pooled.
# The images were read with cv2.imread, which stores channels as B, G, R,
# so red is channel index 2 and blue is channel index 0.
channel_of_row = [2, 1, 0]
for col, smps in enumerate((positive_smps, negative_smps)):
    for row, channel in enumerate(channel_of_row):
        axs[row,col].hist(smps[:,:,:,channel].flatten(), bins=nr_of_bins, density=True)
    axs[3,col].hist(smps.flatten(), bins=nr_of_bins, density=True)

# Column titles and row / axis labels.
axs[0,0].set_title("Positive smps (N =" + str(positive_smps.shape[0]) + ")");
axs[0,1].set_title("Negative smps (N =" + str(negative_smps.shape[0]) + ")");
axs[0,1].set_ylabel("Red",rotation='horizontal',labelpad=35,fontsize=12)
axs[1,1].set_ylabel("Green",rotation='horizontal',labelpad=35,fontsize=12)
axs[2,1].set_ylabel("Blue",rotation='horizontal',labelpad=35,fontsize=12)
axs[3,1].set_ylabel("RGB",rotation='horizontal',labelpad=35,fontsize=12)
for i in range(4):
    axs[i,0].set_ylabel("Relative frequency")
axs[3,0].set_xlabel("Pixel value")
axs[3,1].set_xlabel("Pixel value")
fig.tight_layout()
 
 
# Load every image listed in df.
N = len(df["path"])
X,y = load_data(N=N,df=df)

# Position that separates the training part (first 80 %) from the rest.
training_portion = 0.8
n_samples = y.shape[0]
split_idx = int(np.round(n_samples * training_portion))

# Shuffle images and labels with one shared, seeded index array so that
# every image keeps its own label.
np.random.seed(42)
idx = np.arange(n_samples)
np.random.shuffle(idx)
X, y = X[idx], y[idx]
# --- Hyper-parameters of the network -------------------------------------
kernel_size = (3,3)
pool_size= (2,2)
first_filters = 32
second_filters = 64
third_filters = 128

dropout_conv = 0.3
dropout_dense = 0.5


def _add_conv_block(net, filters, **first_conv_kwargs):
    """Append [Conv2D -> BatchNorm -> ReLU] x 2 -> MaxPool -> Dropout to ``net``.

    ``first_conv_kwargs`` are forwarded to the first Conv2D of the block only
    (used to declare the input shape of the network). When none are given the
    first convolution is created with ``use_bias=False`` like the second one.

    Every convolution is followed by BatchNormalization: a layer built with
    ``use_bias=False`` has no offset term of its own and relies on the
    normalisation layer that follows it to provide one.
    """
    if not first_conv_kwargs:
        first_conv_kwargs = {"use_bias": False}
    for conv_kwargs in (first_conv_kwargs, {"use_bias": False}):
        net.add(Conv2D(filters, kernel_size, **conv_kwargs))
        net.add(BatchNormalization())
        net.add(Activation("relu"))
    net.add(MaxPool2D(pool_size = pool_size))
    net.add(Dropout(dropout_conv))


model = Sequential()

# Three convolutional blocks with an increasing number of filters.
_add_conv_block(model, first_filters, input_shape = (96, 96, 3))
_add_conv_block(model, second_filters)
_add_conv_block(model, third_filters)

# Classifier head: one hidden dense layer, then a single sigmoid unit.
model.add(Flatten())
model.add(Dense(256, use_bias=False))
model.add(BatchNormalization())
model.add(Activation("relu"))
model.add(Dropout(dropout_dense))

model.add(Dense(1, activation = "sigmoid"))

# Mini-batch size used by the manual training / validation loops.
batch_size = 50

# Single sigmoid output -> binary cross-entropy loss; accuracy is tracked as
# the only additional metric.
adam = keras.optimizers.Adam(0.001)
model.compile(
    optimizer=adam,
    loss=keras.losses.binary_crossentropy,
    metrics=['accuracy'],
)

epochs = 3
# Number of full mini-batches that fit into the training portion X[:split_idx];
# a trailing partial batch is not used.
iterations = np.floor(split_idx / batch_size).astype(int)
for epoch in range(epochs):
    # Running sums of the per-batch metrics. They have to start from zero in
    # every epoch, otherwise the averages shown in the progress bar are
    # undefined (first epoch) or mixed with earlier epochs.
    loss,acc = 0,0
    with trange(iterations) as t:
        t.set_description('Running training epoch ' + str(epoch))
        for i in t:
            start_idx = i * batch_size
            x_batch = X[start_idx:start_idx+batch_size]
            y_batch = y[start_idx:start_idx+batch_size]

            # Indexed below as [loss, accuracy]; relies on the model being
            # compiled with 'accuracy' as its only metric.
            metrics = model.train_on_batch(x_batch, y_batch)

            loss = loss + metrics[0]
            acc = acc + metrics[1]
            # Show the mean loss / accuracy over the batches seen so far.
            t.set_postfix(loss="%.2f" % round(loss / (i+1),2),acc="%.2f" % round(acc / (i+1),2))

 
# Validation on the held-out part of the shuffled data, i.e. the samples from
# position split_idx onwards (the training loop only uses X[:split_idx]).
iterations = np.floor((y.shape[0]-split_idx) / batch_size).astype(int)
loss,acc = 0,0
with trange(iterations) as t:
    for i in t:
        # Offset by split_idx so the batches come from the validation part
        # and not from the beginning of the training part.
        start_idx = split_idx + i * batch_size
        x_batch = X[start_idx:start_idx+batch_size]
        y_batch = y[start_idx:start_idx+batch_size]

        # Evaluation only: test_on_batch does not update the weights.
        metrics = model.test_on_batch(x_batch, y_batch)

        loss = loss + metrics[0]
        acc = acc + metrics[1]
        t.set_postfix(loss="%.2f" % round(loss / (i+1),2),acc="%.2f" % round(acc / (i+1),2))

print("Validation loss:",loss / iterations)
# Drop the references to the training arrays so their memory can be reclaimed
# before the test images are loaded.
X = None
y = None



# Predict the test set in chunks of files so that only `file_batch` images
# are held in memory at a time, then write the submission CSV.
base_test_dir = os.path.join(path, 'test')
test_files = glob(os.path.join(base_test_dir,'*.tif'))
file_batch = 5000
max_idx = len(test_files)
submission_parts = []  # one (id, label) frame per chunk, concatenated once at the end
for idx in range(0, max_idx, file_batch):
    print("Indexes: %i - %i"%(idx, idx+file_batch))
    test_df = pd.DataFrame({'path': test_files[idx:idx+file_batch]})
    # The id is the file name up to the first dot. Taking it from the base
    # name keeps this independent of how deep the data directory is nested.
    test_df['id'] = test_df['path'].map(lambda p: os.path.basename(p).split(".")[0])
    test_df['image'] = test_df['path'].map(cv2.imread)
    K_test = np.stack(test_df["image"].values)
    predictions = model.predict(K_test,verbose = 1)
    # The network ends in a single output unit, so predictions has one column;
    # flatten it to get one value per row.
    test_df['label'] = np.ravel(predictions)
    submission_parts.append(test_df[["id", "label"]])

if submission_parts:
    submission = pd.concat(submission_parts)
else:
    # No test files found: still produce a (header-only) submission frame.
    submission = pd.DataFrame(columns=["id", "label"])
submission.head()


submission.to_csv(submitname, index = False, header = True)