In [1]:
import os 
os.environ['KERAS_BACKEND'] = "plaidml.keras.backend"
from tensorflow import keras

In [4]:
from glob import glob
import ast
import cv2
import time
import ast
from tqdm import tqdm
import matplotlib
import matplotlib.pyplot as pltc
import numpy as np
import pandas as pd 
import tensorflow as tf
import matplotlib.pyplot as plt
from dask import bag, threaded

from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation
from tensorflow.keras.metrics import categorical_accuracy, top_k_categorical_accuracy, categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications.nasnet import NASNetMobile
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras import backend as K
from tensorflow.keras.applications import MobileNet

BASE_SIZE = 256
DP_DIR = '../input/shuffle_csv/'
INPUT_DIR = '../input/kaggle/'
NCSVS = 100
NCATS = 48

In [8]:
startTime = time.time()

In [7]:
#clean spaces in name
classes_path = os.listdir(INPUT_DIR + 'train_sim_animals/')
classes_path = sorted(classes_path, key=lambda s: s.lower())
class_dict = {x[:-4].replace(" ", "_"):i for i, x in enumerate(classes_path)}
labels = {x[:-4].replace(" ", "_") for i, x in enumerate(classes_path)}

n_labels = len(labels)
print("Number of labels: {}".format(n_labels))

fileList = glob(INPUT_DIR + "train_sim_animals/*.csv")     

n_files = n_labels #number of csv files same as labels due to stupid structure.

#time is sacred HARDCODED FOR THE COMP
#n_records = 49707919
size = 80

#for f in fileList: saving time
#    n_records += sum(1 for line in open(f))
#print("Number of records: {}".format(n_records))

Number of labels: 48
Number of records: 49707919


In [19]:
def top_3_accuracy(y_true, y_pred):
    return top_k_categorical_accuracy(y_true, y_pred, k=3)

In [20]:
# to image from stroke
def drawing_to_np(drawing, shape=(size, size)):
    drawing = eval(drawing)
    fig, ax = plt.subplots()
    for x,y in drawing:
        ax.plot(x, y, marker='.')
        ax.axis('off')
    fig.canvas.draw()
    # Convert images to numpy arrat
    np_drawing = np.array(fig.canvas.renderer._renderer)
    plt.close(fig)
    img = cv2.resize(np_drawing, shape)
    img_gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    img_expanded = img_gray[:, :, np.newaxis]
    return img_expanded

In [21]:
def draw_cv2_reshape_normalized(raw_strokes, size=size, lw=6):
    img = np.zeros((BASE_SIZE, BASE_SIZE), np.uint8)
    for stroke in raw_strokes:
        for i in range(len(stroke[0]) - 1):
            _ = cv2.line(img, (stroke[0][i], stroke[1][i]), (stroke[0][i + 1], stroke[1][i + 1]), 255, lw)

    img = cv2.resize(img, (size, size))
    img = img / 255.
    img = img[:, :, np.newaxis]
    return img
    
    

In [22]:
def draw_cv2(raw_strokes, size=256, lw=6):
    img = np.zeros((BASE_SIZE, BASE_SIZE), np.uint8)
    for stroke in raw_strokes:
        for i in range(len(stroke[0]) - 1):
            _ = cv2.line(img, (stroke[0][i], stroke[1][i]), (stroke[0][i + 1], stroke[1][i + 1]), 255, lw)
    if size != BASE_SIZE:
        return cv2.resize(img, (size, size))
    else:
        return img

#ADD DATA AUGMENTATION TO BOOST
def image_generator(size, batchsize, ks, lw=6):
    while True:
        for k in np.random.permutation(ks):
            filename = os.path.join(DP_DIR, 'train_k{}.csv.gz'.format(k))
            for df in pd.read_csv(filename, chunksize=batchsize):
                df['drawing'] = df['drawing'].apply(ast.literal_eval)
                x = np.zeros((len(df), size, size))
                for i, raw_strokes in enumerate(df.drawing.values):
                    x[i] = draw_cv2(raw_strokes, size=size, lw=lw)
                x = x / 255.
                x = x.reshape((len(df), size, size, 1)).astype(np.float32)
                y = tf.keras.utils.to_categorical(df.y, num_classes=NCATS)
                yield x, y

def df_to_image_array(df, size=size, lw=6):
    df['drawing'] = df['drawing'].apply(ast.literal_eval)
    x = np.zeros((len(df), size, size))
    for i, raw_strokes in enumerate(df.drawing.values):
        x[i] = draw_cv2(raw_strokes, size=size, lw=lw)
    x = x / 255.
    x = x.reshape((len(df), size, size, 1)).astype(np.float32)
    return x



In [23]:
STEPS = 500
batchsize = 512
epochs = 15

In [24]:
valid_df = pd.read_csv(os.path.join(DP_DIR, 'train_k{}.csv.gz'.format(NCSVS - 1)), nrows=30000)
x_valid = df_to_image_array(valid_df, size)
y_valid = tf.keras.utils.to_categorical(valid_df.y, num_classes=NCATS)
print(x_valid.shape, y_valid.shape)
print('Validation array memory {:.2f} GB'.format(x_valid.nbytes / 1024.**3 ))

(14085, 80, 80, 1) (14085, 340)
Validation array memory 0.34 GB


In [25]:
train_datagen = image_generator(size=size, batchsize=batchsize, ks=range(NCSVS - 1))

In [26]:
base_model = MobileNet(input_shape=(size, size, 1), include_top=False, weights=None, classes=n_labels)

# add a global spatial average pooling layer
x = base_model.output
x = Flatten()(x)
# let's add a fully-connected layer
x = Dense(1024, activation='relu')(x)
predictions = Dense(n_labels, activation='softmax')(x)
# this is the model we will train
model = Model(inputs=base_model.input, outputs=predictions)

In [27]:
model.compile(optimizer=Adam(lr=1e-4), loss='categorical_crossentropy',
              metrics=[categorical_crossentropy, categorical_accuracy, top_3_accuracy])

In [28]:
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 80, 80, 1)]       0         
_________________________________________________________________
conv1 (Conv2D)               (None, 40, 40, 32)        288       
_________________________________________________________________
conv1_bn (BatchNormalization (None, 40, 40, 32)        128       
_________________________________________________________________
conv1_relu (ReLU)            (None, 40, 40, 32)        0         
_________________________________________________________________
conv_dw_1 (DepthwiseConv2D)  (None, 40, 40, 32)        288       
_________________________________________________________________
conv_dw_1_bn (BatchNormaliza (None, 40, 40, 32)        128       
_________________________________________________________________
conv_dw_1_relu (ReLU)        (None, 40, 40, 32)        0   

In [29]:
callbacks = [
    ReduceLROnPlateau(monitor='val_categorical_accuracy', factor=0.5, patience=5,
                      min_delta=0.005, mode='max', cooldown=3, verbose=1)
]

hist = model.fit_generator(
    train_datagen, steps_per_epoch=STEPS, epochs=epochs, verbose=1,
    validation_data=(x_valid, y_valid),
    callbacks = callbacks
)

Epoch 1/15


InvalidArgumentError:  logits and labels must be broadcastable: logits_size=[512,48] labels_size=[512,340]
	 [[node categorical_crossentropy/softmax_cross_entropy_with_logits (defined at <ipython-input-29-3ec8a0336d0b>:9) ]] [Op:__inference_train_function_28146]

Function call stack:
train_function


In [None]:
def gen_graph(history, title):
    plt.plot(history.history['categorical_accuracy'])
    plt.plot(history.history['val_categorical_accuracy'])
    plt.plot(history.history['top_3_accuracy'])
    plt.plot(history.history['val_top_3_accuracy'])
    plt.title('Accuracy ' + title)
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation', 'Test top 3', 'Validation top 3'], loc='upper left')
    plt.show()
    plt.plot(history.history['categorical_crossentropy'])
    plt.plot(history.history['val_categorical_crossentropy'])
    plt.title('Loss ' + title)
    plt.ylabel('MLogLoss')
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [None]:
#plot
gen_graph(hist, 
              "Simple net lul")

In [None]:
pred_results = []
chunksize = 10000
reader = pd.read_csv(INPUT_DIR + 'test_simplified.csv', chunksize=chunksize)
for chunk in tqdm(reader):
    imgs = df_to_image_array(chunk)
    pred = model.predict(imgs, verbose=1)
    top_3 =  np.argsort(-pred)[:, 0:3]  
    pred_results.append(top_3)
print("Finished test predictions...")


In [None]:
#prepare data for saving
reverse_dict = {v: k for k, v in class_dict.items()}
pred_results = np.concatenate(pred_results)
print("Finished data prep...")


In [None]:
preds_df = pd.DataFrame({'first': pred_results[:,0], 'second': pred_results[:,1], 'third': pred_results[:,2]})
preds_df = preds_df.replace(reverse_dict)

preds_df['words'] = preds_df['first'] + " " + preds_df['second'] + " " + preds_df['third']

sub = pd.read_csv(INPUT_DIR + 'sample_submission.csv', index_col=['key_id'])
sub['word'] = preds_df.words.values
sub.to_csv('1class_per_label_proto.csv')
sub.head()

In [None]:
endTime = time.time()
print(endTime - startTime)