In [None]:
#Import libraries
import numpy as np 
import pandas as pd 
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import math
import gc
import cv2
import pickle

In [None]:
#View data files
# Walk the Kaggle input directory recursively and print the full path of every file found.
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

In [None]:
#Preprocessing functions. Credit to user iafoss
# Raw image dimensions: the flat pixel columns of each parquet row are reshaped to (HEIGHT, WIDTH).
HEIGHT = 137
WIDTH = 236
# Default side length (pixels) of the square image returned by crop_resize;
# also the spatial input size of the models defined below.
SIZE = 128

def bbox(img):
    """Locate the bounding box of the truthy entries of a 2-D array.

    Returns (rmin, rmax, cmin, cmax): the first and last row index and the
    first and last column index that contain at least one truthy value
    (both ends inclusive). Raises IndexError when no entry is truthy.
    """
    # Indices of the rows / columns holding at least one truthy value.
    row_hits = np.nonzero(np.any(img, axis=1))[0]
    col_hits = np.nonzero(np.any(img, axis=0))[0]
    top, bottom = row_hits[0], row_hits[-1]
    left, right = col_hits[0], col_hits[-1]
    return top, bottom, left, right

def crop_resize(img0, size=SIZE, pad=16):
    """Crop an image around its bright content, pad it towards a square and resize it.

    Parameters
    ----------
    img0 : 2-D array indexed as [row, column]. It is no longer modified by
        this function (the crop is copied before low-intensity pixels are zeroed).
    size : side length of the returned square image.
    pad : extra pixels added to the longer side of the crop before resizing.

    Returns
    -------
    The result of cv2.resize(..., (size, size)) applied to the padded crop.

    Raises
    ------
    IndexError (from bbox) when no pixel of img0[5:-5, 5:-5] is greater than 80.
    """
    # Find the content box while ignoring a 5-pixel border of the image.
    # NOTE(review): the coordinates returned here are relative to the interior
    # img0[5:-5, 5:-5] but are applied to img0 unshifted below; kept as-is so
    # that preprocessing stays identical to what the models were trained on.
    ymin,ymax,xmin,xmax = bbox(img0[5:-5,5:-5] > 80)
    # Widen the box by a margin (13 px horizontally, 10 px vertically), clamped to the image.
    xmin = xmin - 13 if (xmin > 13) else 0
    ymin = ymin - 10 if (ymin > 10) else 0
    xmax = xmax + 13 if (xmax < WIDTH - 13) else WIDTH
    ymax = ymax + 10 if (ymax < HEIGHT - 10) else HEIGHT
    # Copy the crop: a plain slice is a view, so the thresholding below would
    # otherwise write zeros straight into the caller's array.
    img = img0[ymin:ymax,xmin:xmax].copy()
    # Zero out low-intensity pixels.
    img[img < 28] = 0
    lx, ly = xmax-xmin,ymax-ymin
    l = max(lx,ly) + pad
    # Zero-pad both axes symmetrically up to (roughly) an l x l canvas so the
    # aspect ratio is preserved by the square resize.
    img = np.pad(img, [((l-ly)//2,), ((l-lx)//2,)], mode='constant')
    return cv2.resize(img,(size,size))

In [None]:
#Create df of labels
trainlbl = pd.read_csv('/kaggle/input/bengaliai-cv19/train.csv')
# Display how many distinct values the consonant_diacritic column takes.
len(pd.unique(trainlbl['consonant_diacritic']))

In [None]:
#Preprocess training data
# Convert every training shard (0-3): the training cells below load
# train_pre_0 ... train_pre_3, so restricting this loop to a single shard
# leaves the other three files missing on a fresh run.
# The pickles are written to the working directory; the training cells read
# same-named files from /kaggle/input/train-pre/ (presumably these outputs
# re-uploaded as a dataset - confirm).
for i in range(4):
    data = pd.read_parquet('/kaggle/input/bengaliai-cv19/train_image_data_' + str(i) + '.parquet')
    # Drop the first column, reshape each row to a HEIGHT x WIDTH image and invert the intensities.
    data = 255 - data.iloc[:, 1:].values.reshape(-1, HEIGHT, WIDTH).astype(np.uint8)
    resized = []
    for idx in range(len(data)):
        # Stretch intensities so the brightest pixel of each image becomes 255.
        img = (data[idx]*(255.0/data[idx].max())).astype(np.uint8)
        resized.append(crop_resize(img))
    # Release the raw shard before building the stacked array to limit peak memory.
    del data
    gc.collect()
    train = np.array(resized).reshape(len(resized),SIZE,SIZE,1)
    del resized
    gc.collect()
    with open('train_pre_' + str(i), 'wb') as gen_save:
        pickle.dump(train, gen_save)

In [None]:
#Model for lbl1
# Two conv/max-pool stages followed by a dense layer and a 168-way softmax.
# model1's predictions are emitted as the grapheme_root target in the test cell.
densenodes = 128
kernsize = 3
poolsize = 2

model1 = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, (kernsize, kernsize), padding='same', activation=tf.nn.relu, input_shape=(SIZE, SIZE, 1)),
    tf.keras.layers.MaxPooling2D((poolsize, poolsize), strides=2),
    # input_shape is only meaningful on the first layer, so it is not repeated here.
    tf.keras.layers.Conv2D(64, (kernsize, kernsize), padding='same', activation=tf.nn.relu),
    tf.keras.layers.MaxPooling2D((poolsize, poolsize), strides=2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(densenodes, activation=tf.nn.relu),
    tf.keras.layers.Dense(168, activation=tf.nn.softmax),
])

# Build the optimizer directly (no eval of a source string) and use the
# `learning_rate` keyword; `lr` is a deprecated alias.
model1.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
#Model for lbl2
# Two conv/max-pool stages followed by a dense layer and an 11-way softmax.
# model2's predictions are emitted as the vowel_diacritic target in the test cell.
densenodes = 128
kernsize = 3
poolsize = 2

model2 = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, (kernsize, kernsize), padding='same', activation=tf.nn.relu, input_shape=(SIZE, SIZE, 1)),
    tf.keras.layers.MaxPooling2D((poolsize, poolsize), strides=2),
    # input_shape is only meaningful on the first layer, so it is not repeated here.
    tf.keras.layers.Conv2D(64, (kernsize, kernsize), padding='same', activation=tf.nn.relu),
    tf.keras.layers.MaxPooling2D((poolsize, poolsize), strides=2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(densenodes, activation=tf.nn.relu),
    tf.keras.layers.Dense(11, activation=tf.nn.softmax),
])

# Build the optimizer directly (no eval of a source string) and use the
# `learning_rate` keyword; `lr` is a deprecated alias.
model2.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
#Model for lbl3
# Two conv/max-pool stages followed by a dense layer and a 7-way softmax.
# model3's predictions are emitted as the consonant_diacritic target in the test cell.
densenodes = 128
kernsize = 3
poolsize = 2

model3 = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, (kernsize, kernsize), padding='same', activation=tf.nn.relu, input_shape=(SIZE, SIZE, 1)),
    tf.keras.layers.MaxPooling2D((poolsize, poolsize), strides=2),
    # input_shape is only meaningful on the first layer, so it is not repeated here.
    tf.keras.layers.Conv2D(64, (kernsize, kernsize), padding='same', activation=tf.nn.relu),
    tf.keras.layers.MaxPooling2D((poolsize, poolsize), strides=2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(densenodes, activation=tf.nn.relu),
    tf.keras.layers.Dense(7, activation=tf.nn.softmax),
])

# Build the optimizer directly (no eval of a source string) and use the
# `learning_rate` keyword; `lr` is a deprecated alias.
model3.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Mini-batch size; the training cells use it to derive steps_per_epoch.
batchsize = 32

In [None]:
#Train model 1
# Fits model1 on each of the four preprocessed shards in turn (10 epochs per
# shard) using column 1 of train.csv as the label, then saves the model.
for i in range(4):
    # SECURITY: pickle.load can execute arbitrary code; only load shards you produced yourself.
    with open('/kaggle/input/train-pre/train_pre_' + str(i), 'rb') as train_load:
        train = pickle.load(train_load)
    num_train_examples = train.shape[0]
    # Label rows are located by position: this assumes every shard holds the
    # same number of images and that shards follow train.csv row order.
    lblstart = num_train_examples * i
    trainlbl = pd.read_csv('/kaggle/input/bengaliai-cv19/train.csv').iloc[lblstart:lblstart + num_train_examples,1]
    # Pass batch_size explicitly so the generator agrees with the steps_per_epoch
    # computed from `batchsize` below instead of relying on flow()'s default.
    train_data_gen = ImageDataGenerator(rescale=1./255).flow(train, trainlbl, batch_size=batchsize)
    del train
    del trainlbl
    gc.collect()
    # NOTE(review): fit_generator is deprecated in newer TensorFlow releases in favour of fit.
    model1.fit_generator(train_data_gen, epochs=10, steps_per_epoch=math.ceil(num_train_examples/batchsize), verbose=1)
    del train_data_gen
    gc.collect()
model1.save('model1.h5')

In [None]:
#Train model 2
# Fits model2 on each of the four preprocessed shards in turn (10 epochs per
# shard) using column 2 of train.csv as the label, then saves the model.
for i in range(4):
    # SECURITY: pickle.load can execute arbitrary code; only load shards you produced yourself.
    with open('/kaggle/input/train-pre/train_pre_' + str(i), 'rb') as train_load:
        train = pickle.load(train_load)
    num_train_examples = train.shape[0]
    # Label rows are located by position: this assumes every shard holds the
    # same number of images and that shards follow train.csv row order.
    lblstart = num_train_examples * i
    trainlbl = pd.read_csv('/kaggle/input/bengaliai-cv19/train.csv').iloc[lblstart:lblstart + num_train_examples,2]
    # Pass batch_size explicitly so the generator agrees with the steps_per_epoch
    # computed from `batchsize` below instead of relying on flow()'s default.
    train_data_gen = ImageDataGenerator(rescale=1./255).flow(train, trainlbl, batch_size=batchsize)
    del train
    del trainlbl
    gc.collect()
    # NOTE(review): fit_generator is deprecated in newer TensorFlow releases in favour of fit.
    model2.fit_generator(train_data_gen, epochs=10, steps_per_epoch=math.ceil(num_train_examples/batchsize), verbose=1)
    del train_data_gen
    gc.collect()
model2.save('model2.h5')

In [None]:
#Train model 3
# Fits model3 on each of the four preprocessed shards in turn (10 epochs per
# shard) using column 3 of train.csv as the label, then saves the model.
for i in range(4):
    # SECURITY: pickle.load can execute arbitrary code; only load shards you produced yourself.
    with open('/kaggle/input/train-pre/train_pre_' + str(i), 'rb') as train_load:
        train = pickle.load(train_load)
    num_train_examples = train.shape[0]
    # Label rows are located by position: this assumes every shard holds the
    # same number of images and that shards follow train.csv row order.
    lblstart = num_train_examples * i
    trainlbl = pd.read_csv('/kaggle/input/bengaliai-cv19/train.csv').iloc[lblstart:lblstart + num_train_examples,3]
    # Pass batch_size explicitly so the generator agrees with the steps_per_epoch
    # computed from `batchsize` below instead of relying on flow()'s default.
    train_data_gen = ImageDataGenerator(rescale=1./255).flow(train, trainlbl, batch_size=batchsize)
    del train
    del trainlbl
    gc.collect()
    # NOTE(review): fit_generator is deprecated in newer TensorFlow releases in favour of fit.
    model3.fit_generator(train_data_gen, epochs=10, steps_per_epoch=math.ceil(num_train_examples/batchsize), verbose=1)
    del train_data_gen
    gc.collect()
model3.save('model3.h5')

In [None]:
#Load models
# Restore the three saved classifiers from the attached models dataset.
model1, model2, model3 = [
    tf.keras.models.load_model(saved_path)
    for saved_path in (
        '/kaggle/input/models-test1/model1.h5',
        '/kaggle/input/models-test1/model2.h5',
        '/kaggle/input/models-test1/model3.h5',
    )
]

In [None]:
#Classify test data
# For each test shard: preprocess the images exactly like the training data,
# run the three models and collect one (row_id, target) frame per model.
out1 = []
out2 = []
out3 = []
# (model, row_id suffix, list collecting that model's per-shard frames)
heads = [
    (model1, '_grapheme_root', out1),
    (model2, '_vowel_diacritic', out2),
    (model3, '_consonant_diacritic', out3),
]
for i in range(4):
    data = pd.read_parquet('/kaggle/input/bengaliai-cv19/test_image_data_' + str(i) + '.parquet')
    # First column holds the image ids (renamed to row_id below).
    lbls = data.iloc[:,0]
    # Reshape the pixel columns to HEIGHT x WIDTH images and invert the intensities.
    data = 255 - data.iloc[:, 1:].values.reshape(-1, HEIGHT, WIDTH).astype(np.uint8)
    resized = []
    for idx in range(len(data)):
        # Stretch intensities so the brightest pixel of each image becomes 255.
        img = (data[idx]*(255.0/data[idx].max())).astype(np.uint8)
        resized.append(crop_resize(img))
    del data
    gc.collect()
    test = np.array(resized).reshape(len(resized),SIZE,SIZE,1)
    del resized
    gc.collect()
    # The models were trained on inputs rescaled by 1/255 (ImageDataGenerator(rescale=1./255)),
    # so the same scaling must be applied here; feeding raw 0-255 pixels
    # mismatches the training distribution. Note: this holds the shard as float32.
    test = test.astype(np.float32) * (1./255)
    for model, suffix, collected in heads:
        predictions = model.predict(test)
        # Predicted class = index of the largest softmax output per image.
        pred_list = np.argmax(predictions, axis=1)
        outdf = lbls.to_frame().rename(columns={'image_id':'row_id'})
        outdf['target'] = pred_list
        outdf['row_id'] = outdf['row_id'] + suffix
        collected.append(outdf)
out1 = pd.concat(out1)
out2 = pd.concat(out2)
out3 = pd.concat(out3)

In [None]:
#Generate submission
# Stack the three per-target prediction frames under a fresh 0..n-1 index and write the CSV.
submission = pd.concat([out1, out2, out3], ignore_index=True)
submission.to_csv('submission.csv', index=False)