In [None]:
import sklearn
import keras
import sys
import time
import pandas as pd
import numpy as np
import nltk
import tensorflow as tf
import math
import cv2

from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM, Activation, Dropout, Flatten, Dense, Bidirectional
from keras.layers.pooling import GlobalAveragePooling2D
from keras.layers import Conv2D, MaxPooling2D, MaxPooling1D, Input, BatchNormalization, concatenate
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras import backend as K
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Model
from keras.preprocessing.image import ImageDataGenerator

from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support
from sklearn.utils import shuffle

import skimage
from skimage import io
from skimage.transform import resize

from numpy.random import RandomState

In [None]:
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
import re

# Text-normalisation resources shared by preprocess() below.
# NOTE: this rebinds the name `stopwords` from the nltk.corpus module imported
# just above to the English stop-word collection it returns.
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def preprocess(data):
    """Strip digits and stop words from each title and lemmatize what remains.

    Parameters
    ----------
    data : iterable of str
        Titles to clean. Stop-word matching is case-sensitive, so callers
        should lower-case the titles first (the notebook does).

    Returns
    -------
    list of str
        One cleaned string per input title. Each string starts with a single
        space and every kept word is followed by a single space, so splitting
        the result on ' ' yields the same pieces as before.
    """
    # Build the lookup set once per call: testing membership against the
    # module-level stop-word collection for every word is needlessly slow.
    stop_set = set(stopwords)
    newData = []
    for title in data:
        # Drop every run of digits before tokenising.
        title = re.sub(r'[0-9]+', '', title)
        kept = [lemmatizer.lemmatize(word) + ' '
                for word in title.split(' ')
                if word not in stop_set]
        # Join once instead of growing the string word by word.
        newData.append(" " + "".join(kept))

    return newData

In [None]:
def recall_m(y_true, y_pred):
    """Recall over the batch, built from Keras backend ops.

    Counts rounded, clipped products of `y_true` and `y_pred` as true
    positives and divides by the rounded, clipped sum of `y_true`;
    K.epsilon() keeps the denominator non-zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over the batch, built from Keras backend ops.

    Counts rounded, clipped products of `y_true` and `y_pred` as true
    positives and divides by the rounded, clipped sum of `y_pred`;
    K.epsilon() keeps the denominator non-zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(predicted) + K.epsilon())


def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of precision_m and recall_m for the same inputs.

    K.epsilon() is added to the denominator so the ratio stays defined when
    both precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    numerator = p * r
    denominator = p + r + K.epsilon()
    return 2*(numerator/denominator)


In [None]:
def getMaximumLen(data):
    """Return the largest number of ' '-separated pieces in any string of `data`.

    Pieces are produced by ``str.split(' ')``, so empty pieces caused by
    leading, trailing or repeated spaces are counted as well. Returns 0 when
    `data` is empty.
    """
    return max((len(tweet.split(' ')) for tweet in data), default=0)

In [None]:
# Load four columns of the header-less training CSV under readable names.
# NOTE(review): pandas ignores the element order of `usecols`, so the names are
# presumably matched to file columns 1, 3, 5, 6 in that order ("Category ID"
# -> 5, "Category" -> 6) rather than in the 1, 3, 6, 5 order written here —
# confirm against train.csv.
columns = ["Image Path", "Title", "Category ID", "Category"]
data = pd.read_csv(
    'train.csv',
    encoding="ISO-8859-1",
    header=None,
    usecols=[1,3,6,5],
    names=columns,
)

In [None]:
# Display the loaded frame as the cell output to check the parsed columns.
data

In [None]:
# Split the frame into image paths, cleaned titles and one-hot labels.
image = data['Image Path']
x = data['Title'].str.lower()
# Keep the fitted encoder instead of discarding it: it holds the mapping
# between category values and the integer class indices, which is needed to
# turn predicted indices back into categories.
label_encoder = LabelEncoder()
# num_classes=30 matches the 30-unit softmax output layers of the models below.
y = to_categorical(label_encoder.fit_transform(data['Category']), num_classes=30)
x = preprocess(x)

In [None]:
# Preview only the first few cleaned titles: `x` is a plain list, so a bare
# `x` would write every title into the notebook output.
x[:10]

In [None]:
# Fit a Keras Tokenizer on the cleaned titles to build the word vocabulary.
t = Tokenizer()
t.fit_on_texts(x)

In [None]:
# Word -> integer index mapping learned by the tokenizer.
wordIndex = t.word_index

In [None]:
# Number of rows needed in the embedding matrix. The +1 is presumably because
# Tokenizer indices start at 1, leaving index 0 for padding — TODO confirm.
vocab_size = len(wordIndex) + 1

In [None]:
# Common sequence length: the largest ' '-separated piece count of any title.
# NOTE(review): this count comes from str.split(' '), not from the tokenizer,
# so it may differ from the longest encoded sequence — verify if truncation matters.
maxlen = getMaximumLen(x)
# Turn each title into a list of word indices, then pad at the end to maxlen.
encodedX = t.texts_to_sequences(x)
X = pad_sequences(encodedX, maxlen=maxlen, padding='post')

In [None]:
from tqdm import tqdm

# NOTE(review): `embedding_vector` is not read anywhere in the visible notebook.
embedding_vector = {}
# Open the GloVe text file with an explicit encoding so that reading it does
# not depend on the platform's default locale encoding (assumes the file is
# UTF-8 — TODO confirm). The handle is consumed by the parsing cell that follows.
f = open('glove.6B.100d.txt', encoding='utf-8')

In [None]:
import _pickle

# Parse the GloVe file into {word: float32 vector}.
embeddings_index = dict()
# `with f` closes the handle opened in the previous cell once parsing is done
# instead of leaving it open for the rest of the session. Re-running this cell
# without re-opening the file now fails loudly rather than silently producing
# an empty index from an exhausted handle.
with f:
    for line in tqdm(f):
        values = line.split(" ")
        # First field is the word, the remaining fields are its vector.
        key = values[0]
        c = np.asarray(values[1:], dtype='float32')
        embeddings_index[key] = c

In [None]:
# One row per token index and 100 columns, matching the 100-d GloVe file and
# the Embedding(vocab_size, 100, ...) layers that receive this matrix.
embeddings_matrix = np.zeros((vocab_size, 100))

In [None]:
# Fill row i with the pretrained vector of word i; words missing from the
# GloVe index get a random standard-normal vector instead.
# The fallback vectors come from a dedicated, seeded generator so the matrix
# is identical on every run (seed 1 matches the train/validation split below).
oov_rng = np.random.RandomState(1)
for word, i in t.word_index.items():
    vector = embeddings_index.get(word)
    if vector is not None:
        embeddings_matrix[i] = vector
    else:
        embeddings_matrix[i] = oov_rng.randn(100)

In [None]:
# 80/20 train/validation split. Sequences, labels and image paths are passed
# together so the three stay row-aligned; random_state=1 fixes the partition.
xtrain, xval, ytrain, yval, imagetrain, imageval = train_test_split(X, y, image, train_size=0.8, random_state=1)

In [None]:
import functools

# Top-3 categorical accuracy. A functools.partial has no __name__ of its own,
# so one is assigned explicitly.
top3_acc = functools.partial(keras.metrics.top_k_categorical_accuracy, k=3)

top3_acc.__name__ = 'top3_acc'

# Baseline: GloVe-initialised embedding -> BiLSTM -> max-pool over the
# sequence -> dense head with a 30-way softmax.
# Layer outputs get their own names so the notebook-level `x` (the
# preprocessed titles) is not overwritten and earlier cells stay re-runnable.
inp = Input(shape=(maxlen,))
hidden = Embedding(vocab_size, 100, weights=[embeddings_matrix])(inp)
hidden = Bidirectional(LSTM(100, return_sequences=True, dropout=0.3, recurrent_dropout=0.3))(hidden)
hidden = keras.layers.GlobalMaxPool1D()(hidden)
hidden = Dense(50, activation="relu")(hidden)
hidden = Dropout(0.3)(hidden)
output = Dense(30, activation="softmax")(hidden)
model = Model(inputs=inp, outputs=output)

model.summary()

# Compile exactly once. A model keeps only its most recent compile(), so this
# RMSprop configuration with accuracy and F1 is the one training uses.
optimizer = keras.optimizers.RMSprop()
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy',f1_m])

# Stop after 10 epochs without val_loss improvement and keep the best weights
# on disk in lstm.h5.
history = model.fit(xtrain, ytrain, 
                validation_data=(xval, yval), 
                epochs=50, batch_size=128, 
                callbacks=[EarlyStopping(monitor='val_loss', patience=10),
                           ModelCheckpoint(filepath='lstm.h5', monitor='val_loss', save_best_only=True)]
                   )

In [None]:
import random 
import functools

# Top-3 categorical accuracy, passed as a metric in create_model below. The
# same definition also appears in the previous training cell.
top3_acc = functools.partial(keras.metrics.top_k_categorical_accuracy, k=3)

# A functools.partial has no __name__; presumably Keras uses it to label the
# metric (random_search reads the history key 'val_top3_acc').
top3_acc.__name__ = 'top3_acc'


def create_model(params):
    """Build and compile a stacked bidirectional-LSTM text classifier.

    `params` is a dict read for the keys 'LSTM_layers', 'LSTM_units',
    'LSTM_dropout', 'recc_dropout', 'dense_layers', 'dense_units' and
    'dropout'. The non-trainable embedding layer is sized from the
    notebook-level `vocab_size`, `embeddings_matrix` and `maxlen`.

    Returns the compiled keras.Sequential model (Adam optimizer, categorical
    cross-entropy loss; metrics: accuracy, top3_acc, f1_m).
    """
    def bilstm():
        # Every recurrent layer shares one configuration and returns the full
        # sequence, which the GlobalMaxPool1D layer added afterwards consumes.
        return Bidirectional(LSTM(units=params['LSTM_units'],
                                  return_sequences=True,
                                  dropout=params['LSTM_dropout'],
                                  recurrent_dropout=params['recc_dropout']))

    net = keras.Sequential()
    net.add(keras.layers.Embedding(vocab_size, 100, weights=[embeddings_matrix],
                                   input_length=maxlen, trainable=False))

    # LSTM_layers - 1 stacked layers, then one more that is always present.
    extra_layers = params['LSTM_layers'] - 1
    for _ in range(extra_layers):
        net.add(bilstm())
    net.add(bilstm())

    net.add(keras.layers.GlobalMaxPool1D())

    # Dense head: each hidden layer is followed by its own dropout.
    for _ in range(params['dense_layers']):
        net.add(keras.layers.Dense(units=params['dense_units'], activation='relu'))
        net.add(keras.layers.Dropout(params['dropout']))

    net.add(keras.layers.Dense(units=30, activation='softmax'))

    net.compile(optimizer=keras.optimizers.Adam(),
                loss='categorical_crossentropy',
                metrics=['accuracy',top3_acc, f1_m])

    return net

def random_search(params, n, xtrain, ytrain, xval, yval, epochs):
    """Train `n` models on distinct, randomly drawn hyper-parameter combinations.

    Parameters
    ----------
    params : dict
        Maps each of 'LSTM_dropout', 'recc_dropout', 'dropout', 'LSTM_units',
        'dense_units', 'batch_size', 'LSTM_layers' and 'dense_layers' to a
        sequence of candidate values.
    n : int
        Number of distinct combinations to train.
    xtrain, ytrain, xval, yval
        Training and validation data, passed straight to ``model.fit``.
    epochs : int
        Maximum number of epochs per model; early stopping may end sooner.

    Returns
    -------
    dict
        ``str(combination)`` -> ``[val accuracy, val F1, val top-3 accuracy]``,
        each a per-epoch list taken from the Keras training history.

    Raises
    ------
    ValueError
        If `n` exceeds the number of distinct combinations in `params`;
        without this check the rejection loop below would never terminate.
    """
    # Fixed draw order: it determines both the sequence of random.choice calls
    # and the key order inside str(passing_param).
    names = ('LSTM_dropout', 'recc_dropout', 'dropout', 'LSTM_units',
             'dense_units', 'batch_size', 'LSTM_layers', 'dense_layers')

    # Combinations are de-duplicated by their string form, so count distinct
    # candidates by repr as well.
    distinct = 1
    for name in names:
        distinct *= len({repr(value) for value in params[name]})
    if n > distinct:
        raise ValueError(
            "requested %d models but params only allows %d distinct combinations"
            % (n, distinct))

    models = {}
    i = 0
    while (i<n):
        print("Iteration number " , i+1)
        passing_param = {name: random.choice(params[name]) for name in names}
        key = str(passing_param)
        if key in models:
            # Already trained this combination; draw again.
            continue

        i+=1
        model = create_model(passing_param)
        history = model.fit(xtrain, ytrain, validation_data=(xval, yval), epochs=epochs,
                            batch_size=passing_param['batch_size'],
                            callbacks=[EarlyStopping(monitor='val_loss', patience=10,
                                                     restore_best_weights=True),
                                       ModelCheckpoint(filepath='bestmodel'+str(i)+'.h5',
                                                       monitor='val_loss', save_best_only=True)])

        logs = history.history
        # Older Keras releases log the 'accuracy' metric as 'val_acc', newer
        # ones as 'val_accuracy'; accept either instead of raising KeyError.
        val_acc = logs['val_acc'] if 'val_acc' in logs else logs['val_accuracy']
        val_f1_m = logs['val_f1_m']
        val_top3_acc = logs['val_top3_acc']
        models.update({key: [val_acc, val_f1_m, val_top3_acc] })

    return models

In [None]:
# Candidate values for every hyper-parameter that random_search samples.
params = dict(
    LSTM_dropout=[0.3, 0.4, 0.5],
    dropout=[0.3, 0.4, 0.5],
    LSTM_units=[100, 150, 200, 250],
    dense_units=[32, 50, 128],
    batch_size=[32, 64, 128],
    recc_dropout=[0.2, 0.3, 0.4],
    LSTM_layers=[1, 2, 3],
    dense_layers=[1, 2, 3],
)

# Train 15 distinct combinations for at most 30 epochs each.
models = random_search(params, n=15, xtrain=xtrain, ytrain=ytrain,
                       xval=xval, yval=yval, epochs=30)

In [None]:
# Pick the combination with the highest validation F1 averaged over the first
# epochs, then report its validation accuracy and F1 at the end of that window.
BestAccuracy = 0
BestF1 = 0
BestHyp = []
# Best average seen so far. It is tracked separately from BestF1 so that an
# average is only ever compared with another average.
best_avg_f1 = 0
# Number of leading epochs that are averaged.
avg_window = 10

for key, value in models.items():
    # value is [val accuracy, val F1, val top-3 accuracy], one entry per epoch.
    val_acc, val_f1 = value[0], value[1]

    # A run can contain fewer epochs than the window (small epoch budget), so
    # clamp instead of indexing past the end of the history.
    window = min(avg_window, len(val_f1), len(val_acc))
    if window == 0:
        continue

    avgF1 = sum(val_f1[:window]) / window

    if (avgF1>best_avg_f1):
        best_avg_f1 = avgF1
        BestF1 = val_f1[window-1]
        BestAccuracy = val_acc[window-1]
        BestHyp = key

print("Hyperparameters of the best model =  " , BestHyp)
print('Validation Accuracy is ', BestAccuracy, ' and Validation F-Mesure is ', BestF1)