In [None]:

# # GLOVE

# # Initialise config

In [None]:


# Install / upgrade the third-party packages this notebook depends on.
# The `%pip` magic (run_line_magic('pip', ...)) installs into the environment of
# the running kernel; a shelled-out `pip` may belong to a different interpreter.
get_ipython().run_line_magic('pip', 'install nltk')
get_ipython().run_line_magic('pip', 'install --upgrade tensorflow_hub')
get_ipython().run_line_magic('pip', 'install --upgrade pydot')
get_ipython().run_line_magic('pip', 'install --upgrade graphviz')
get_ipython().run_line_magic('pip', 'install --upgrade matplotlib')

In [None]:


import os
import pandas as pd
from IPython.display import clear_output

import matplotlib.pyplot as plt
import matplotlib.image as mpimg


import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
import tensorflow.keras as keras
import pickle

import cv2
import scipy

import numpy as np

import tensorflow_hub as hub
import nltk



# HYPERPARAMETERS
BATCH_SIZE = 64
EPOCHS = 20

# Size every image is resized to before it enters the network.
new_shape = (150, 150)

# Length each tokenised text sequence is padded to.
max_words = 15

# Class index -> label name, and the inverse lookup derived from it.
labels_dict = {
    0: 'missing',
    1: 'not-easy-fraud',
    2: 'replaced',
}
rev_labels_dict = {name: idx for idx, name in labels_dict.items()}

# Need to revise these values after imbalanced augmentation
num_records = {
    'missing': 1000,
    'not-easy-fraud': 30000,
    'replaced': 350,
}

# Number of samples produced per original record, per class.
num_augmentations = {
    'missing': 5,
    'not-easy-fraud': 3,
    'replaced': 10,
}

# Per-class sample count once every record has been expanded.
num_records_after_augmentation = {
    name: num_records[name] * num_augmentations[name]
    for name in num_records
}

# Majority class (not fraud)
records = num_records_after_augmentation['not-easy-fraud']
total_records = sum(num_records_after_augmentation.values())
missing_records = num_records_after_augmentation['missing']
replaced_records = num_records_after_augmentation['replaced']

for _count in (records, total_records, missing_records, replaced_records):
    print(_count)

# Minority classes are weighted by (majority count / class count); the majority
# class keeps weight 1.
class_weight = {
    rev_labels_dict['missing']: records / missing_records,
    rev_labels_dict['not-easy-fraud']: 1,
    rev_labels_dict['replaced']: records / replaced_records,
}
print(class_weight)

In [None]:


# !pip install --upgrade tensorflow_hub
# !pip install --upgrade pydot
# !pip install --upgrade graphviz

In [None]:





# # Data loading and Analysis

# ## Download the files
# <a id='cell:data_description'></a>
# Please download the data from the following [WorkDocs link](https://amazon.awsapps.com/workdocs/index.html#/folder/4aefea54a65e7377cd884e1a54f341dc62aec676d88ac31127f4d88e75d63f28). It includes the following files:
# - images.part_aa
# - images.part_ab
# - images.part_ac
# - images.part_ad
# - train_data.csv
# - test_features.csv
# 
# The first 4 files are part files which, when concatenated, yield a tarball containing the image files. `train_data.csv` contains the training data, while `test_features.csv` contains the test data (used for both the Public and Private Leaderboards) and thus **does not** include the labels.

# ## Download the Google Universal Sentence Encoder

In [None]:


#download the model to local so it can be used again and again
# !mkdir universalencoder
# Download the module, and uncompress it to the destination folder. 
# !curl -L "https://tfhub.dev/google/universal-sentence-encoder-large/3?tf-hub-format=compressed" | tar -zxvC universalencoder

In [None]:


# Let's see what kind of data we're dealing with.
# Load the training table and display its last 100 rows as the cell output.
train_data = pd.read_csv('WW Returns/train_data.csv')
train_data.tail(100)


# ## Getting the images
# The images are present in a tar file which has been split into part files for making the download/upload convenient. First make sure that you have all the 4 files mentioned [here](#cell:data_description). Run the following cell to stitch the part files together and get the images.

In [None]:


# Let's try to look at an image for a return!
# `InteractiveShell.magic` is deprecated; `run_line_magic` is its replacement.
# NOTE(review): `%pylab inline` star-imports numpy and pyplot into the notebook
# namespace; it is kept so that existing behaviour does not change.
get_ipython().run_line_magic('pylab', 'inline')

# Third row from the end of the training table.
index = -3
sample_row = train_data.iloc[index]
image_id, label = sample_row['image_id'], sample_row['label']
image_path = os.path.join('WW Returns/images', image_id)
image = mpimg.imread(image_path)
plt.figure(figsize = (5,5))
plt.imshow(image)
plt.title("Label: {}".format(label), fontsize=16)
plt.show()

In [None]:





# # Preprocessing
# 
# - Get sample batch and fit it on ImageDataGenerator for zca
# - Define train and validation set from pandas dataframe

In [None]:



# Pre-trained Xception weights require the input to be scaled
# from the (0, 255) range to (-1., +1.); the function below does that rescaling.
def preprocessing_fun(image):
    """Rescale pixel values from the [0, 255] range to [-1, 1].

    Args:
        image: array-like of pixel values. The input is copied and never
            modified.

    Returns:
        A new numpy array holding ``image / 127.5 - 1``. Floating-point input
        keeps its dtype; any other dtype (e.g. uint8 images or plain lists of
        ints) is promoted to float32.
    """
    img = np.array(image)
    # In-place true division is rejected by numpy for integer arrays, so make
    # sure we are working on a floating-point copy first.
    if not np.issubdtype(img.dtype, np.floating):
        img = img.astype(np.float32)
    img /= 127.5
    img -= 1
    return img


# Random augmentation settings; every generated image is finally passed through
# `preprocessing_fun`.
_augmentation_options = dict(
    # zca_whitening=True,
    rotation_range=90,
    width_shift_range=0.2,
    height_shift_range=0.2,
    preprocessing_function=preprocessing_fun,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
    brightness_range=[0.2, 1.0],
    fill_mode='nearest',
)
datagen = ImageDataGenerator(**_augmentation_options)


# Fitting on a sample batch is only needed when zca_whitening is enabled; it is commented out above, so this block is disabled too.
# train_sample = datagen.flow_from_dataframe(dataframe=train_data[0:5], 
#                                               directory='WW Returns/images/',
#                                               x_col="image_id", 
#                                               y_col="label", 
#                                               class_mode="categorical",
#                                                 classes=['missing', 'not-easy-fraud', 'replaced'],
#                                               target_size=(150, 150), 
#                                         color_mode='rgb',
#                                               batch_size=5)
# for batch in train_sample:
#     datagen.fit(batch[0])
#     break
    
    

# Persist the configured generator to 'datagen.pb' ...
with open('datagen.pb', 'wb') as datagen_file:
    pickle.dump(datagen, datagen_file)
    

# ... and immediately read it back, rebinding `datagen` to the unpickled copy.
# NOTE(review): pickle.load can execute arbitrary code; only load files this
# notebook wrote itself. The round-trip looks redundant within a single run —
# confirm whether 'datagen.pb' is consumed elsewhere.
with open('datagen.pb', 'rb') as datagen_file:
    datagen = pickle.load(datagen_file)






# ### Text Preprocessing

In [None]:


# There will be 3 text inputs
# 1. gl_product: Here we will split by ('_') and remove numbers eg. gl_product_9_digital and remove the first 'gl'
# 2. cat_desc: Here - "1000 Point_&_Shoot", "1400 Health & Wellness (121)" we will remove number and & and (121) and _ and /
# 3. subcat_desc: Same as above and DELETED and remove any word that contains numbers

In [None]:


import re

# Load the Universal Sentence Encoder from TF-Hub. Within this file `embed` is
# only referenced by `embedWords`, whose single call site is commented out.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")
# Corpora needed by remove_stopwords() and lemmatizing() below.
nltk.download('stopwords')
nltk.download('wordnet')

def split(string):
    delimiters = ",", "_", "/", "&", " "
    regexPattern = '|'.join(map(re.escape, delimiters))
    return re.split(regexPattern, string)

def remove(strList):
    """Keep purely alphabetic tokens, lower-cased, dropping 'DELETED' and 'gl'.

    Tokens containing digits, punctuation or nothing at all (e.g. '(121)',
    '9', '') fail `str.isalpha` and are discarded. The 'DELETED' / 'gl'
    comparison is case-sensitive and happens before lower-casing.
    """
    excluded = ('DELETED', 'gl')
    kept = []
    for token in strList:
        if not token.isalpha():
            continue
        if token in excluded:
            continue
        kept.append(token.lower())
    return kept

def embedWords(listString):
    """Return the result of calling the module-level `embed` model on `listString`."""
    # Either pass one word: "Computer" or sentence "I am a computer"
    # NOTE(review): TF-Hub sentence encoders usually expect a list/batch of
    # strings rather than a bare string — confirm against the model's docs.
    return embed(listString)

def remove_stopwords(tokenized_list):
    """Return the tokens of `tokenized_list` that are not English stopwords.

    Args:
        tokenized_list: iterable of word strings.

    Returns:
        A new list with the same order, minus NLTK's English stopwords.
    """
    # A set gives constant-time membership tests; the original scanned the
    # whole stopword list once per token.
    stopword = set(nltk.corpus.stopwords.words('english'))
    return [word for word in tokenized_list if word not in stopword]

def stemming(tokenized_text):
    """Return a list with the Porter stem of every token in `tokenized_text`."""
    stemmer = nltk.PorterStemmer()
    return list(map(stemmer.stem, tokenized_text))

def lemmatizing(tokenized_text):
    """Return a list with the WordNet lemma of every token in `tokenized_text`."""
    lemmatizer = nltk.WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, tokenized_text))



def process_text(row):
    """Turn the three description fields of `row` into a cleaned token list.

    The fields 'gl_product_group_desc', 'cat_desc' and 'subcat_desc' are each
    split on delimiters and filtered by `remove`, concatenated in that order,
    then stripped of stopwords, stemmed and lemmatized.

    Args:
        row: mapping / pandas Series providing the three columns above.

    Returns:
        List of processed word strings (possibly empty).

    Raises:
        KeyError: if one of the three columns is absent from `row`.
    """
    # NOTE(review): tokens are stemmed before they are looked up in GloVe later
    # in the notebook; stems may not exist in the GloVe vocabulary — confirm
    # this is intended.
    tokens = []
    for column in ('gl_product_group_desc', 'cat_desc', 'subcat_desc'):
        value = row[column]
        # An empty CSV cell is read by pandas as float NaN, which re.split
        # rejects with a TypeError; a non-string field contributes no tokens.
        if not isinstance(value, str):
            continue
        tokens.extend(remove(split(value)))

    words = remove_stopwords(tokens)
    words = stemming(words)
    words = lemmatizing(words)
    return words

In [None]:





# ### Get total list of words in train and test

In [None]:


# `wordList` is the set of unique tokens and `word_samples` the per-row token
# lists, gathered over the train rows first and the test rows second.
wordList = set()
word_samples = []


def _collect_tokens(frame):
    """Tokenise every row of `frame` into `word_samples` and `wordList`."""
    for _, row in frame.iterrows():
        row_tokens = process_text(row)
        word_samples.append(row_tokens)
        wordList.update(row_tokens)


# Train
_collect_tokens(train_data)

# Test
test_features = pd.read_csv('WW Returns/test_features.csv')
_collect_tokens(test_features)

print(len(wordList))
    
    



In [None]:


# !curl http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip --output glove.6B.zip
# !unzip glove.6B.zip


# ## Initialise glove

In [None]:


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Number of distinct tokens seen in train + test.
# NOTE(review): this value is later used as the Tokenizer's `num_words` and as
# the embedding input dimension. Keras' Tokenizer numbers words from 1 and keeps
# only indices below `num_words`, so the least frequent word appears to be
# dropped. `len(wordList) + 1` would keep it, but that changes the embedding
# shape and is incompatible with weights saved from earlier runs — confirm
# before changing.
vocabulary_size = len(wordList)


# Map every GloVe token to its 100-d vector.
embeddings_index = {}
# The GloVe text files are UTF-8; pass the encoding explicitly so parsing does
# not depend on the platform default, and let `with` close the file on errors.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:


# Fit the tokenizer on one space-joined string per row of train + test.
tokenizer = Tokenizer(num_words=vocabulary_size)
texts = [' '.join(sample_tokens) for sample_tokens in word_samples]
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

In [None]:


# word_index

In [None]:


# word_data.shape

In [None]:


embedding_dim = 100

# Row i holds the GloVe vector of the word whose tokenizer index is i.
# Rows for words missing from GloVe (and any unused row) stay all-zeros.
embedding_matrix = np.zeros((vocabulary_size, embedding_dim))
for token, row_idx in word_index.items():
    if row_idx >= vocabulary_size:
        # Index falls outside the matrix; nothing to store.
        continue
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row_idx] = glove_vector

In [None]:


# embedding_matrix.shape

In [None]:





# ## Shuffle data

In [None]:


# Shuffle the rows of the training table ...
train_data = train_data.sample(frac=1).reset_index(drop=True)
# ... then replace it with the contents of 'dtrain.csv'.
# NOTE(review): the shuffled frame above is discarded by this assignment.
# Presumably 'dtrain.csv' is a previously saved shuffle — confirm, and confirm
# it has the same rows as the table the vocabulary was built from.
train_data = pd.read_csv('dtrain.csv')

In [None]:





# ### Data Generators: Option 1

In [None]:


def _flow_from(frame):
    """Return an augmenting image iterator over the rows of `frame`."""
    return datagen.flow_from_dataframe(
        dataframe=frame,
        directory='WW Returns/images/',
        x_col="image_id",
        y_col="label",
        class_mode="categorical",
        # One-hot positions follow this list (which is in alphabetical order).
        classes=['missing', 'not-easy-fraud', 'replaced'],
        target_size=(150, 150),
        color_mode='rgb',
        batch_size=256,
    )


# Everything except the last 1000 rows is used for training ...
train_gen = _flow_from(train_data[:-1000])

# ... and the last 1000 rows for validation.
val_gen = _flow_from(train_data[-1000:])


# ### Data Generators: Option 2

In [None]:


class DataGenerator(keras.utils.Sequence):
    """Keras Sequence yielding augmented ([images, padded text], one-hot label) batches.

    Each batch consumes `batch_size` consecutive rows of the dataframe. Every
    row is expanded into several samples (two copies of the rescaled original
    image plus randomly augmented variants drawn from the module-level
    `datagen`), so the arrays returned for a batch hold more than `batch_size`
    samples.
    """

    def __init__(self, dataframe, batch_size=32):
        """Store the source dataframe and the number of rows consumed per batch."""
        self.batch_size = batch_size
        self.dataframe = dataframe
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch; a trailing partial batch is dropped."""
        return int(np.floor(len(self.dataframe) / self.batch_size))

    def __getitem__(self, index):
        """Build batch number `index` from the corresponding positional row slice."""
        start = index * self.batch_size
        dataframe_batch = self.dataframe[start:start + self.batch_size]
        X, y = self.__data_generation(dataframe_batch)
        return X, y

    def on_epoch_end(self):
        """Reset `self.indexes` to 0..len-1 (no shuffling is performed)."""
        self.indexes = np.arange(len(self.dataframe))

    def __data_generation(self, dataframe_batch):
        """Expand the rows of `dataframe_batch` into model inputs and labels.

        Returns:
            ([image array, padded token-id array], one-hot labels over 3 classes).

        Raises:
            OSError: if an image file cannot be read.
        """
        X_images = []
        X_text = []
        Y = []
        for index, data in dataframe_batch.iterrows():
            label = data['label']
            y = rev_labels_dict[label]
            image_id = data['image_id']

            # Text branch: tokens -> ids -> fixed-length, post-padded sequence.
            x_text = ' '.join(process_text(data))
            sequence = tokenizer.texts_to_sequences([x_text])
            x_text_padded = pad_sequences(sequence, maxlen=max_words, padding='post')[0]

            image_path = os.path.join('WW Returns', 'images', image_id)
            # NOTE(review): cv2.imread yields BGR channel order; test_full reads
            # images the same way, so training and inference agree — confirm
            # this is intended for the ImageNet-pretrained backbone.
            image = cv2.imread(image_path)
            if image is None:
                # cv2.imread signals a missing/unreadable file by returning
                # None; fail with the offending path instead of an opaque
                # AttributeError on `.astype`.
                raise OSError("Could not read image file: " + image_path)
            image = cv2.resize(image.astype(np.float32), new_shape)
            preprocessed_image = preprocessing_fun(image)

            # The un-augmented image is added twice.
            for _ in range(2):
                X_images.append(preprocessed_image)
                Y.append(y)
                X_text.append(x_text_padded)

            # Then random augmentations: num_augmentations[label] - 2 of them,
            # but always at least one (the loop appends before it checks).
            num_of_aug = 1
            MAX_NUM_OF_AUG = num_augmentations[label] - 2
            for batch in datagen.flow(np.expand_dims(image, axis=0)):
                aug_image = batch[0]

                X_images.append(aug_image)
                Y.append(y)
                X_text.append(x_text_padded)

                if num_of_aug >= MAX_NUM_OF_AUG:
                    break
                num_of_aug = num_of_aug+1

        return [np.array(X_images), np.array(X_text)], keras.utils.to_categorical(Y, num_classes=3)
    


    


In [None]:





# # Define Model

In [None]:



# Image branch: frozen ImageNet-pretrained Xception -> global average pooling
# -> dropout -> small dense layer.
# NOTE: later cells address a layer as `model.layers[4]`, so the order in which
# layers are created here matters; do not reorder these statements casually.
base_model = keras.applications.Xception(
    weights="imagenet",  # Load weights pre-trained on ImageNet.
    input_shape=(150, 150, 3),
    include_top=False,
)  # Do not include the ImageNet classifier at the top.

# Freeze the base_model
base_model.trainable = False

# Create new model on top
image_input = keras.Input(shape=(150, 150, 3), name='image_input')


# The base model contains batchnorm layers. We want to keep them in inference mode
# when we unfreeze the base model for fine-tuning, so we make sure that the
# base_model is running in inference mode here.
base_cnn_with_input = base_model(image_input, training=False)
cnn_gavg = keras.layers.GlobalAveragePooling2D()(base_cnn_with_input)
cnn_drop = keras.layers.Dropout(0.2)(cnn_gavg)  # Regularize with dropout
# cnn_dense_32 = keras.layers.Dense(32, activation='relu')(cnn_drop)
# Despite the variable name, this dense layer has 8 units.
cnn_dense_16 = keras.layers.Dense(8, activation='relu')(cnn_drop)

In [None]:


# Text branch: token ids -> embedding -> LSTM -> small dense layer.
# The input takes sequences of token ids (variable length per shape=(None,)).
word_embeddings = keras.Input(shape=(None,), name='word_embeddings')
# Embedding sized to match `embedding_matrix`; no weights are passed here —
# a later cell copies the GloVe matrix into this layer and freezes it.
embedded_tag = keras.layers.Embedding(vocabulary_size, embedding_dim, input_length=max_words)(word_embeddings)

encoded_tag = keras.layers.LSTM(128)(embedded_tag)
# x_text_layer = keras.layers.Dense(32, activation='relu')(encoded_tag)
x_text_layer = keras.layers.Dense(8, activation='relu')(encoded_tag)

In [None]:


# Fuse the 8-unit image features with the 8-unit text features.
concatenated = keras.layers.concatenate([cnn_dense_16, x_text_layer], axis=-1)
# concatenated = keras.layers.Dense(16, activation='relu')(concatenated)

# Three-way softmax over the classes listed in `labels_dict`.
outputs = keras.layers.Dense(3, activation='softmax')(concatenated)
model = keras.Model([image_input, word_embeddings] , outputs)

model.summary()

# Render the architecture diagram as the cell output.
tf.keras.utils.plot_model(model)

In [None]:


# Locate the Embedding layer by type rather than by a hard-coded position in
# `model.layers`, which silently breaks if the architecture above is edited.
embedding_layer = next(
    layer for layer in model.layers if isinstance(layer, keras.layers.Embedding)
)
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False # freeze GloVe word embedding

In [None]:





# # Train and Evaluate

In [None]:


# Save JSON config to disk

def save_model(iteration=None, model=model):
    """Write the model architecture and weights to disk.

    The architecture goes to 'model_config.json' and the weights to
    'path_to_my_weights.h5' (the file `load_model` reads). When `iteration` is
    given, the weights are additionally saved to
    'path_to_my_weights_<iteration>.h5' as a per-iteration checkpoint.

    Args:
        iteration: optional string tag for the extra checkpoint; a falsy value
            skips it.
        model: model to save. The default is the module-level `model` as it
            was when this function was defined.

    Returns:
        The same `model` object.
    """
    # Save JSON config to disk
    json_config = model.to_json()
    with open('model_config.json', 'w') as json_file:
        json_file.write(json_config)

    # Save weights to disk: always refresh the "latest" file ...
    model.save_weights('path_to_my_weights.h5')

    # ... and store a real checkpoint per iteration. Previously this path was
    # only created as an empty file and never received any weights.
    if iteration:
        model.save_weights('path_to_my_weights_' + iteration + '.h5')

    return model

In [None]:


def load_model():
    """Rebuild the model from 'model_config.json' and load 'path_to_my_weights.h5'.

    No compile call is made here; callers compile the returned model themselves.
    """
    config_path = 'model_config.json'
    weights_path = 'path_to_my_weights.h5'

    with open(config_path) as config_file:
        architecture = config_file.read()

    restored = keras.models.model_from_json(architecture)
    restored.load_weights(weights_path)
    return restored


# # Test

In [None]:


#%%capture test

def test_full(model):
    """Predict a label for every test row and write them to a fresh CSV.

    For each row of 'WW Returns/test_features.csv' the image is scored as three
    copies of the rescaled original plus NUM_OF_TEST_AUGMENTATIONS random
    augmentations; the class probabilities are summed and the arg-max label is
    stored. Results are written to 'Results/predictions_<random tag>.csv',
    re-saved after every row so progress survives an interruption.

    Args:
        model: model exposing `predict([images, texts])`.

    Raises:
        OSError: if an image file cannot be read.
    """
    import string
    import random

    # Random 7-character tag so successive runs do not overwrite each other.
    N = 7
    res = ''.join(random.choices(string.ascii_uppercase +
                                 string.digits, k = N))
    file_path = 'predictions_' + res + '.csv'

    # Let's look at the test data we need to make predictions for
    test_features = pd.read_csv('WW Returns/test_features.csv')

    # 'Results/predictions.csv' is used as the template for the output; its
    # IDs are replaced and its labels blanked before predicting.
    result_df = pd.read_csv('Results/predictions.csv')
    result_df['ID'] = test_features['ID']
    result_df['label'] = ''

    NUM_OF_TEST_AUGMENTATIONS = 10

    for index, data in test_features.iterrows():
        # Skip rows that already carry a label. All labels were just blanked,
        # so within a single call this never triggers.
        if (result_df.at[index,'label'] != ''):
            continue

        image_id = data['image_id']
        test_id = data['ID']

        print("Index: " + str(index))
        print("Image_id: " + str(image_id))
        print("ID: " + str(test_id))

        x_text = process_text(data)
        x_text = ' '.join(x_text)
        sequence = tokenizer.texts_to_sequences([x_text])
        x_text_padded = pad_sequences(sequence, maxlen=max_words, padding='post')[0]

        image_path = os.path.join('WW Returns', 'images', image_id)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None for a missing/unreadable file; report the
            # path instead of failing with an AttributeError on `.astype`.
            raise OSError("Could not read image file: " + image_path)
        image = cv2.resize(image.astype(np.float32), new_shape)

        # The un-augmented image is given three votes.
        preprocessed_image = preprocessing_fun(image)
        img_arr = [preprocessed_image, preprocessed_image, preprocessed_image]
        txt_arr = [x_text_padded, x_text_padded, x_text_padded]

        num = 1
        for batch in datagen.flow(np.expand_dims(image, axis=0)):
            img_arr.append(batch[0])
            txt_arr.append(x_text_padded)

            if num >= NUM_OF_TEST_AUGMENTATIONS:
                break
            num = num+1

        img_arr = np.array(img_arr)
        txt_arr = np.array(txt_arr)
        predictions = model.predict([img_arr, txt_arr])

        # Sum the per-sample class probabilities and take the winning class.
        predictions = np.sum(predictions, axis=0)
        print(predictions)

        label = labels_dict[np.argmax(predictions)]
        print(label)
        result_df.at[index,'label']=label
        print("\n\n")
        result_df.to_csv(os.path.join('Results', file_path), index = False)
        clear_output(wait=True)

In [None]:


# %%capture train
# Overrides the BATCH_SIZE set in the configuration cell: here it is the number
# of dataframe rows each DataGenerator batch consumes.
BATCH_SIZE=150

# Generators
# These rebind `train_gen` / `val_gen` from the "Option 1" cell: all rows but
# the last 1000 are used for training, the last 1000 for validation.
train_gen = DataGenerator(train_data[:-1000], batch_size=BATCH_SIZE)
val_gen = DataGenerator(train_data[-1000:], batch_size=BATCH_SIZE)


def fine_tune(model):
    """Run a short extra training phase, then fine-tune with everything unfrozen.

    Steps:
      1. Fit 2 epochs on `val_gen` using the compilation the caller set up.
      2. Unfreeze the nested backbone model and the embedding layer, recompile
         with Adam(1e-5) and fit 2 epochs on `train_gen`.
      3. Freeze both again.

    Args:
        model: a compiled model built in this notebook or returned by
            `load_model()`.

    Returns:
        The same `model` object.
    """
    # Resolve the layers from the model being tuned. The module-level
    # `base_model` is a different object from the backbone inside a model
    # rebuilt by load_model(), so toggling the global left such models frozen.
    backbone = next(
        layer for layer in model.layers if isinstance(layer, keras.Model)
    )
    embedding_layer = next(
        layer for layer in model.layers if isinstance(layer, keras.layers.Embedding)
    )

    # NOTE(review): this phase trains on the validation split. It looks
    # deliberate (callers predict the test set afterwards and then reload the
    # saved weights) — confirm.
    model.fit(val_gen, epochs=2, validation_data=val_gen, class_weight=class_weight)

    # Fine tuning
    # Unfreeze the base model
    backbone.trainable = True
    embedding_layer.trainable = True
    # Recompile so the trainable changes take effect; low learning rate.
    model.compile(
        optimizer=keras.optimizers.Adam(1e-5),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    model.fit(train_gen, epochs=2, validation_data=val_gen, class_weight=class_weight)

    backbone.trainable = False
    embedding_layer.trainable = False # freeze GloVe word embedding
    return model

def _compile_for_training(model):
    """Compile `model` with default Adam, categorical cross-entropy and accuracy."""
    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )


def train(model, resume=False):
    """Train, checkpoint, fine-tune and predict in 10 rounds, then evaluate.

    Each round fits 5 single epochs on `train_gen` (saving after each), then
    fine-tunes, writes test predictions with `test_full`, and reloads the last
    saved weights so the fine-tuned state is discarded.

    Args:
        model: model to start from (ignored when `resume` is true).
        resume: when true, start from the model on disk and first run one
            fine-tune / predict pass on it.
    """
    if resume:
        model = load_model()
        _compile_for_training(model)
        model = fine_tune(model=model)
        test_full(model=model)
        model = load_model()

    for i in range(10):
        _compile_for_training(model)
        for ep in range(5):
            model.fit(train_gen, epochs=1, validation_data=val_gen, class_weight=class_weight)
            model = save_model(iteration=str(i)+'-'+str(ep), model=model)

        model = fine_tune(model=model)
        test_full(model=model)
        model = load_model()

    # load_model() returns an uncompiled model, and evaluate() requires a
    # compiled one, so compile before the final evaluation.
    _compile_for_training(model)
    loss, acc = model.evaluate(val_gen)  # returns loss and metrics
    print("loss: %.2f" % loss)
    print("acc: %.2f" % acc)

In [None]:


# Start training in resume mode: the model is first rebuilt from
# 'model_config.json' / 'path_to_my_weights.h5', so both files must exist.
train(model, True)