<a href="https://colab.research.google.com/github/codx-aks/TriNitt-ML/blob/main/TRINITT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import time
from textwrap import wrap
import pandas as pd
import matplotlib.pylab as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import cv2
import matplotlib.pyplot as plt
from tensorflow.keras import Input
from tensorflow.keras.layers import (
    GRU,
    Add,
    AdditiveAttention,
    Attention,
    Concatenate,
    Dense,
    Embedding,
    LayerNormalization,
    Reshape,
    StringLookup,
    TextVectorization,
)

print(tf.version.VERSION)

In [None]:
# Change these to control the accuracy/speed
VOCAB_SIZE = 10000  # use fewer words to speed up convergence
ATTENTION_DIM = 512  # size of dense layer in Attention
WORD_EMBEDDING_DIM = 128

# InceptionResNetV2 takes (224, 224, 3) image as inputs
# and return features in (5, 5, 1536) shape
FEATURE_EXTRACTOR = tf.keras.applications.inception_resnet_v2.InceptionResNetV2(
    include_top=False, weights="imagenet"
)
IMG_HEIGHT = 224
IMG_WIDTH = 224
IMG_CHANNELS = 3
FEATURES_SHAPE = (5, 5, 1536)

In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import tensorflow as tf
from PIL import Image
import numpy as np
import ast
import io

BUFFER_SIZE = 1000

def decode_and_resize_image(image_bytes):
    img = tf.image.decode_jpeg(image_bytes, channels=3)
    img = tf.image.resize(img, [IMG_WIDTH, IMG_HEIGHT])
    # Normalize pixel values
    img = img / 255.0
    return img

def get_image_label(filename, caption, img):
    caption_list = [caption] if not isinstance(caption, list) else caption
    first_caption = caption_list[0]
    print(len(caption_list))
    return {"image_tensor": img, "caption": first_caption}


# Define mapping function
def map_fn(filename, caption, image_bytes):
    # Decode and resize image
    print("Type of image_bytes:", type(image_bytes))
    print("insideeee")
    img = decode_and_resize_image(image_bytes)
    return get_image_label(filename, caption, img)









In [None]:
# Read dataset
dataset_path = "/content/drive/My Drive/archive/train.csv"
df = pd.read_csv(dataset_path)

# Parse the stringified dictionaries in the "image" column
df['image'] = df['image'].apply(ast.literal_eval)

# Define a function to extract the image bytes from the nested dictionary
def extract_image_bytes(image):
    return image.get("bytes", None)

# Extract image bytes from the "image" column
df["image_bytes"] = df["image"].apply(extract_image_bytes)

print("Data type of image_bytes:", df["image_bytes"].dtype)
print("Shape of image_bytes:", df["image_bytes"].shape)
print("Head of image_bytes:")
print(df["image_bytes"].head())

# Create dataset
trainds = tf.data.Dataset.from_tensor_slices((df["filename"], df["captions"], df["image_bytes"]))
# Apply mapping function to the dataset
trainds = trainds.map(map_fn, num_parallel_calls=tf.data.AUTOTUNE).shuffle(BUFFER_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)


In [None]:

# Read dataset
dataset_path_test = "/content/drive/My Drive/archive/test.csv"
dftest = pd.read_csv(dataset_path_test)

# Parse the stringified dictionaries in the "image" column
dftest['image'] = dftest['image'].apply(ast.literal_eval)

# Extract image bytes from the "image" column
dftest["image_bytes"] = dftest["image"].apply(extract_image_bytes)

print("Data type of image_bytes:", dftest["image_bytes"].dtype)
print("Shape of image_bytes:", dftest["image_bytes"].shape)
print("Head of image_bytes:")
print(dftest["image_bytes"].head())

# Create dataset
testds = tf.data.Dataset.from_tensor_slices((dftest["filename"], dftest["captions"], dftest["image_bytes"]))

# Apply mapping function to the dataset
testds = testds.map(map_fn, num_parallel_calls=tf.data.AUTOTUNE).shuffle(BUFFER_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:

# Read dataset
dataset_path_val = "/content/drive/My Drive/archive/valid.csv"
dfval = pd.read_csv(dataset_path_val)

# Parse the stringified dictionaries in the "image" column
dfval['image'] = dfval['image'].apply(ast.literal_eval)

# Extract image bytes from the "image" column
dfval["image_bytes"] = dfval["image"].apply(extract_image_bytes)

print("Data type of image_bytes:", dfval["image_bytes"].dtype)
print("Shape of image_bytes:", dfval["image_bytes"].shape)
print("Head of image_bytes:")
print(dfval["image_bytes"].head())

# Create dataset
valds = tf.data.Dataset.from_tensor_slices((dfval["filename"], dfval["captions"], dfval["image_bytes"]))

# Apply mapping function to the dataset
valds = valds.map(map_fn, num_parallel_calls=tf.data.AUTOTUNE).shuffle(BUFFER_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:

import matplotlib.pyplot as plt

# Fetch data example
for data in valds.take(4):
    byte_array = data['image_tensor']
    # Convert to NumPy array using TensorFlow's NumPy compatibility feature
    byte_array_np = tf.experimental.numpy.asarray(byte_array)
    # Convert back to regular NumPy array for image processing
    byte_array_np = np.array(byte_array_np)
    image = (byte_array_np * 255).astype(np.uint8)

    # Extract the caption
    caption = data['caption'].numpy().decode('utf-8')  # Decode bytes to string

    # Display the image and caption
    plt.imshow(image)
    plt.title(caption)
    plt.axis('off')  # Turn off axis
    plt.show()


In [None]:
def add_start_end_token(data):
    start = tf.convert_to_tensor("<start>")
    end = tf.convert_to_tensor("<end>")
    data["caption"] = tf.strings.join(
        [start, data["caption"], end], separator=" "
    )
    return data


trainds = trainds.map(add_start_end_token)
testds = testds.map(add_start_end_token)
valds = valds.map(add_start_end_token)

In [None]:
MAX_CAPTION_LEN = 128


# We will override the default standardization of TextVectorization to preserve
# "<>" characters, so we preserve the tokens for the <start> and <end>.
def standardize(inputs):
    inputs = tf.strings.lower(inputs)
    return tf.strings.regex_replace(
        inputs, r"[!\"#$%&\(\)\*\+.,-/:;=?@\[\\\]^_`{|}~]?", ""
    )


# Choose the most frequent words from the vocabulary & remove punctuation etc.
tokenizer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize=standardize,
    output_sequence_length=MAX_CAPTION_LEN,
)

tokenizer.adapt(trainds.map(lambda x: x["caption"]))
tokenizer.adapt(valds.map(lambda x: x["caption"]))
tokenizer.adapt(testds.map(lambda x: x["caption"]))

In [None]:
tokenizer(["<start> This is a sentence <end>"])

In [None]:
sample_captions = []
for d in trainds.take(5):
    sample_captions.append(d["caption"].numpy())

In [None]:
for wordid in tokenizer([sample_captions[0]])[0]:
    print(tokenizer.get_vocabulary()[wordid], end=" ")

In [None]:
# Lookup table: Word -> Index
word_to_index = StringLookup(
    mask_token="", vocabulary=tokenizer.get_vocabulary()
)

# Lookup table: Index -> Word
index_to_word = StringLookup(
    mask_token="", vocabulary=tokenizer.get_vocabulary(), invert=True
)

In [None]:
BATCH_SIZE = 256


def create_ds_fn(data):
    img_tensor = data["image_tensor"]
    caption = tokenizer(data["caption"])

    target = tf.roll(caption, -1, 0)
    zeros = tf.zeros([1], dtype=tf.int64)
    target = tf.concat((target[:-1], zeros), axis=-1)
    return (img_tensor, caption), target


batched_ds = (
    trainds.map(create_ds_fn)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

batched_val_ds = (
    valds.map(create_ds_fn)
    .batch(128, drop_remainder=True)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

batched_test_ds = (
    testds.map(create_ds_fn)
    .batch(128, drop_remainder=True)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [None]:
print(len(batched_val_ds))
for (img, caption), label in batched_val_ds.take(2):
    print(f"Image shape: {img.shape}")
    print(f"Caption shape: {caption.shape}")
    print(f"Label shape: {label.shape}")
    print(caption[0])
    print(label[0])

In [None]:
FEATURE_EXTRACTOR.trainable = False

image_input = Input(shape=(IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS))
image_features = FEATURE_EXTRACTOR(image_input)

x = Reshape((FEATURES_SHAPE[0] * FEATURES_SHAPE[1], FEATURES_SHAPE[2]))(image_features)
encoder_output = Dense(ATTENTION_DIM, activation="relu")(x)

In [None]:
encoder = tf.keras.Model(inputs=image_input, outputs=encoder_output)
encoder.summary()

In [None]:
word_input = Input(shape=(MAX_CAPTION_LEN), name="words")
embed_x = Embedding(VOCAB_SIZE, ATTENTION_DIM)(word_input)

decoder_gru = GRU(
    ATTENTION_DIM,
    return_sequences=True,
    return_state=True,
)
gru_output, gru_state = decoder_gru(embed_x)

decoder_attention = Attention()
context_vector = decoder_attention([gru_output, encoder_output])

addition = Add()([gru_output, context_vector])

layer_norm = LayerNormalization(axis=-1)
layer_norm_out = layer_norm(addition)

decoder_output_dense = Dense(VOCAB_SIZE)
decoder_output = decoder_output_dense(layer_norm_out)


In [None]:
decoder = tf.keras.Model(
    inputs=[word_input, encoder_output], outputs=decoder_output
)
# tf.keras.utils.plot_model(decoder)

In [None]:
decoder.summary()

In [None]:
image_caption_train_model = tf.keras.Model(
    inputs=[image_input, word_input], outputs=decoder_output
)


In [None]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction="none"
)


def loss_function(real, pred):
    loss_ = loss_object(real, pred)

    # returns 1 to word index and 0 to padding (e.g. [1,1,1,1,1,0,0,0,0,...,0])
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    mask = tf.cast(mask, dtype=tf.int32)
    sentence_len = tf.reduce_sum(mask)
    loss_ = loss_[:sentence_len]

    return tf.reduce_mean(loss_, 1)

In [None]:
image_caption_train_model.compile(
    optimizer="adam",
    loss=loss_function,
     metrics=['accuracy']
)

In [None]:
!pip install sacrebleu

In [None]:
!pip install evaluate sacrebleu

In [None]:
# %%time
# history = image_caption_train_model.fit(batched_ds, epochs=1)
EPOCHS=1
# Train the model
history = image_caption_train_model.fit(batched_ds, epochs=EPOCHS, validation_data=batched_val_ds)



In [None]:
image_caption_train_model.save('/content/drive/My Drive/gru_model2.h5')

In [None]:
test_loss = image_caption_train_model.evaluate(batched_test_ds)

print("Test loss:", test_loss)



In [None]:
train_loss = history.history['loss']
val_loss = history.history['val_loss']
train_accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']

# Plot the training and validation accuracy
plt.figure(figsize=(10, 5))
plt.plot(train_accuracy, label='Training Accuracy', color='blue')
plt.plot(val_accuracy, label='Validation Accuracy', color='red')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Plot the training and validation loss
plt.figure(figsize=(10, 5))
plt.plot(train_loss, label='Training Loss', color='blue')
plt.plot(val_loss, label='Validation Loss', color='red')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [14]:
BATCH_SIZE = 256


def create_ds_fn(data):
    img_tensor = data["image_tensor"]
    caption = tokenizer(data["caption"])

    target = tf.roll(caption, -1, 0)
    zeros = tf.zeros([1], dtype=tf.int64)
    target = tf.concat((target[:-1], zeros), axis=-1)
    return (img_tensor, caption), target


batched_val_ds = (
    valds.map(create_ds_fn)
    .batch(128, drop_remainder=True)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

batched_test_ds = (
    testds.map(create_ds_fn)
    .batch(128, drop_remainder=True)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

NameError: in user code:

    File "<ipython-input-14-6eb0561b4ecf>", line 6, in create_ds_fn  *
        caption = tokenizer(data["caption"])

    NameError: name 'tokenizer' is not defined


In [13]:
import tensorflow as tf
import pandas as pd

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction="none"
)


def loss_function(real, pred):
    loss_ = loss_object(real, pred)

    # returns 1 to word index and 0 to padding (e.g. [1,1,1,1,1,0,0,0,0,...,0])
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    mask = tf.cast(mask, dtype=tf.int32)
    sentence_len = tf.reduce_sum(mask)
    loss_ = loss_[:sentence_len]

    return tf.reduce_mean(loss_, 1)

saved_model_path = '/content/drive/My Drive/gru_model.h5'
loaded_model = tf.keras.models.load_model(saved_model_path,custom_objects={'loss_function': loss_function})

# Assuming batched_val_ds is your validation dataset
# Make predictions on the validation dataset
predictions_val = loaded_model.predict(valds)

# Convert predictions to a DataFrame
predictions_dfval = pd.DataFrame(predictions_val)

# Save DataFrame to a CSV file
csv_filename_val = "/content/drive/My Drive/validation_predictions.csv"
predictions_dfval.to_csv(csv_filename_val, index=False)

print("Shape of validation predictions:", predictions_val.shape)
print("Predictions saved to:", csv_filename_val)

ValueError: in user code:

    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 2440, in predict_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 2425, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 2413, in run_step  **
        outputs = model.predict_step(data)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 2381, in predict_step
        return self(x, training=False)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/input_spec.py", line 197, in assert_input_compatibility
        raise ValueError(

    ValueError: Missing data for input "input_7". You passed a data dictionary with keys ['image_tensor', 'caption']. Expected the following keys: ['input_7', 'words']


In [None]:
predictions_test = image_caption_train_model.predict(batched_test_ds)

predictions_dftest = pd.DataFrame(predictions_test)

# Save DataFrame to a CSV file
csv_filename_test = "/content/drive/My Drive/test_predictions.csv"
predictions_dftest.to_csv(csv_filename_test, index=False)
print("Shape of test predictions:", predictions_test.shape)

In [88]:
import json

def tokenizer_config_to_json(tokenizer_config):
    serialized_config = {}
    for key, value in tokenizer_config.items():
        if not callable(value):  # Exclude functions from serialization
            if isinstance(value, dict):
                serialized_value = tokenizer_config_to_json(value)  # Recursive call for nested dictionaries
            else:
                serialized_value = value
            serialized_config[key] = serialized_value
    return serialized_config

file_path = "/content/drive/My Drive/tokenizer.json"
tokenizer_config_json = tokenizer_config_to_json(tokenizer.get_config())
with open(file_path, 'w') as json_file:
    json.dump(tokenizer_config_json, json_file)

In [95]:


with open(file_path, 'r') as json_file:
    loaded_tokenizer_config = json.load(json_file)
print(loaded_tokenizer_config)
def custom_standardization(inputs):
    inputs = tf.strings.lower(inputs)
    return tf.strings.regex_replace(
        inputs, r"[!\"#$%&\(\)\*\+.,-/:;=?@\[\\\]^_`{|}~]?", ""
    )
# Reconstruct tokenizer from configuration
tokenizer = TextVectorization.from_config(loaded_tokenizer_config)
tokenizer.standardize = custom_standardization

{'name': 'text_vectorization_1', 'trainable': True, 'dtype': 'string', 'batch_input_shape': [], 'max_tokens': 20000, 'split': 'whitespace', 'ngrams': None, 'output_mode': 'int', 'output_sequence_length': 128, 'pad_to_max_tokens': False, 'sparse': False, 'ragged': False, 'vocabulary': None, 'idf_weights': None, 'encoding': 'utf-8', 'vocabulary_size': 2185}


KeyError: 'standardize'

In [None]:
gru_state_input = Input(shape=(ATTENTION_DIM), name="gru_state_input")

# Reuse trained GRU, but update it so that it can receive states.
gru_output, gru_state = decoder_gru(embed_x, initial_state=gru_state_input)

# Reuse other layers as well
context_vector = decoder_attention([gru_output, encoder_output])
addition_output = Add()([gru_output, context_vector])
layer_norm_output = layer_norm(addition_output)

decoder_output = decoder_output_dense(layer_norm_output)

# Define prediction Model with state input and output
decoder_pred_model = tf.keras.Model(
    inputs=[word_input, gru_state_input, encoder_output],
    outputs=[decoder_output, gru_state],
)


In [None]:
from keras.preprocessing import image
import json
def decode_predictions(predictions):
    generated_captions = []
    with open('tokenizer.json', 'r') as json_file:
        tokenizer_json = json.load(json_file)
    tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenizer_json)
    for prediction in predictions:
        caption_words = []
        for token_id in prediction:
            word = tokenizer.index_word.get(token_id, '<UNK>')
            caption_words.append(word)
            if word == '<end>':
                break
        caption = ' '.join(caption_words[:-1])
        generated_captions.append(caption)

    return generated_captions






In [42]:
!pip install sacrebleu>=1.4.12

In [44]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [162]:
from nltk.translate.bleu_score import corpus_bleu

def compute_bleu_scores(predictions, references):
    references = [[ref.split()] for ref in references]
    predictions = [pred.split() for pred in predictions]
    bleu_score = corpus_bleu(references, predictions)
    return bleu_score

def evaluate_model(model, dataset):
    total_bleu_score = 0
    references_list = []
    predictions_list = []

    # Batch the dataset before iterating over it
    dataset = dataset.batch(BATCH_SIZE)

    for batch in dataset:
        image_tensors, captions = batch['image_tensor'], batch['caption']
        for image_tensor, caption in zip(image_tensors, captions):
            # Predict caption for the current image
            predicted_caption = predict_caption(tf.expand_dims(image_tensor, axis=0))

            # Add reference and prediction for BLEU score calculation
            references_list.append([caption.numpy().decode("utf-8")])
            predictions_list.append(predicted_caption)

    # Compute BLEU score
    bleu_score = compute_bleu_scores(predictions_list, references_list)

    return bleu_score


In [45]:
from google.colab import files
uploaded = files.upload()

Saving WhatsApp Image 2024-03-10 at 02.06.12.jpeg to WhatsApp Image 2024-03-10 at 02.06.12.jpeg


In [None]:
MINIMUM_SENTENCE_LENGTH = 5


## Probabilistic prediction using the trained model
def predict_caption(filename):
    gru_state = tf.zeros((1, ATTENTION_DIM))

    img = tf.image.decode_jpeg(tf.io.read_file(filename), channels=IMG_CHANNELS)
    img = tf.image.resize(img, (IMG_HEIGHT, IMG_WIDTH))
    img = img / 255

    features = encoder(tf.expand_dims(img, axis=0))
    dec_input = tf.expand_dims([word_to_index("<start>")], 1)
    result = []
    for i in range(MAX_CAPTION_LEN):
        predictions, gru_state = decoder_pred_model([dec_input, gru_state, features])

        # Get the index with maximum probability
        predicted_id = tf.argmax(predictions[0][0]).numpy()

        result.append(tokenizer.get_vocabulary()[predicted_id])

        if predicted_id == word_to_index("<end>"):
            break

        dec_input = tf.expand_dims([predicted_id], 1)

    return result

In [None]:
image_caption_train_model.summary()


In [None]:
filename = "/content/WhatsApp Image 2024-03-10 at 02.06.12.jpeg"
for i in range(5):
    image, caption = predict_caption(filename)
    print(" ".join(caption[:-1]) + ".")

img = tf.image.decode_jpeg(tf.io.read_file(filename), channels=IMG_CHANNELS)
plt.imshow(img)
plt.axis("off")




In [None]:
import tensorflow as tf
from keras.preprocessing import image
import numpy as np

# Load the saved model
saved_model_path = "/content/drive/My Drive/gru_model.h5"
image_caption_model = tf.keras.models.load_model(saved_model_path)

# Define a function to preprocess the image
def preprocess_image(img_path):
    img = image.load_img(img_path, target_size=(224, 224))
    img_array = image.img_to_array(img) / 255.0
    img_array = np.expand_dims(img_array, axis=0)
    return img_array

# Define a function to generate captions for the new image
def generate_caption(img_array):
    predictions = image_caption_model.predict(img_array)
    # Decode the predicted output to generate captions
    # Replace this with your own decoding logic
    captions = decode_predictions(predictions)
    return captions

# Example usage
img_path = "/content/WhatsApp Image 2024-03-10 at 02.06.12.jpeg"
img_array = preprocess_image(img_path)
captions = generate_caption(img_array)
print("Generated captions:", captions)
