In [114]:
import platform
print(platform.machine())


arm64


In [115]:
import tensorflow as tf



In [116]:
import numpy as np
import string

In [117]:
from PIL import Image
import os
from pickle import dump, load
import numpy as np
import time
import matplotlib.pyplot as pl

In [118]:
from keras.applications.xception import Xception, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical, get_file
from keras.layers import add
from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout


In [119]:
# small library for seeing the progress of loops.
from tqdm.notebook import tqdm
tqdm.pandas()



In [120]:
# Loading a text file into memory
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

In [121]:

# get all imgs with their captions
def all_img_captions(filename):
    file = load_doc(filename)
    captions = file.split('\n')
    descriptions ={}
    for caption in captions[:-1]:
        img, caption = caption.split('\t')
        if img[:-2] not in descriptions:
            descriptions[img[:-2]] = [ caption ]
        else:
            descriptions[img[:-2]].append(caption)
    return descriptions

In [122]:
#data cleaning
def cleaning_text(captions):
    table = str.maketrans('','',string.punctuation)
    for img,caps in captions.items():
        for i,img_caption in enumerate(caps):

            img_caption.replace("-"," ")
            desc = img_caption.split()

            #converts to lowercase
            desc = [word.lower() for word in desc]
            #remove punctuation from each token
            desc = [word.translate(table) for word in desc]
            #remove hanging 's and a
            desc = [word for word in desc if(len(word)>1)]
            #remove tokens with numbers in them
            desc = [word for word in desc if(word.isalpha())]
            #convert back to string

            img_caption = ' '.join(desc)
            captions[img][i]= img_caption
    return captions

In [123]:

def text_vocabulary(descriptions):
    # build vocabulary of all unique words
    vocab = set()

    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]

    return vocab

In [124]:
#All descriptions in one file
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + '\t' + desc )
    data = "\n".join(lines)
    file = open(filename,"w")
    file.write(data)
    file.close()


In [125]:
dataset_text = "Flickr8k_text 2"
dataset_images ="Flickr8k_Dataset"

In [126]:
import os

# Full file path
filename = os.path.join("Data", "Flickr8k_text 2", "Flickr8k.token.txt")

# Check if the file exists
if not os.path.exists(filename):
    print("❌ File not found at:", filename)
else:
    # Load and process the file
    descriptions = all_img_captions(filename)
    print("✅ Length of descriptions =", len(descriptions))


✅ Length of descriptions = 8092


In [127]:
clean_descriptions = cleaning_text(descriptions)


In [128]:
vocabulary = text_vocabulary(clean_descriptions)
print("Length of vocabulary = ", len(vocabulary))


Length of vocabulary =  8763


In [129]:
save_descriptions(clean_descriptions, "Image-Captioning/descriptions.txt")

In [130]:
import time
from tensorflow.keras.utils import get_file

def download_with_retry(url, filename, max_retries=3):
    for attempt in range(max_retries):
        try:
            return get_file(filename, url)
        except Exception as e:
            if attempt == max_retries - 1:
                raise e
            print(f"Download attempt {attempt + 1} failed. Retrying in 5 seconds...")
            time.sleep(5)

In [131]:
weights_url = "https://storage.googleapis.com/tensorflow/keras-applications/xception/xception_weights_tf_dim_ordering_tf_kernels_notop.h5"

In [132]:
weights_path = download_with_retry(weights_url, 'xception_weights.h5')
model = Xception(include_top=False, pooling='avg', weights=weights_path)


In [133]:
def extract_features(directory):
    features = {}
    valid_images = ['.jpg', '.jpeg', '.png']


In [134]:
from tqdm import tqdm
import os
from PIL import Image
import numpy as np

def extract_features(directory, model, valid_images={".jpg", ".jpeg", ".png"}):
    features = {}

    for img in tqdm(os.listdir(directory)):
        ext = os.path.splitext(img)[1].lower()
        if ext not in valid_images:
            continue

        filename = os.path.join(directory, img)
        image = Image.open(filename)
        image = image.resize((299, 299))
        image = np.expand_dims(image, axis=0)
        image = image / 127.5
        image = image - 1.0

        feature = model.predict(image)
        features[img] = feature

    return features


In [135]:
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.models import Model
from tensorflow.keras.applications.inception_v3 import preprocess_input

# Load pre-trained model (excluding top layers)
#base_model = InceptionV3(weights='imagenet')
#model = Model(inputs=base_model.input, outputs=base_model.layers[-2].output)


In [136]:
dataset_images = "Data/Flickr8k_Dataset/Flicker8k_Dataset"


In [137]:
import os

if not os.path.exists(dataset_images):
    raise FileNotFoundError(f"Folder does not exist: {dataset_images}")

In [138]:
from tensorflow.keras.applications.xception import preprocess_input
from tensorflow.keras.preprocessing import image
import numpy as np
import os
from tqdm import tqdm

def extract_features(directory, model, valid_images={".jpg", ".jpeg", ".png"}):
    features = {}

    for img_name in tqdm(os.listdir(directory)):
        ext = os.path.splitext(img_name)[1].lower()
        if ext not in valid_images:
            continue

        img_path = os.path.join(directory, img_name)
        try:
            img = image.load_img(img_path, target_size=(299, 299))  # For Xception
            img_array = image.img_to_array(img)
            img_array = np.expand_dims(img_array, axis=0)
            img_array = preprocess_input(img_array)

            feature = model.predict(img_array, verbose=0)
            features[img_name] = feature.flatten()
        except Exception as e:
            print(f"[ERROR] Skipping {img_name}: {e}")

    return features


In [139]:
dataset_images = "Data/Flickr8k_Dataset/Flicker8k_Dataset"

# Extract and save
features = extract_features(dataset_images, model)
dump(features, open("Data/features.p", "wb"))

# Load later
features = load(open("Data/features.p", "rb"))

100%|██████████| 8091/8091 [08:52<00:00, 15.20it/s]


In [140]:
def load_clean_descriptions(filepath, dataset):
    descriptions = {}
    with open(filepath, 'r') as file:
        for line in file:
            tokens = line.strip().split()
            if len(tokens) < 2:
                continue
            image_id, image_desc = tokens[0], tokens[1:]
            image_id = image_id.split('.')[0]  # Removes .jpg
            if image_id in dataset:
                if image_id not in descriptions:
                    descriptions[image_id] = []
                desc = ' '.join(image_desc)
                descriptions[image_id].append(desc)
    return descriptions


In [141]:
def load_photos(filename):
    file = load_doc(filename)
    photos = file.split("\n")[:-1]
    photos_present = [photo for photo in photos if os.path.exists(os.path.join(dataset_images, photo))]
    return photos_present


In [142]:

def load_features(photos):
    #loading all features
    all_features = load(open("features.p","rb"))
    #selecting only needed features
    features = {k:all_features[k] for k in photos}
    return features


In [143]:
def loading_data(filename):
    with open(filename, 'r') as file:
        text = file.read()
    # Split into lines and get only image IDs (removing extensions)
    images = text.strip().split('\n')
    image_ids = [img.split('.')[0] for img in images]
    return image_ids


In [144]:
import os

# 1. Use correct path (update based on actual folder name)
filename = "Data/Flickr8k_text 2/Flickr_8k.trainImages.txt"

# 2. Ensure the file exists
if not os.path.exists(filename):
    raise FileNotFoundError(f"Missing file: {filename}")

# 3. Load training image IDs
def loading_data(filename):
    with open(filename, 'r') as file:
        text = file.read()
    return [line.strip().split('.')[0] for line in text.strip().split('\n')]

train = loading_data(filename)

# 4. Load photos by IDs
def load_photos(filename):
    with open(filename, 'r') as file:
        text = file.read()
    return [line.strip().split('.')[0] for line in text.strip().split('\n')]

train_imgs = load_photos(filename)

# 5. Load cleaned descriptions
def load_clean_descriptions(filepath, dataset):
    with open(filepath, 'r') as file:
        descriptions = {}
        for line in file:
            tokens = line.strip().split()
            img_id, img_desc = tokens[0], tok


In [145]:
def dict_to_list(descriptions):
    all_desc = []
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc


In [146]:
print("Train Image IDs:", train_imgs[:5])


Train Image IDs: ['2513260012_03d33305cf', '2903617548_d3e38d7f88', '3338291921_fe7ae0c8f8', '488416045_1c6d903fe0', '2644326817_8f45080b87']


In [147]:
with open("Image-Captioning/descriptions.txt", "r") as file:
    for _ in range(5):
        print(file.readline())


1000268201_693b08cb0e.jpg	child in pink dress is climbing up set of stairs in an entry way

1000268201_693b08cb0e.jpg	girl going into wooden building

1000268201_693b08cb0e.jpg	little girl climbing into wooden playhouse

1000268201_693b08cb0e.jpg	little girl climbing the stairs to her playhouse

1000268201_693b08cb0e.jpg	little girl in pink dress going into wooden cabin



In [148]:
def load_clean_descriptions(filepath, dataset):
    descriptions = {}
    found = 0
    with open(filepath, 'r') as file:
        for line in file:
            tokens = line.strip().split()
            if len(tokens) < 2:
                continue
            image_id, image_desc = tokens[0], tokens[1:]
            image_id_clean = image_id.split('.')[0]
            if image_id_clean in dataset:
                found += 1
                if image_id_clean not in descriptions:
                    descriptions[image_id_clean] = []
                desc = ' '.join(image_desc)
                descriptions[image_id_clean].append(desc)
    print(f"Matched {found} descriptions from the dataset.")
    return descriptions


In [149]:
train_descriptions = load_clean_descriptions("Image-Captioning/descriptions.txt", train_imgs)
print("Sample description keys:", list(train_descriptions.keys())[:5])


Matched 30000 descriptions from the dataset.
Sample description keys: ['1000268201_693b08cb0e', '1001773457_577c3a7d70', '1002674143_1b742ab4b8', '1003163366_44323f5815', '1007129816_e794419615']


In [150]:
train_descriptions.keys()

dict_keys(['1000268201_693b08cb0e', '1001773457_577c3a7d70', '1002674143_1b742ab4b8', '1003163366_44323f5815', '1007129816_e794419615', '1007320043_627395c3d8', '1009434119_febe49276a', '1012212859_01547e3f17', '1015118661_980735411b', '1015584366_dfcec3c85a', '101654506_8eb26cfb60', '101669240_b2d3e7f17b', '1016887272_03199f49c4', '1019077836_6fc9b15408', '1019604187_d087bf9a5f', '1020651753_06077ec457', '1022454428_b6b660a67b', '1022975728_75515238d8', '102351840_323e3de834', '1024138940_f1fefbdce1', '102455176_5f8ead62d5', '1026685415_0431cbf574', '1028205764_7e8df9a2ea', '1030985833_b0902ea560', '103195344_5d2dc613a3', '103205630_682ca7285b', '1032460886_4a598ed535', '104136873_5b5d41be75', '1042020065_fb3d3ba5ba', '1042590306_95dea0916c', '1048710776_bb5b0a5c7c', '1052358063_eae6744153', '105342180_4d4a40b47f', '1053804096_ad278b25f1', '1055623002_8195a43714', '1055753357_4fa3d8d693', '1056359656_662cee0814', '1056873310_49c665eb22', '1057089366_ca83da0877', '1057210460_09c6f4c6c1

In [151]:
from tensorflow.keras.preprocessing.text import Tokenizer

def create_tokenizer(descriptions):
    lines = []
    for desc_list in descriptions.values():
        lines.extend(desc_list)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer


In [152]:
tokenizer = create_tokenizer(train_descriptions)
print("Sample word index:", list(tokenizer.word_index.items())[:10])
vocab_size = len(tokenizer.word_index) + 1
print("Vocab size:", vocab_size)


Sample word index: [('in', 1), ('the', 2), ('on', 3), ('is', 4), ('and', 5), ('dog', 6), ('with', 7), ('man', 8), ('of', 9), ('two', 10)]
Vocab size: 7577


In [153]:
def max_length(descriptions):
    desc_list = dict_to_list(descriptions)
    return max(len(d.split()) for d in desc_list)

max_length = max_length(train_descriptions)
print(max_length)


32


In [154]:
train_descriptions

{'1000268201_693b08cb0e': ['child in pink dress is climbing up set of stairs in an entry way',
  'girl going into wooden building',
  'little girl climbing into wooden playhouse',
  'little girl climbing the stairs to her playhouse',
  'little girl in pink dress going into wooden cabin'],
 '1001773457_577c3a7d70': ['black dog and spotted dog are fighting',
  'black dog and tricolored dog playing with each other on the road',
  'black dog and white dog with brown spots are staring at each other in the street',
  'two dogs of different breeds looking at each other on the road',
  'two dogs on pavement moving toward each other'],
 '1002674143_1b742ab4b8': ['little girl covered in paint sits in front of painted rainbow with her hands in bowl',
  'little girl is sitting in front of large painted rainbow',
  'small girl in the grass plays with fingerpaints in front of white canvas with rainbow on it',
  'there is girl with pigtails sitting in front of rainbow painting',
  'young girl with pi

In [1]:
def data_generator(descriptions, features, tokenizer, max_length):
    while True:
        for key, description_list in descriptions.items():
            feature = features[key][0]
            input_image, input_sequence, output_word = create_sequences(
                tokenizer, max_length, description_list, feature
            )
            for i in range(len(input_image)):
                yield ({'input_1': input_image[i], 'input_2': input_sequence[i]}, output_word[i])


In [156]:
all_features = load(open("Data/features.p", "rb"))


In [157]:
import os
print("File exists?", os.path.exists("Data/features.p"))


File exists? True


In [158]:
from pickle import load

def load_features(photos):
    all_features = load(open("Data/features.p", "rb"))
    print(all_features)
    features = {k: all_features[k] for k in photos if k in all_features}
    return features


In [159]:
# Load features only for the training images
train_features = load_features(train_imgs)

# Define output signature for tf.data.Dataset
output_signature = (
    {
        'input_1': tf.TensorSpec(shape=(2048,), dtype=tf.float32),
        'input_2': tf.TensorSpec(shape=(max_length,), dtype=tf.int32)
    },
    tf.TensorSpec(shape=(vocab_size,), dtype=tf.float32)
)

# Create the generator function
gen = data_generator(train_descriptions, train_features, tokenizer, max_length)

# Build tf.data.Dataset
dataset = tf.data.Dataset.from_generator(
    gen,
    output_signature=output_signature
)

# Batch the dataset
dataset = dataset.batch(32)


{'2387197355_237f6f41ee.jpg': array([0.39725822, 0.3979755 , 0.4062515 , ..., 0.03228263, 0.24208975,
       0.25930053], dtype=float32), '2609847254_0ec40c1cce.jpg': array([0.72011304, 0.89264107, 0.7368498 , ..., 0.8861274 , 0.6436659 ,
       0.16261266], dtype=float32), '2046222127_a6f300e202.jpg': array([0.15686972, 0.18843442, 0.46955705, ..., 1.8327725 , 0.31223977,
       0.37589496], dtype=float32), '2853743795_e90ebc669d.jpg': array([0.05032302, 0.48652196, 0.11849521, ..., 0.33112624, 0.37729025,
       0.07065029], dtype=float32), '2696951725_e0ae54f6da.jpg': array([0.487068  , 0.3599441 , 0.65353626, ..., 0.2739367 , 0.594561  ,
       0.6213752 ], dtype=float32), '3421131122_2e4bde661e.jpg': array([0.03816597, 0.32926828, 0.34832576, ..., 0.21024328, 0.23570502,
       0.62096405], dtype=float32), '3229730008_63f8ca2de2.jpg': array([0.4003595 , 0.45116502, 0.17302483, ..., 0.71083367, 1.0641621 ,
       0.46303248], dtype=float32), '3220009216_10f088185e.jpg': array([0.39

In [160]:
def create_sequences(tokenizer, max_length, desc_list, feature):
    X1, X2, y = list(), list(), list()
    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # 🔥 No to_categorical here — use sparse integer directly
            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)


In [161]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')


In [162]:
len(train_descriptions)

6000

In [163]:
train_features

{}

In [164]:
# Step 0: Ensure matching image IDs
train_descriptions = {k: v for k, v in train_descriptions.items() if k in train_features}

# Step 1: Create the generator function
gen = data_generator(train_descriptions, train_features, tokenizer, max_length)

# Step 2: Define output signature
output_signature = (
    {
        'input_1': tf.TensorSpec(shape=(2048,), dtype=tf.float32),
        'input_2': tf.TensorSpec(shape=(max_length,), dtype=tf.int32)
    },
    tf.TensorSpec(shape=(vocab_size,), dtype=tf.float32)
)

# Step 3: Convert to tf.data.Dataset
dataset = tf.data.Dataset.from_generator(
    gen,
    output_signature=output_signature
)

# Step 4: Batch
dataset = dataset.batch(32)


In [165]:
output_signature = (
    {
        'input_1': tf.TensorSpec(shape=(2048,), dtype=tf.float32),
        'input_2': tf.TensorSpec(shape=(max_length,), dtype=tf.int32)
    },
    tf.TensorSpec(shape=(), dtype=tf.int32)  # sparse label: just one integer
)


dataset = tf.data.Dataset.from_generator(
    gen,
    output_signature=output_signature
)


dataset = dataset.batch(32)

In [166]:
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, LSTM, add
from tensorflow.keras.models import Model

def define_model(vocab_size, max_length):
    # Image feature extractor model (2048-D feature vector)
    inputs1 = Input(shape=(2048,), name='input_1')
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    # Text sequence model
    inputs2 = Input(shape=(max_length,), name='input_2', dtype='int32')  # ✅ Fixed dtype
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    # Decoder model
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    # Full model
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')  # ✅ for integer labels
    return model




In [167]:
model = define_model(vocab_size, max_length)

# Save architecture
model_json = model.to_json()
with open("model.json", "w") as f:
    f.write(model_json)


In [168]:
 print(model.summary())



None


In [169]:
import netron

netron.start("model.json")

Serving 'model.json' at http://localhost:8081


('localhost', 8081)

In [170]:
import os
print(os.listdir())


['test_graph.png', 'model.json', 'main.ipynb', '.DS_Store', 'uv.lock', 'pyproject.toml', 'models', 'test.py', 'models2', 'README.md', '.venv', '.python-version', 'Image-Captioning', 'descriptions.txt', 'Data', '.idea']


In [171]:
def load_doc(filename):
    with open(filename, 'r') as file:
        text = file.read()
    return text

def load_descriptions(doc):
    mapping = {}
    for line in doc.strip().split('\n'):
        tokens = line.split()
        if len(tokens) < 2:
            continue
        image_id, image_desc = tokens[0], tokens[1:]
        image_id = image_id.split('.')[0]  # remove '.jpg'
        desc = ' '.join(image_desc)
        if image_id not in mapping:
            mapping[image_id] = []
        mapping[image_id].append(desc)
    return mapping


In [172]:
doc = load_doc("Data/Flickr8k_text 2/Flickr8k.token.txt")
descriptions = load_descriptions(doc)
print("Descriptions loaded:", len(descriptions))  # should be ~8000
  # should be ~8000


Descriptions loaded: 8092


In [173]:
def clean_descriptions(descriptions):
    import string
    table = str.maketrans('', '', string.punctuation)
    for key, desc_list in descriptions.items():
        for i in range(len(desc_list)):
            desc = desc_list[i]
            desc = desc.lower()
            desc = desc.translate(table)
            desc = ' '.join([word for word in desc.split() if len(word) > 1 and word.isalpha()])
            desc_list[i] = desc

clean_descriptions(descriptions)


In [174]:
def save_descriptions(descriptions, filename):
    lines = []
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(f"{key}\t{desc}")
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))

save_descriptions(descriptions, 'descriptions.txt')


In [175]:
train_features = features
train_descriptions = descriptions


In [176]:
print('Dataset: ', len(train_imgs))
print('Descriptions: train=', len(train_descriptions))
print('Photos: train=', len(train_features))
print('Vocabulary Size:', vocab_size)
print('Description Length: ', max_length)



Dataset:  6000
Descriptions: train= 8092
Photos: train= 8091
Vocabulary Size: 7577
Description Length:  32


In [177]:
# Define the model
model = define_model(vocab_size, max_length)
epochs = 10
batch_size = 32

# Dynamically compute steps per epoch based on training descriptions
def get_steps_per_epoch(descriptions, batch_size=32):
    total_sequences = sum(
        len(caption.split()) - 1
        for captions in descriptions.values()
        for caption in captions
    )
    # Ensure minimum one step to avoid division errors
    return max(1, total_sequences // batch_size)

# Compute steps
steps = get_steps_per_epoch(train_descriptions, batch_size)


In [None]:
try:
    model.fit(dataset, epochs=4, steps_per_epoch=steps, verbose=1)
except ValueError as e:
    print("ValueError:", e)


Epoch 1/4


In [68]:
import os

os.makedirs("models", exist_ok=True)

for i in range(epochs):
    dataset = data_generator(train_descriptions, train_features, tokenizer, max_length)
    model.fit(dataset, epochs=4, steps_per_epoch=steps, verbose=1)
    model.save("models/model_" + str(i) + ".h5")


ValueError: Unrecognized data type: x=<function data_generator.<locals>.generator at 0x3150140d0> (of type <class 'function'>)