In [42]:
from utils.utils import load_captions_txt

captions_dict = load_captions_txt("../data/Flickr8k_text/captions.txt")

# Poi estrai la lista flat delle caption per il tokenizer
all_captions = []
for caps in captions_dict.values():
    all_captions.extend(caps)

from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(oov_token="<unk>")
tokenizer.fit_on_texts(all_captions)

vocab_size = len(tokenizer.word_index) + 1

def max_caption_length(captions):
    return max(len(c.split()) for c in captions)

max_length = max_caption_length(all_captions)

print(f"Vocabulary size: {vocab_size}")
print(f"Max caption length: {max_length}")


Vocabulary size: 8497
Max caption length: 40


In [43]:
import pickle
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.optimizers import Adam

# Carica feature estratte (encoder output)
with open("features/image_features.pkl", "rb") as f:
    image_features = pickle.load(f)

# Input feature immagine
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# Input sequenze testo (caption)
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

# Combina
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# Modello finale
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

model.summary()


In [44]:
import tensorflow as tf
from utils.utils import data_generator

batch_size = 64

output_signature = (
    (
        tf.TensorSpec(shape=(None, 2048), dtype=tf.float32),    # Feature immagini
        tf.TensorSpec(shape=(None, max_length), dtype=tf.int32) # Sequenze input
    ),
    tf.TensorSpec(shape=(None, vocab_size), dtype=tf.float32)   # Target one-hot
)

import functools

# Usa functools.partial per passare gli argomenti in modo sicuro
# Questo crea un nuovo oggetto funzione che ha già tutti gli argomenti "congelati" al suo interno
partial_generator = functools.partial(
    data_generator,
    captions_dict, 
    image_features, 
    tokenizer, 
    max_length, 
    vocab_size, 
    batch_size
)

# Ora passa questo nuovo oggetto a from_generator
dataset = tf.data.Dataset.from_generator(
    partial_generator, # Nota: non lo chiami, passi l'oggetto funzione
    output_signature=output_signature
)

steps = len(captions_dict) // batch_size

for batch in dataset.take(1):
    (X1, X2), y = batch
    print("\n--- BATCH RICEVUTO CORRETTAMENTE DA TENSORFLOW ---")
    print("X1 shape:", X1.shape)
    print("X2 shape:", X2.shape)
    print("y shape:", y.shape)


InvalidArgumentError: {{function_node __wrapped__IteratorGetNext_output_types_3_device_/job:localhost/replica:0/task:0/device:CPU:0}} TypeError: `generator` yielded an element that did not match the expected structure. The expected structure was ((tf.float32, tf.int32), tf.float32), but the yielded element was ([array([[0.24745122, 0.20485741, 0.18421546, ..., 0.07847853, 0.54868805,
        1.3474345 ],
       [0.24745122, 0.20485741, 0.18421546, ..., 0.07847853, 0.54868805,
        1.3474345 ],
       [0.24745122, 0.20485741, 0.18421546, ..., 0.07847853, 0.54868805,
        1.3474345 ],
       ...,
       [0.2260996 , 0.17773649, 0.22333072, ..., 0.31119516, 0.18079863,
        0.50760293],
       [0.2260996 , 0.17773649, 0.22333072, ..., 0.31119516, 0.18079863,
        0.50760293],
       [0.2260996 , 0.17773649, 0.22333072, ..., 0.31119516, 0.18079863,
        0.50760293]], dtype=float32), array([[   0,    0,    0, ...,    0,    0,    3],
       [   0,    0,    0, ...,    0,    3,    2],
       [   0,    0,    0, ...,    3,    2,   56],
       ...,
       [   0,    0,    0, ...,    3,    2,   10],
       [   0,    0,    0, ...,    2,   10, 4549],
       [   0,    0,    0, ...,   10, 4549,   23]], dtype=int32)], array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])).
Traceback (most recent call last):

  File "c:\Users\chiar\OneDrive\Desktop\python\.venv\Lib\site-packages\tensorflow\python\data\ops\from_generator_op.py", line 204, in generator_py_func
    flattened_values = nest.flatten_up_to(output_types, values)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "c:\Users\chiar\OneDrive\Desktop\python\.venv\Lib\site-packages\tensorflow\python\data\util\nest.py", line 237, in flatten_up_to
    return nest_util.flatten_up_to(
           ^^^^^^^^^^^^^^^^^^^^^^^^

  File "c:\Users\chiar\OneDrive\Desktop\python\.venv\Lib\site-packages\tensorflow\python\util\nest_util.py", line 1541, in flatten_up_to
    return _tf_data_flatten_up_to(shallow_tree, input_tree)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "c:\Users\chiar\OneDrive\Desktop\python\.venv\Lib\site-packages\tensorflow\python\util\nest_util.py", line 1570, in _tf_data_flatten_up_to
    _tf_data_assert_shallow_structure(shallow_tree, input_tree)

  File "c:\Users\chiar\OneDrive\Desktop\python\.venv\Lib\site-packages\tensorflow\python\util\nest_util.py", line 1444, in _tf_data_assert_shallow_structure
    _tf_data_assert_shallow_structure(

  File "c:\Users\chiar\OneDrive\Desktop\python\.venv\Lib\site-packages\tensorflow\python\util\nest_util.py", line 1414, in _tf_data_assert_shallow_structure
    raise TypeError(

TypeError: If shallow structure is a sequence, input must also be a sequence. Input has type: 'list'.


The above exception was the direct cause of the following exception:


Traceback (most recent call last):

  File "c:\Users\chiar\OneDrive\Desktop\python\.venv\Lib\site-packages\tensorflow\python\ops\script_ops.py", line 269, in __call__
    ret = func(*args)
          ^^^^^^^^^^^

  File "c:\Users\chiar\OneDrive\Desktop\python\.venv\Lib\site-packages\tensorflow\python\autograph\impl\api.py", line 643, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^

  File "c:\Users\chiar\OneDrive\Desktop\python\.venv\Lib\site-packages\tensorflow\python\data\ops\from_generator_op.py", line 206, in generator_py_func
    raise TypeError(

TypeError: `generator` yielded an element that did not match the expected structure. The expected structure was ((tf.float32, tf.int32), tf.float32), but the yielded element was ([array([[0.24745122, 0.20485741, 0.18421546, ..., 0.07847853, 0.54868805,
        1.3474345 ],
       [0.24745122, 0.20485741, 0.18421546, ..., 0.07847853, 0.54868805,
        1.3474345 ],
       [0.24745122, 0.20485741, 0.18421546, ..., 0.07847853, 0.54868805,
        1.3474345 ],
       ...,
       [0.2260996 , 0.17773649, 0.22333072, ..., 0.31119516, 0.18079863,
        0.50760293],
       [0.2260996 , 0.17773649, 0.22333072, ..., 0.31119516, 0.18079863,
        0.50760293],
       [0.2260996 , 0.17773649, 0.22333072, ..., 0.31119516, 0.18079863,
        0.50760293]], dtype=float32), array([[   0,    0,    0, ...,    0,    0,    3],
       [   0,    0,    0, ...,    0,    3,    2],
       [   0,    0,    0, ...,    3,    2,   56],
       ...,
       [   0,    0,    0, ...,    3,    2,   10],
       [   0,    0,    0, ...,    2,   10, 4549],
       [   0,    0,    0, ...,   10, 4549,   23]], dtype=int32)], array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])).


	 [[{{node PyFunc}}]] [Op:IteratorGetNext] name: 

In [None]:
import numpy as np

print("--- Inizio scansione di integrità dei dati ---")

# 1. Trova le chiavi (img_id) che sono in un dizionario ma non nell'altro
captions_keys = set(captions_dict.keys())
features_keys = set(image_features.keys())

if captions_keys == features_keys:
    print("OK: Le chiavi in 'captions_dict' e 'image_features' corrispondono perfettamente.")
else:
    print("!!! ATTENZIONE: Discrepanza nelle chiavi!")
    # Chiavi nelle caption ma non nelle feature
    captions_missing_features = captions_keys - features_keys
    if captions_missing_features:
        print(f"  Trovate {len(captions_missing_features)} chiavi in 'captions_dict' che mancano in 'image_features'. Esempio: {list(captions_missing_features)[:5]}")

    # Chiavi nelle feature ma non nelle caption
    features_missing_captions = features_keys - captions_keys
    if features_missing_captions:
        print(f"  Trovate {len(features_missing_captions)} chiavi in 'image_features' che mancano in 'captions_dict'. Esempio: {list(features_missing_captions)[:5]}")

# 2. Verifica il contenuto di 'image_features'
problemi_features = []
for img_id, feature in image_features.items():
    # Controlla se il valore non è un array NumPy
    if not isinstance(feature, np.ndarray):
        problemi_features.append(f"ID: {img_id} - Il valore non è un array NumPy, ma di tipo {type(feature)}")
        continue
        
    # Controlla se la forma è diversa da (1, 2048) o (2048,)
    squeezed_shape = np.squeeze(feature).shape
    if squeezed_shape != (2048,):
        problemi_features.append(f"ID: {img_id} - Forma anomala. Originale: {feature.shape}, Squeezed: {squeezed_shape}")

if not problemi_features:
    print("OK: Tutte le feature in 'image_features' sembrano avere forma e tipo corretti.")
else:
    print(f"\n!!! TROVATI {len(problemi_features)} PROBLEMI in 'image_features':")
    for p in problemi_features[:10]: # Stampa i primi 10 problemi trovati
        print(f"  - {p}")

# 3. Verifica il contenuto di 'captions_dict'
problemi_captions = []
for img_id, caps in captions_dict.items():
    if not isinstance(caps, list) or not all(isinstance(c, str) for c in caps):
         problemi_captions.append(f"ID: {img_id} - Il valore non è una lista di stringhe.")

if not problemi_captions:
    print("OK: Tutte le caption in 'captions_dict' sembrano essere nel formato corretto (lista di stringhe).")
else:
    print(f"\n!!! TROVATI {len(problemi_captions)} PROBLEMI in 'captions_dict':")
    for p in problemi_captions[:10]:
         print(f"  - {p}")

print("\n--- Scansione completata ---")

--- Controllo delle variabili di forma ---
Valore di test_vocab_size: 8497, Tipo: <class 'int'>
Valore di test_max_length: 40, Tipo: <class 'int'>
Valore di test_batch_size: 64, Tipo: <class 'int'>
-----------------------------------------


--- Tentativo di iterare sul DUMMY dataset ---
>>> Dummy Generator: Inizio a produrre un batch...
>>> Dummy Generator: Sto per fare 'yield' di un batch con forme:
    X1: (64, 2048), float32
    X2: (64, 40), int32
    y:  (64, 8497), float32

--- SUCCESSO! Batch fittizio ricevuto correttamente! ---
X1 shape: (64, 2048)
X2 shape: (64, 40)
y shape: (64, 8497)
-----------------------------------------------------


In [None]:
model.fit(
    dataset,
    epochs=20,
    steps_per_epoch=steps,
    verbose=1
)

# Salva modello
model.save("model_caption.h5")

# Salva tokenizer in json
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w') as f:
    f.write(tokenizer_json)
