In [1]:
import os
import string
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from pickle import dump, load
from tensorflow.keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, Add
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import tensorflow as tf

# ------------------------- Text Preprocessing -------------------------

def load_captions_from_csv(csv_path):
    df = pd.read_csv(csv_path)
    descriptions = {}
    for _, row in df.iterrows():
        img_id = str(row["image_id"]).split('.')[0]
        cap = row["caption"]
        descriptions.setdefault(img_id, []).append(cap)
    return descriptions

def clean_captions(captions):
    table = str.maketrans('', '', string.punctuation)
    for img_id, caps in captions.items():
        for i in range(len(caps)):
            text = caps[i].replace("-", " ").lower()
            words = text.split()
            words = [w.translate(table) for w in words if w.isalpha() and len(w) > 1]
            caps[i] = 'startseq ' + ' '.join(words) + ' endseq'
    return captions

def save_descriptions(captions, filename="descriptions.txt"):
    lines = []
    for img_id, desc_list in captions.items():
        for desc in desc_list:
            lines.append(f"{img_id}\t{desc}")
    with open(filename, "w") as f:
        f.write("\n".join(lines))

# ------------------------- Feature Extraction -------------------------

def extract_features(directory, image_names=None):
    model = Xception(include_top=False, pooling='avg')
    features = {}
    image_list = os.listdir(directory)
    if image_names:
        image_list = [img for img in image_list if img.split('.')[0] in image_names]
    for img_name in tqdm(image_list, desc="Extracting features"):
        img_path = os.path.join(directory, img_name)
        image = load_img(img_path, target_size=(299, 299))
        image = img_to_array(image)
        image = np.expand_dims(image, axis=0)
        image = preprocess_input(image)
        feature = model.predict(image, verbose=0)
        features[img_name.split('.')[0]] = feature
    return features

# ------------------------- Tokenizer and Sequences -------------------------

def create_tokenizer(descriptions):
    all_desc = [d for descs in descriptions.values() for d in descs]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_desc)
    return tokenizer

def max_caption_length(descriptions):
    all_desc = [d for descs in descriptions.values() for d in descs]
    return max(len(d.split()) for d in all_desc)

def create_sequences(tokenizer, max_len, desc_list, feature):
    x_img, x_seq, y = [], [], []
    vocab_size = len(tokenizer.word_index) + 1
    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_len, padding='post')[0]  
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            x_img.append(feature)
            x_seq.append(in_seq)
            y.append(out_seq)
    return np.array(x_img), np.array(x_seq), np.array(y)

def data_generator(descriptions, features, tokenizer, max_len):
    vocab_size = len(tokenizer.word_index) + 1
    def gen():
        for img_id, desc_list in descriptions.items():
            feature = features[img_id][0]
            x_img, x_seq, y = create_sequences(tokenizer, max_len, desc_list, feature)
            for i in range(len(x_img)):
                yield (x_img[i], x_seq[i]), y[i]
    output_signature = (
        (tf.TensorSpec([2048], tf.float32), tf.TensorSpec([max_len], tf.int32)),
        tf.TensorSpec([vocab_size], tf.float32)
    )
    return tf.data.Dataset.from_generator(gen, output_signature=output_signature)

# ------------------------- Model Definition -------------------------

def define_model(vocab_size, max_len):
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    inputs2 = Input(shape=(max_len,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256, unroll=True)(se2)

    decoder1 = Add()([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

# ------------------------- Training -------------------------

def train_model(descriptions, features, tokenizer, max_len, epochs=10):
    os.makedirs("models", exist_ok=True)
    vocab_size = len(tokenizer.word_index) + 1
    model = define_model(vocab_size, max_len)
    
    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath="models/cp.keras",
        save_best_only=True,
        monitor='loss',
        mode='min',
        save_weights_only=False
    )
    
    dataset = data_generator(descriptions, features, tokenizer, max_len).batch(32).repeat()
    steps = sum(len(v) for v in descriptions.values()) // 32
    
    model.fit(dataset, epochs=epochs, steps_per_epoch=steps, callbacks=[cp_callback])
    model.save("models/model_final.keras") 
    return model

# ------------------------- Caption Generation -------------------------

def generate_caption(model, tokenizer, photo, max_len):
    in_text = 'startseq'
    for _ in range(max_len):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_len)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = next((w for w, i in tokenizer.word_index.items() if i == yhat), None)
        if not word:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text.replace('startseq', '').replace('endseq', '').strip()

# ------------------------- Submission Creation -------------------------

def create_submission(test_img_dir, model_path, tokenizer, max_len, test_csv_path, output_csv='submission.csv'):
    model = tf.keras.models.load_model(model_path)
    test_df = pd.read_csv(test_csv_path, dtype={'image_id': str})
    image_ids = [img_id.replace('.jpg', '') for img_id in test_df['image_id']]
    
    features = extract_features(test_img_dir, image_names=image_ids)
    
    captions = []
    for img_id in tqdm(image_ids, desc="Generating captions"):
        feature = features.get(img_id, np.zeros((1, 2048)))
        caption = generate_caption(model, tokenizer, feature, max_len)
        captions.append(caption)
    
    pd.DataFrame({'image_id': test_df['image_id'], 'caption': captions}).to_csv(output_csv, index=False)
    print(f"✅ Submission saved to {output_csv}")

# ------------------------- Main (Kaggle notebook uyumlu) -------------------------
if __name__ == "__main__":
    base_dir = "/kaggle/input/obss-intern-competition-2025"
    train_csv = os.path.join(base_dir, "train.csv")
    test_csv = os.path.join(base_dir, "test.csv")
    train_dir = os.path.join(base_dir, "train/train")
    test_dir = os.path.join(base_dir, "test/test")

    all_desc = load_captions_from_csv(train_csv)
    cleaned_desc = clean_captions(all_desc)
    save_descriptions(cleaned_desc)

    features = extract_features(train_dir, image_names=cleaned_desc.keys())
    dump(features, open("features.p", "wb"))

    tokenizer = create_tokenizer(cleaned_desc)
    max_len = max_caption_length(cleaned_desc)

    features = load(open("features.p", "rb"))
    train_features = {k: features[k] for k in cleaned_desc}

    model = train_model(cleaned_desc, train_features, tokenizer, max_len)

    create_submission(
        test_img_dir=test_dir,
        model_path="models/cp.keras",
        tokenizer=tokenizer,
        max_len=max_len,
        test_csv_path=test_csv
    )


2025-05-20 18:32:54.321528: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747765974.577895      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747765974.648934      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
I0000 00:00:1747765991.731594      20 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1747765991.732424      20 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability:

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/xception/xception_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m83683744/83683744[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


I0000 00:00:1747765997.191599      62 service.cc:148] XLA service 0x7a3ef0003460 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1747765997.192945      62 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1747765997.192970      62 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1747765997.687721      62 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1747766000.683111      62 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
Extracting features: 100%|██████████| 21367/21367 [37:21<00:00,  9.53it/s]


Epoch 1/10
[1m667/667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 18ms/step - loss: 6.8640
Epoch 2/10
[1m667/667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 18ms/step - loss: 5.7969
Epoch 3/10
[1m667/667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 17ms/step - loss: 5.5133
Epoch 4/10
[1m667/667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 17ms/step - loss: 5.3518
Epoch 5/10
[1m667/667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 18ms/step - loss: 5.2221
Epoch 6/10
[1m667/667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 17ms/step - loss: 5.1298
Epoch 7/10
[1m667/667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 17ms/step - loss: 5.0887
Epoch 8/10
[1m667/667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - loss: 5.0097
Epoch 9/10
[1m667/667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 17ms/step - loss: 4.9885
Epoch 10/10
[1m667/667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11

Extracting features: 100%|██████████| 3771/3771 [06:26<00:00,  9.75it/s]
Generating captions: 100%|██████████| 3771/3771 [1:11:17<00:00,  1.13s/it]

✅ Submission saved to submission.csv



