In [None]:
!pip install tensorflow



In [1]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import Sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D,TimeDistributed, Activation, Dropout, Flatten, Dense, Input, Layer
from tensorflow.keras.layers import Embedding, LSTM, add, Concatenate, Reshape, concatenate, Bidirectional
from tensorflow.keras.applications import VGG16, ResNet50, DenseNet201, EfficientNetB7
from tensorflow.keras.applications.efficientnet import preprocess_input as ef_preprocess_input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import plot_model
import kagglehub
import matplotlib.pyplot as plt
import seaborn as sns
from textwrap import wrap
import re
import json
from google.colab import files
from glob import glob
from PIL import Image
from tensorflow.keras.utils import load_img, img_to_array
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input as mo_preprocess_input
import random

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"gmrmsy","key":"0da5e2fc59385c60fce8551bcc4eac0f"}'}

In [8]:
# kaggle.json을 ~/.kaggle/kaggle.json 위치로 이동
os.makedirs("/root/.kaggle", exist_ok=True)
!mv kaggle.json /root/.kaggle/kaggle.json

# 권한 설정
os.chmod("/root/.kaggle/kaggle.json", 600)

In [9]:
# 'adityajn105/flickr8k' 데이터셋 다운로드
dataset_path = kagglehub.dataset_download("adityajn105/flickr8k")

In [10]:
import os

# 실제 경로 출력
print(dataset_path)

# 파일 목록 보기
os.listdir(dataset_path)

/kaggle/input/flickr8k


['captions.txt', 'Images']

In [38]:
# 파일 경로
caption_path = '/kaggle/input/flickr8k/captions.txt'

# (image_filename, caption) 쌍 리스트 만들기
data_pairs = []

with open(caption_path, 'r') as f:
    for line in f.readlines():
        if 'jpg,' not in line:
            continue
        parts = line.strip().split(',', maxsplit=1)
        image_file = parts[0]
        caption = parts[1]
        data_pairs.append((image_file, caption))

In [39]:
data_pairs[:5]

[('1000268201_693b08cb0e.jpg',
  'A child in a pink dress is climbing up a set of stairs in an entry way .'),
 ('1000268201_693b08cb0e.jpg', 'A girl going into a wooden building .'),
 ('1000268201_693b08cb0e.jpg',
  'A little girl climbing into a wooden playhouse .'),
 ('1000268201_693b08cb0e.jpg',
  'A little girl climbing the stairs to her playhouse .'),
 ('1000268201_693b08cb0e.jpg',
  'A little girl in a pink dress going into a wooden cabin .')]

In [40]:
captions = [c for _, c in data_pairs]

tokenizer = Tokenizer(filters='.', lower=True, oov_token='<unk>')
tokenizer.fit_on_texts(["<start> " + c + " <end>" for c in captions])

vocab_size = len(tokenizer.word_index) + 1
max_len = max(len(c.split()) for c in captions) + 2  # <start>, <end> 포함

In [41]:
max_len, vocab_size

(40, 9182)

In [42]:
from tensorflow.keras.utils import Sequence

class ImageCaptionGenerator(Sequence):
    def __init__(self, data_pairs, tokenizer, image_dir,
                 batch_size=32, input_size=(224, 224), max_len=30, **kwargs):
        super().__init__(**kwargs)
        self.data_pairs = data_pairs
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.input_size = input_size
        self.max_len = max_len
        self.image_dir = image_dir

    def __len__(self):
        return len(self.data_pairs) // self.batch_size

    def __getitem__(self, idx):
        batch_pairs = random.sample(self.data_pairs, self.batch_size)

        image_batch = []
        decoder_input_batch = []
        decoder_target_batch = []

        for img_file, caption in batch_pairs:
            # 이미지 처리
            img_path = os.path.join(self.image_dir, img_file)
            img = load_img(img_path, target_size=self.input_size)
            img_array = img_to_array(img)
            img_array = mo_preprocess_input(img_array)
            image_batch.append(img_array)

            # 캡션 처리
            seq = self.tokenizer.texts_to_sequences(["<start> " + caption + " <end>"])[0]
            dec_input = seq[:-1]
            dec_target = seq[1:]

            # 패딩
            dec_input = pad_sequences([dec_input], maxlen=self.max_len, padding='post')[0]
            dec_target = pad_sequences([dec_target], maxlen=self.max_len, padding='post')[0]
            dec_target = np.concatenate([dec_target, [0]],axis=0)

            decoder_input_batch.append(dec_input)
            decoder_target_batch.append(dec_target)

        return (np.array(image_batch), np.array(decoder_input_batch)), np.array(decoder_target_batch)

In [43]:
image_dir = '/kaggle/input/flickr8k/Images/'

asdf = ImageCaptionGenerator(data_pairs,
                             tokenizer,
                             image_dir,
                             batch_size=32,
                             input_size=(224, 224),
                             max_len=max_len)

In [44]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(data_pairs, test_size=0.1)

In [45]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(data_pairs, test_size=0.1)

In [46]:
len(train_data), len(test_data)

(36409, 4046)

In [47]:
train_data, val_data = train_test_split(train_data, test_size=0.1)

In [48]:
len(train_data), len(val_data)

(32768, 3641)

In [49]:
import tensorflow as tf

mobile_net = tf.keras.applications.MobileNetV2(weights='imagenet')
mobile_net.summary()

In [50]:
mobile_input = mobile_net.layers[0].output
mobile_output = mobile_net.layers[154].output

mobile = Model(inputs=mobile_input, outputs=mobile_output)

mobile.trainable = False

en_train_dense_1 = Dense(514, activation='relu')
en_train_dense_1_out = en_train_dense_1(mobile_output)

en_train_h = Dense(128,activation='tanh')
en_train_h_out = en_train_h(en_train_dense_1_out)

en_train_c = Dense(128,activation='tanh')
en_train_c_out = en_train_c(en_train_dense_1_out)

en_train_dense_2 = Dense(64, activation='relu')
en_train_dense_2_out = en_train_dense_2(en_train_dense_1_out)

en_state = [en_train_h_out, en_train_c_out]

en_reshape = Reshape((1,64))
en_reshape_out = en_reshape(en_train_dense_2_out)

en_model = Model(inputs=mobile_input, outputs=en_reshape_out)

en_model.summary()

In [51]:
de_input = Input(shape=(max_len,))

de_embed = Embedding(input_dim=vocab_size, output_dim=64)
de_embed_out = de_embed(de_input)

de_concat = concatenate([en_reshape_out,de_embed_out], axis=1)

de_lstm = LSTM(units=128, return_sequences=True, return_state=True)
de_lstm_out, de_lstm_h, de_lstm_c = de_lstm(de_concat, initial_state=en_state)

de_dense = Dense(vocab_size)
de_outputs = de_dense(de_lstm_out)

decoder = Model(inputs=[en_reshape_out,en_state,de_input], outputs=de_outputs)
decoder.summary()

In [52]:
img_test_model = Model(inputs=[en_model.input,de_input], outputs=de_outputs)
img_test_model.output

<KerasTensor shape=(None, 41, 9182), dtype=float32, sparse=False, name=keras_tensor_366>

In [53]:
# embedding 레이어에서 mask_zero=True를 넣으면 각 값에 mask정보가 내부적으로 생성됨
# 이럴 경우 Embedding 처리 후 mask정보가 있는 데이터는 concatenate를 진행할 수 없음.
# 때문에 mask_zero 없이 Embedding을 처리 후 loss값 계산에서 pad값은 제외하도록 loss 함수를 재정의함.

PAD_TOKEN = 0

def masked_sparse_categorical_crossentropy(y_true, y_pred):
    """
    y_true: (batch_size, seq_len) 정수 인코딩된 실제 값
    y_pred: (batch_size, seq_len, vocab_size) softmax로 예측된 확률값
    """
    # 마스크 생성 (padding이 아닌 위치는 1.0, padding은 0.0)
    mask = tf.cast(tf.not_equal(y_true, PAD_TOKEN), tf.float32)

    # sparse categorical crossentropy 계산
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)

    # 마스크 적용 (패딩 위치의 loss는 0으로 만듦)
    loss *= mask

    # 평균 loss 반환
    return tf.reduce_sum(loss) / tf.reduce_sum(mask)

In [54]:
# accuracy도 마찬가지!
# pad를 제외하고 평가해야한다!

PAD_TOKEN = 0

def masked_accuracy(y_true, y_pred):
    y_true = tf.cast(y_true, tf.int64)
    y_pred_class = tf.argmax(y_pred, axis=-1, output_type=tf.int64)
    mask = tf.cast(tf.not_equal(y_true, PAD_TOKEN), tf.float32)
    matches = tf.cast(tf.equal(y_true, y_pred_class), tf.float32)
    matches *= mask
    return tf.reduce_sum(matches) / tf.reduce_sum(mask)

In [55]:
img_test_model.compile(
    loss=masked_sparse_categorical_crossentropy,
    optimizer='adam',
    metrics=[masked_accuracy])

In [56]:
img_test_model.summary()

In [57]:
train_data_gen = ImageCaptionGenerator(data_pairs=train_data,
                                       tokenizer=tokenizer,
                                       image_dir=image_dir,
                                       batch_size=32,
                                       max_len=max_len)
val_data_gen = ImageCaptionGenerator(data_pairs=val_data,
                                       tokenizer=tokenizer,
                                       image_dir=image_dir,
                                       batch_size=32,
                                       max_len=max_len)
test_data_gen = ImageCaptionGenerator(data_pairs=test_data,
                                       tokenizer=tokenizer,
                                       image_dir=image_dir,
                                       batch_size=32,
                                       max_len=max_len)

In [58]:
hist = img_test_model.fit(train_data_gen,
                          callbacks=[EarlyStopping(monitor='val_loss',patience=5),
                                     ModelCheckpoint(filepath='no_att_no_hc.keras', monitor='val_loss', save_best_only=True, verbose=1)],
                          epochs=200,
                          validation_data=val_data_gen)

Epoch 1/200
[1m  26/1024[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m32:17[0m 2s/step - loss: 8.5853 - masked_accuracy: 0.1068

KeyboardInterrupt: 