### mac gpu 사용가능확인 (1이면 사용가능)

In [None]:
# Mount Google Drive only when running inside Colab. On a local machine the
# google.colab package is not installed, and an unguarded import would stop a
# Restart & Run All at this very first cell.
try:
    from google.colab import drive
except ImportError:
    # Not on Colab: keep the name defined so later checks can test for it.
    drive = None
else:
    drive.mount('/content/drive')

In [1]:
import tensorflow as tf

In [2]:
# List the physical GPU devices TensorFlow can see and report how many there are.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


# import

In [3]:
# tensorflow.compat.v2 를 사용
import tensorflow.compat.v2 as tf
import pandas as pd
from matplotlib import pyplot as plt
import re
import cv2

# dataset
> 몇십만개의 행의 csv 파일들(몇십GB용량)을 kaggle에서는 주었는데
> git 업로드 문제로 10000개행만 따로 추출한 csv파일을 사용
  
> ram문제로 일단 10개만 추출해서 학습데이터로 사용해보려함

In [4]:
# Load only the three columns needed for captioning from the sample CSV.
df = pd.read_csv('wiki_small_data.csv', delimiter=',',
                 usecols=['language', 'image_url', 'caption_title_and_reference_description'])
# Work on English rows only for now; .copy() makes the filtered frame independent
# so the column assignment below does not raise SettingWithCopyWarning.
df = df[df['language'] == 'en'].copy()

# Raw string: '\[' inside a normal string literal is an invalid escape sequence.
p = re.compile(r'\[SEP\].+')


def extract_reference_description(text):
    """Return the text that follows '[SEP] ' in `text`.

    Returns None when `text` is not a string (e.g. a NaN cell), when it has no
    '[SEP]' marker followed by at least one character, or when nothing but a
    single space follows the marker. Rows mapped to None are dropped below.
    """
    if not isinstance(text, str):
        return None
    match = p.search(text)
    if match is None or match.group() in ['[SEP] ', '']:
        return None
    return match.group().replace('[SEP] ', '')


df['caption_title_and_reference_description'] = (
    df['caption_title_and_reference_description'].apply(extract_reference_description))
# Drop every row that has a missing value in any of the loaded columns.
df = df.dropna(axis=0)
print(df.shape)
df.head()

(863, 3)


Unnamed: 0,language,image_url,caption_title_and_reference_description
2,en,https://upload.wikimedia.org/wikipedia/commons...,Downtown Deer Park
5,en,https://upload.wikimedia.org/wikipedia/commons...,"Jürgen Ovens's Justitia, 1663-1665, Museumsber..."
11,en,https://upload.wikimedia.org/wikipedia/commons...,1956 MV Agusta 250 Raid
23,en,https://upload.wikimedia.org/wikipedia/commons...,Seth MacFarlane's logo
38,en,https://upload.wikimedia.org/wikipedia/commons...,Erskine River at Lorne


In [5]:
# Show the image URL of the first remaining row as a sanity check.
first_image_url = df['image_url'].iloc[0]
print(first_image_url)

https://upload.wikimedia.org/wikipedia/commons/2/28/Deer_Park_Wisconsin_Downtown_WIS46.jpg


In [6]:
import cv2
import numpy as np
from urllib import request

def url_to_image(url):
    '''
    Download the image at `url` and return it as a (512, 512, 3) RGB ndarray.

    Pixel values are divided by 255.0 before resizing, so the result is a
    floating-point array rather than uint8.

    Raises:
        ValueError: if the downloaded bytes cannot be decoded as an image.
        Any exception raised by urllib.request.urlopen when the request fails.
    '''
    # The context manager closes the HTTP response even if read() fails.
    with request.urlopen(url) as resp:
        raw = np.asarray(bytearray(resp.read()), dtype='uint8')
    image = cv2.imdecode(raw, cv2.IMREAD_COLOR)
    if image is None:
        # imdecode reports an undecodable buffer by returning None, which would
        # otherwise surface as an opaque TypeError on the division below.
        raise ValueError(f'could not decode image from {url}')
    # OpenCV decodes colour images as BGR; reorder the channels so the result
    # really is RGB as documented. Done on uint8, before the float conversion.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    return cv2.resize(image / 255.0, (512, 512))

In [7]:
X_train = []
y_train = []
# Only the first 10 rows are used for now because of RAM limits.
subset = df.iloc[:10]
for url, caption in zip(subset['image_url'],
                        subset['caption_title_and_reference_description']):
    try:
        image = url_to_image(url)
    except Exception as err:
        # Best-effort download: skip images that cannot be fetched or decoded,
        # but say which one and why instead of failing silently. Catching
        # Exception (not a bare except) lets KeyboardInterrupt stop the cell.
        print(f'skipping {url}: {err!r}')
        continue
    # Append image and caption together so the two lists stay aligned.
    X_train.append(image)
    y_train.append(caption)
print(len(X_train))
print(X_train[0].shape)
print(len(y_train))
print(y_train[0])

# PNG files may trigger libpng warnings; these appear to be safe to ignore.
# NOTE(review): the original note about "srv" files (presumably SVG) was left
# unfinished; such files likely cannot be decoded and are skipped - TODO confirm.



9
(512, 512, 3)
9
Downtown Deer Park


# 전처리

In [10]:
# y_train holds the caption sentences. The tokenizer turns each sentence into a
# sequence of word indices, and the sequences are then padded at the end to a
# common length (cap_vector).

top_k = 5000
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters='!"#$%&*+.-;?@[]^`{}~ ',
    oov_token="<unk>",
    num_words=top_k,
)
tokenizer.fit_on_texts(y_train)
train_seqs = tokenizer.texts_to_sequences(y_train)
cap_vector = tf.keras.preprocessing.sequence.pad_sequences(
    train_seqs,
    padding='post',
)
# Length of one padded caption, followed by an example row.
print(len(cap_vector[0]))
cap_vector[3]

19


array([24, 25, 26,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0], dtype=int32)

# 전역변수

In [14]:
# Training hyperparameters shared by the cells below.
BATCH_SIZE = 64
BUFFER_SIZE = 1000
embedding_dim = 256
units = 512
# One more than the tokenizer's num_words limit.
vocab_size = top_k + 1
# Ceiling division: with fewer samples than BATCH_SIZE (only 9 images were
# loaded above) floor division gives 0 steps, which breaks any per-step average.
# NOTE(review): assumes the final partial batch is kept when batching - confirm.
num_steps = (len(X_train) + BATCH_SIZE - 1) // BATCH_SIZE
features_shape = 2048
attention_features_shape = 64

# optimizer, loss func..

In [21]:
# Adam optimizer with its default settings.
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per element so that loss_function can
# mask individual positions before averaging; predictions are raw logits.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction='none',
    from_logits=True,
)
def loss_function(real, pred):
    """Cross-entropy between `real` and `pred` that ignores positions where real == 0.

    The per-element loss from `loss_object` is multiplied by a 0/1 mask that is
    0 wherever `real` equals 0, then averaged. The mean is taken over all
    elements, masked ones included.
    """
    per_element = loss_object(real, pred)
    keep = tf.math.not_equal(real, 0)
    per_element *= tf.cast(keep, dtype=per_element.dtype)
    return tf.reduce_mean(per_element)

# model 정의

In [16]:
class BahdanauAttention(tf.keras.Model):
  """Additive attention over image features, conditioned on a hidden state."""

  def __init__(self, units):
    '''
    W1, W2 and V are trainable Dense layers: W1 and W2 project to `units`
    dimensions, V reduces to a single score.
    '''
    super().__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, features, hidden):
    '''
    features : feature map of the image
               (presumably (batch, locations, embedding_dim) - TODO confirm)
    hidden   : hidden state; an axis is inserted at position 1 so it can be
               added to the projected features

    Returns (context_vector, attention_weights). The weights are a softmax
    over axis 1, and the context vector is the weighted sum of `features`
    over that same axis.
    '''
    query = tf.expand_dims(hidden, 1)
    energy = tf.nn.tanh(self.W1(features) + self.W2(query))
    attention_weights = tf.nn.softmax(self.V(energy), axis=1)
    context_vector = tf.reduce_sum(attention_weights * features, axis=1)
    return context_vector, attention_weights

In [17]:
class CNN_Encoder(tf.keras.Model):
    """Projects its input to `embedding_dim` with one Dense layer followed by ReLU.

    NOTE(review): the input is presumably a set of pre-extracted image
    features rather than raw pixels - confirm against the caller.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        self.fc = tf.keras.layers.Dense(embedding_dim)

    def call(self, x):
        # Linear projection, then ReLU.
        return tf.nn.relu(self.fc(x))

In [18]:
class RNN_Decoder(tf.keras.Model):
  """GRU decoder that attends over encoder features at each call."""

  def __init__(self, embedding_dim, units, vocab_size):
    super().__init__()
    self.units = units

    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(
        units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )
    self.fc1 = tf.keras.layers.Dense(units)
    self.fc2 = tf.keras.layers.Dense(vocab_size)

    self.attention = BahdanauAttention(units)

  def call(self, x, features, hidden):
    """Run the decoder on token ids `x`.

    Returns (output, state, attention_weights): the output of the final Dense
    layer (of size vocab_size), the GRU state, and the attention weights.

    NOTE(review): `hidden` is used only by the attention layer; the GRU is
    called without an initial state - confirm this is intended.
    """
    context_vector, attention_weights = self.attention(features, hidden)
    embedded = self.embedding(x)
    # Prepend the context vector to the embedding along the last axis.
    gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)
    output, state = self.gru(gru_input)
    projected = self.fc1(output)
    # Merge the leading axes so that each row is one prediction.
    flattened = tf.reshape(projected, (-1, projected.shape[2]))
    return self.fc2(flattened), state, attention_weights

  def reset_state(self, batch_size):
    """Return an all-zero hidden state of shape (batch_size, units)."""
    zero_state_shape = (batch_size, self.units)
    return tf.zeros(zero_state_shape)

# train step 정의

In [19]:
# Instantiate the encoder and the attention decoder from the shared hyperparameters.
encoder = CNN_Encoder(embedding_dim=embedding_dim)
decoder = RNN_Decoder(
    embedding_dim=embedding_dim,
    units=units,
    vocab_size=vocab_size,
)

Metal device set to: Apple M1


2021-11-28 21:27:42.025145: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-11-28 21:27:42.026399: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [None]:
@tf.function
def train_step(x, y):
    '''
    Custom training procedure for a single batch.

    Runs the forward pass and loss under a gradient tape, applies the
    gradients with `optimizer`, then feeds the batch results to the
    `train_loss` and `train_acc` metrics.

    NOTE(review): `model`, `train_loss` and `train_acc` are not defined in the
    part of the notebook visible to this review - confirm they exist before
    running this cell.
    '''
    with tf.GradientTape() as grad_tape:
        # 1. Forward pass (prediction)
        batch_predictions = model(x)
        # 2. Loss for this batch
        batch_loss = loss_function(y, batch_predictions)

    # 3. Gradients of the loss with respect to the trainable weights
    weights = model.trainable_variables
    grads = grad_tape.gradient(batch_loss, weights)

    # 4. Backpropagation - update the weights
    optimizer.apply_gradients(zip(grads, weights))

    # Update the running loss and accuracy metrics.
    train_loss(batch_loss)
    train_acc(y, batch_predictions)