In [1]:
# !pip install s3fs -q
# !pip install tensorflow_hub -q
# !pip install tensorflow_datasets -q
# !pip install pycocotools -q
# !pip install gin-config -q
# !pip install immutabledict -q
# !pip install sentencepiece -q
# !pip install transformers -q
# !pip install -r official/projects/movinet/requirements.txt -q

In [2]:
import configparser

config = configparser.ConfigParser()
config.read('/home/ec2-user/.aws/credentials')

['/home/ec2-user/.aws/credentials']

In [3]:
# AWS credentials and S3 settings
aws_access_key_id = config["default"]['aws_access_key_id']
aws_secret_access_key = config["default"]['aws_secret_access_key']
bucket_name = 'asl-capstone'
s3_URI = 's3://asl-capstone/'

In [17]:
import s3fs
fs = s3fs.S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key)

import boto3
s3 = boto3.client('s3',aws_access_key_id = aws_access_key_id, aws_secret_access_key = aws_secret_access_key,
                  region_name = 'us-west-2')


In [5]:
import tensorflow as tf
import tensorflow_hub as hub
import cv2
import numpy as np
import pandas as pd

2023-10-31 18:50:24.071960: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
%cd /home/ec2-user/models

/home/ec2-user/models


In [7]:
pwd

'/home/ec2-user/models'

In [8]:
model_version = 'a3'

if model_version=='a3':
  max_frames = 120
  image_dims = (256,256)
elif model_version=='a0':
  max_frames = 50
  image_dims = (172,172)
elif model_version=='a5':
  max_frames = 120
  image_dims = (320,302)

In [9]:
# Get the kinetics-600 action labels
KINETICS_URL = "official/projects/movinet/files/kinetics_600_labels.txt"
with open(KINETICS_URL) as obj:
  labels_600 = [line for line in obj.readlines()]
print("Found %d labels." % len(labels_600))

Found 600 labels.


In [10]:
# # Get the kinetics-400 action labels from the GitHub repository.
# from urllib import request  # requires python3
# KINETICS_URL = "https://raw.githubusercontent.com/deepmind/kinetics-i3d/master/data/label_map.txt"
# with request.urlopen(KINETICS_URL) as obj:
#   labels_400 = [line.decode("utf-8").strip() for line in obj.readlines()]
# print("Found %d labels." % len(labels_400))

In [11]:
from official.projects.movinet.modeling import movinet
from official.projects.movinet.modeling import movinet_model

# Create backbone and model.
backbone = movinet.Movinet(
    model_id=model_version, #change to correspond to model
    causal=False,
    use_external_states=False,
)
model = movinet_model.MovinetClassifier(
    backbone, num_classes=600, output_states=False) #change numclasses for dataset

# Create your example input here.
# Refer to the paper for recommended input shapes.
inputs = tf.ones([1, 120, image_dims[0], image_dims[1], 3]) #make sure input shape corresponds to correct model

# Build the model
model.build(inputs.shape)

2023-10-31 18:50:29.611929: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-31 18:50:29.633334: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-31 18:50:29.635267: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [12]:
# load pretrained checkpoint
checkpoint_dir = f'movinet_{model_version}_base'
checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
checkpoint = tf.train.Checkpoint(model=model)
status = checkpoint.restore(checkpoint_path)
status.assert_existing_objects_matched()

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7ff7cd9fffd0>

In [63]:
def pad_video_frames(video, n_frames=120):
  if video.shape[0] < n_frames:
    video = np.concatenate([video,np.zeros((n_frames - video.shape[0], 256, 256, 3))])
    video = np.expand_dims(video, axis=0)
    video_tf = tf.convert_to_tensor(video)
    return video_tf
  elif video.shape[0] > n_frames:
    video = video[np.round(np.linspace(0, len(video)-1, num=n_frames)).astype(int)]
    video = np.expand_dims(video, axis=0)
    video_tf = tf.convert_to_tensor(video)
    return video_tf
  else:
    video = np.expand_dims(video, axis=0)
    return tf.convert_to_tensor(video)


In [66]:
# fs.s3_dir_ls("s3://asl-capstone/youtube-asl/test_sample/numpy_files/RGB/")
# fs.ls("s3://asl-capstone/youtube-asl/test_sample/numpy_files/RGB/")

# List files in the S3 bucket
response = s3.list_objects_v2(Bucket=bucket_name, Prefix="youtube-asl/test_sample/numpy_files/RGB/")
video_files = []
caption_files = []
for content in response.get('Contents', []):
    if "_cap.npy" in content['Key']:
        caption_files.append(content['Key'])
    else:
        video_files.append(content['Key'])

print(len(video_files))
print(len(caption_files))
video_files[:3]

63
63


['youtube-asl/test_sample/numpy_files/RGB/06kKvQp4SfM_0.npy',
 'youtube-asl/test_sample/numpy_files/RGB/06kKvQp4SfM_1.npy',
 'youtube-asl/test_sample/numpy_files/RGB/06kKvQp4SfM_10.npy']

In [67]:
vids = pd.DataFrame(video_files,columns=['video'])
video_files = pd.DataFrame(caption_files, columns=['caption'])
video_files['video'] = video_files.caption.str.replace('_cap.','.')

video_files = video_files.merge(vids, on='video')
video_files.shape

(63, 2)

In [68]:
videos = np.empty((1,120,256,256,3))
captions = np.empty((1))

for _, row in video_files.iterrows():
    print(row['video'])
    with fs.open(f"{s3_URI + row['video']}", 'rb') as f:
        video = np.load(f)
    with fs.open(s3_URI + row['caption'], 'rb') as f:
        caption = np.load(f)
        caption = np.expand_dims(caption, axis=0)

    # Pad with additional frames
    video = pad_video_frames(video, n_frames=120)
    videos = tf.concat([videos, video], axis=0)
    captions = np.concatenate([captions, caption], axis=0)
    

youtube-asl/test_sample/numpy_files/RGB/06kKvQp4SfM_0.npy
youtube-asl/test_sample/numpy_files/RGB/06kKvQp4SfM_10.npy
youtube-asl/test_sample/numpy_files/RGB/06kKvQp4SfM_11.npy
youtube-asl/test_sample/numpy_files/RGB/06kKvQp4SfM_12.npy
youtube-asl/test_sample/numpy_files/RGB/06kKvQp4SfM_13.npy
youtube-asl/test_sample/numpy_files/RGB/06kKvQp4SfM_14.npy
youtube-asl/test_sample/numpy_files/RGB/06kKvQp4SfM_15.npy
youtube-asl/test_sample/numpy_files/RGB/06kKvQp4SfM_16.npy
youtube-asl/test_sample/numpy_files/RGB/06kKvQp4SfM_17.npy
youtube-asl/test_sample/numpy_files/RGB/06kKvQp4SfM_18.npy
youtube-asl/test_sample/numpy_files/RGB/06kKvQp4SfM_19.npy
youtube-asl/test_sample/numpy_files/RGB/06kKvQp4SfM_1.npy
youtube-asl/test_sample/numpy_files/RGB/06kKvQp4SfM_20.npy
youtube-asl/test_sample/numpy_files/RGB/06kKvQp4SfM_21.npy
youtube-asl/test_sample/numpy_files/RGB/06kKvQp4SfM_22.npy
youtube-asl/test_sample/numpy_files/RGB/06kKvQp4SfM_23.npy
youtube-asl/test_sample/numpy_files/RGB/06kKvQp4SfM_24.npy

2023-10-31 20:11:49.374649: W tensorflow/tsl/framework/bfc_allocator.cc:485] Allocator (GPU_0_bfc) ran out of memory trying to allocate 5.45GiB (rounded to 5851054080)requested by op ConcatV2
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2023-10-31 20:11:49.374691: I tensorflow/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for GPU_0_bfc
2023-10-31 20:11:49.374700: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 173, Chunks in use: 172. 43.2KiB allocated for chunks. 43.0KiB in use in bin. 17.6KiB client-requested in use in bin.
2023-10-31 20:11:49.374707: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (512): 	Total Chunks: 184, Chunks in use: 183. 117.8KiB allocated for chunks. 117.2KiB in use in bin. 91.3KiB client-requested in use in bin.
2023-10-31 20:11:49.374713: I tensorflow/tsl/f

ResourceExhaustedError: {{function_node __wrapped__ConcatV2_N_2_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[31,120,256,256,3] and type double on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:ConcatV2] name: concat

In [69]:
videos.shape

TensorShape([30, 120, 256, 256, 3])

In [70]:
captions.shape

(30,)

In [71]:
# Run the model prediction.
output = model(videos)
prediction = tf.argmax(output, -1)

print(labels_600[prediction.numpy()[0]])
print(labels_600[prediction.numpy()[1]])

2023-10-31 20:20:41.522276: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x2b2843b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-10-31 20:20:41.522311: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA A10G, Compute Capability 8.6
2023-10-31 20:23:00.189429: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8903
2023-10-31 20:24:56.865300: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.53GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2023-10-31 20:24:56.866410: W tensorflow/core/framework/op_kernel.cc:1828] OP_REQUIRES failed at xla_ops.cc:503 : UNKNOWN: Failed to determine best cudnn convolution algorithm for:
%cudnn-conv.2 = (f32[30,16,120,128,128]{4,3,2,1,0}, u8[0]{0}) cust

UnknownError: Exception encountered when calling layer 'conv3d' (type Conv3D).

Failed to determine best cudnn convolution algorithm for:
%cudnn-conv.2 = (f32[30,16,120,128,128]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[30,3,120,257,257]{4,3,2,1,0} %transpose, f32[16,3,1,3,3]{4,3,2,1,0} %transpose.1), window={size=1x3x3 stride=1x2x2}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convForward", metadata={op_type="Conv3D" op_name="Conv3D" source_file="/opt/tensorflow/lib/python3.10/site-packages/keras/src/layers/convolutional/base_conv.py" source_line=262}, backend_config="{\"conv_result_scale\":1,\"activation_mode\":\"0\",\"side_input_scale\":0}"

Original error: RESOURCE_EXHAUSTED: Out of memory while trying to allocate 3791650816 bytes.

To ignore this failure and try to use a fallback algorithm (which may have suboptimal performance), use XLA_FLAGS=--xla_gpu_strict_conv_algorithm_picker=false.  Please also file a bug for the root cause of failing autotuning. [Op:__inference_call_21502]

Call arguments received by layer 'conv3d' (type Conv3D):
  • inputs=tf.Tensor(shape=(30, 120, 256, 256, 3), dtype=float32)

In [25]:
new_model = model.backbone

In [26]:
movinet_output = new_model(input_frames)


In [27]:
print(movinet_output[0]['block4_layer3'].shape)
print(movinet_output[1]['state_block4_layer3_pool_buffer'].shape)
print(movinet_output[1]['state_block4_layer3_pool_frame_count'].shape)
vid_embedding = movinet_output[0]['block4_layer3']

(2, 120, 8, 8, 168)
(2, 1, 1, 1, 560)
(1,)


In [28]:
# tf.keras.layers.GlobalAveragePooling3D(keepdims=True)(vid_embedding)
# tf.keras.layers.Flatten()(vid_embedding)

vid_embedding_flatten = tf.keras.layers.Reshape((vid_embedding.get_shape()[1], 8 * 8 * 168), name='flatten')(vid_embedding)

vid_embedding_flatten = tf.keras.layers.Dense(1024, activation='linear', name='hidden_layer')(vid_embedding_flatten)

# vid_embedding_flatten = tf.keras.layers.Normalization(axis=2)(vid_embedding_flatten)

print(vid_embedding_flatten.shape)
vid_embedding_flatten

(2, 120, 1024)


<tf.Tensor: shape=(2, 120, 1024), dtype=float32, numpy=
array([[[ 18.831732  ,   7.0215735 ,   4.0451646 , ..., -15.802565  ,
          13.811161  ,  -1.8661177 ],
        [ 26.184258  ,  -3.1906955 ,  11.112967  , ...,  -4.7791815 ,
           2.924333  ,  -4.6001205 ],
        [ 23.950943  ,  -3.2421694 ,   5.1704926 , ...,  -3.6985006 ,
           2.6690412 ,  -3.1391838 ],
        ...,
        [  0.06778887,   8.352831  ,   6.460089  , ...,  -0.09141731,
           8.335562  ,  -1.9756447 ],
        [ -3.0825946 ,   2.545278  ,   5.3816032 , ...,   0.17695141,
           9.406091  ,  -2.2555678 ],
        [ -0.72194326,   2.9288397 ,   0.589802  , ...,  -0.9653435 ,
           8.998444  ,  -2.1913207 ]],

       [[ 12.510971  ,   3.0881128 ,  -7.0710526 , ..., -31.937641  ,
           8.6349535 ,  -9.268247  ],
        [  2.7310724 ,   4.8743258 ,   5.0151854 , ..., -19.641161  ,
          -1.5725746 ,  -7.6036153 ],
        [ -5.8343563 ,  11.042307  ,   8.24586   , ..., -22.13706

inputs = [batch, frames, height, width, channels]

vid_embeddings = model_backbone(inputs)[0]['block4_layer3']

hidden = tf.keras.layer(vid_embeddings)

dropout

transformer

decoder

In [29]:
# Load model directly
from transformers import AutoTokenizer, TFT5ForConditionalGeneration, T5EncoderModel

tokenizer = AutoTokenizer.from_pretrained("google/t5-v1_1-large")
model = TFT5ForConditionalGeneration.from_pretrained("google/t5-v1_1-large")
encoder_model = T5EncoderModel.from_pretrained("google/t5-v1_1-large")

# tokenizer = AutoTokenizer.from_pretrained("t5-large")
# model = TFT5ForConditionalGeneration.from_pretrained("t5-large")
# encoder_model = T5EncoderModel.from_pretrained("t5-large")



Downloading (…)okenizer_config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/607 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Downloading tf_model.h5:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at google/t5-v1_1-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

In [30]:
# training
inputs = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="tf").input_ids
labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="tf").input_ids
outputs = model(inputs, labels=labels)
loss = outputs.loss
logits = outputs.logits
print(loss)
print(logits)

# inference
inputs = tokenizer(
   [ "studies have shown that owning a cat is good for you", "i have always wanted to have a pet"], return_tensors="pt", padding=True
).input_ids  # Batch size 1

# Convert from tokens to embeddings
encoder_outputs = encoder_model(input_ids=inputs)
encoder_outputs['last_hidden_state'] = encoder_outputs['last_hidden_state'].detach().numpy()
print(encoder_outputs['last_hidden_state'].shape)

# Generate text from embeddings
outputs = model.generate(encoder_outputs=encoder_outputs)
# outputs = model.generate(inputs)


print("Response: \n")
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
# studies have shown that owning a dog is good for you

tf.Tensor([9.368338], shape=(1,), dtype=float32)
tf.Tensor(
[[[-62.19115   -32.45819   -32.661156  ... -62.31196   -58.607594
   -63.145653 ]
  [-30.042046  -15.550552   -5.4085193 ... -29.970148  -29.662457
   -30.94093  ]
  [-34.05693   -20.780964  -16.395683  ... -34.25679   -33.479412
   -35.364624 ]
  ...
  [-64.51187   -38.57645   -32.98413   ... -65.04996   -62.914642
   -66.507034 ]
  [-36.099792  -17.778969   -6.017308  ... -36.316704  -35.697544
   -37.58899  ]
  [-33.98794   -18.195509   -5.7814636 ... -33.936363  -33.53041
   -35.222206 ]]], shape=(1, 9, 32128), dtype=float32)
(2, 14, 1024)




Response: 

['. Whether you own a cat or not, owning a cat is good', '.. i have always wanted to have a pet. there is.']


In [38]:
# training
input_mask = np.ones((2,100,1024))
encoder_outputs['last_hidden_state'] = vid_embedding_flatten
labels = tokenizer(input_labels, return_tensors="tf", padding=True).input_ids
outputs = model(input_ids=None, encoder_outputs=encoder_outputs, labels=labels)
# loss = outputs.loss
# logits = outputs.logits
# print(loss)
# print(logits)

tf.Tensor([86.131775], shape=(1,), dtype=float32)
tf.Tensor(
[[[ -5.3645763    3.5970078    7.7293634  ...  -7.8763943   -6.8990536
    -5.5075016 ]
  [ 14.984283    19.030575    38.70169    ...  14.579574    12.502595
    16.92012   ]
  [-17.103842    15.907584    10.001471   ... -16.703442   -17.037233
   -16.967785  ]
  ...
  [ -5.8787007   11.533233    15.281948   ...  -6.087396    -8.442596
    -6.816762  ]
  [ -1.903471    15.419356     5.2927923  ...  -4.3387985   -5.1055126
    -4.126581  ]
  [  1.2768021    5.332037     5.120802   ...  -0.39638042   0.46579933
     0.45233345]]

 [[-33.087738     6.5069637   22.882313   ... -30.351261   -32.48015
   -34.44802   ]
  [ 17.386831    21.30508     53.662704   ...  18.88051     17.317268
    17.776508  ]
  [-87.63423    -38.711914   -23.768002   ... -84.98605    -84.22067
   -87.87905   ]
  ...
  [-30.30663    -11.790724    21.064756   ... -26.022137   -28.894558
   -30.314651  ]
  [  1.2640057   18.883102    41.192955   ...   4.690

In [40]:
# model.generate(encoder_outputs=encoder_outputs)
tokenizer.decode(model.generate(encoder_outputs=encoder_outputs)[0], skip_special_tokens=True)

'<pad> stabili bewegt chiarCHE advert bewegt colour poresänder Vertreter bewegt stabili Schönheit Schönheitégal colours bewegt colours bewegt'

In [None]:
def movinet_T5_model(video_base_model, language_base_model,
                      max_sequence_length=120,
                      hidden_size = 1024,
                      dropout=0.1,
                      learning_rate=0.00005):
    """
    Builds a translation model that accepts videos, creates video embeddings using a MoViNet model,
    and then passes the video embeddings into an encoder-decoder to generate text translation
    """

    # Set all models to trainable
    video_base_model.trainable = True
    language_base_model.trainable = True

    # Set input structure for video inputs
    video_inputs = tf.keras.layers.Input(shape=(max_sequence_length, 256, 256, 3), dtype=tf.int64, name='input_layer')
    # token_type_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='attention_mask_layer')

    # Get output embeddings from video model
    movinet_out = video_base_model(video_inputs)
    vid_embedding = movinet_out[0]['block4_layer3']

    # Layer to flatten w x h x c dimensions
    vid_embedding_flatten = tf.keras.layers.Reshape((120, 8 * 8 * 168), name='flatten')(vid_embedding)

    # Hidden layer to force embeddings into correct shape for language model
    vid_embedding_flatten = tf.keras.layers.Dense(1024, activation='linear', name='hidden_layer')(vid_embedding_flatten)

    # # Pass embeddings into T5 language model
    # language_output = language_base_model.generate(encoder_outputs=encoder_outputs, attention_mask=attention_mask)

    # classification = tf.keras.layers.Dense(1, activation='sigmoid',name='classification_layer')(hidden)

    # classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])

    # classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    #                              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    #                              metrics='accuracy')



    ### END YOUR CODE

    model = tf.keras.Model(inputs=inputs, outputs=vid_embedding_flatten)

    return model