In [1]:
# !pip install s3fs -q
# !pip install tensorflow_hub -q
# !pip install tensorflow_datasets -q
# !pip install pycocotools -q
# !pip install gin-config -q
# !pip install immutabledict -q
# !pip install sentencepiece -q
# !pip install transformers -q
# !pip install -r official/projects/movinet/requirements.txt -q

In [2]:
import configparser

config = configparser.ConfigParser()
config.read('/home/ec2-user/.aws/credentials')

['/home/ec2-user/.aws/credentials']

In [3]:
# AWS credentials and S3 settings
aws_access_key_id = config["default"]['aws_access_key_id']
aws_secret_access_key = config["default"]['aws_secret_access_key']
bucket_name = 'asl-capstone'
s3_URI = 's3://asl-capstone/'

In [4]:
import s3fs
fs = s3fs.S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key)

import boto3
s3 = boto3.client('s3',aws_access_key_id = aws_access_key_id, aws_secret_access_key = aws_secret_access_key,
                  region_name = 'us-west-2')


In [5]:
import tensorflow as tf
import tensorflow_hub as hub
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

tf.keras.backend.clear_session()

2023-10-31 23:19:22.666984: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
%cd /home/ec2-user/models

/home/ec2-user/models


In [7]:
pwd

'/home/ec2-user/models'

In [8]:
model_version = 'a3'

if model_version=='a3':
  max_frames = 120
  image_dims = (256,256)
elif model_version=='a0':
  max_frames = 50
  image_dims = (172,172)
elif model_version=='a5':
  max_frames = 120
  image_dims = (320,302)

In [9]:
# Get the kinetics-600 action labels
KINETICS_URL = "official/projects/movinet/files/kinetics_600_labels.txt"
with open(KINETICS_URL) as obj:
  labels_600 = [line for line in obj.readlines()]
print("Found %d labels." % len(labels_600))

Found 600 labels.


In [10]:
# # Get the kinetics-400 action labels from the GitHub repository.
# from urllib import request  # requires python3
# KINETICS_URL = "https://raw.githubusercontent.com/deepmind/kinetics-i3d/master/data/label_map.txt"
# with request.urlopen(KINETICS_URL) as obj:
#   labels_400 = [line.decode("utf-8").strip() for line in obj.readlines()]
# print("Found %d labels." % len(labels_400))

In [11]:
from official.projects.movinet.modeling import movinet
from official.projects.movinet.modeling import movinet_model

with tf.device("/GPU:0"):
    # Create backbone and model.
    backbone = movinet.Movinet(
        model_id=model_version, #change to correspond to model
        causal=False,
        use_external_states=False,
    )
    model = movinet_model.MovinetClassifier(
        backbone, num_classes=600, output_states=False) #change numclasses for dataset
    
    # Create your example input here.
    # Refer to the paper for recommended input shapes.
    inputs = tf.ones([1, 120, image_dims[0], image_dims[1], 3]) #make sure input shape corresponds to correct model
    
    # Build the model
    model.build(inputs.shape)

2023-10-31 23:19:25.260399: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-31 23:19:25.280525: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-31 23:19:25.281336: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [12]:
# load pretrained checkpoint
checkpoint_dir = f'movinet_{model_version}_base'
checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
checkpoint = tf.train.Checkpoint(model=model)
status = checkpoint.restore(checkpoint_path)
status.assert_existing_objects_matched()

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7f275fe33fd0>

In [13]:
def pad_video_frames(video, n_frames=120):
  if video.shape[0] < n_frames:
    video = np.concatenate([video,np.zeros((n_frames - video.shape[0], 256, 256, 3))])
    video_tf = tf.convert_to_tensor(video)
    return video_tf
  elif video.shape[0] > n_frames:
    video = video[np.round(np.linspace(0, len(video)-1, num=n_frames)).astype(int)]
    video_tf = tf.convert_to_tensor(video)
    return video_tf
  else:
    return tf.convert_to_tensor(video)


In [14]:
video_files = pd.read_parquet("s3://asl-capstone/youtube-asl/test_sample/numpy_files/RGB/masterfile.parquet")
video_files.head(10)

Unnamed: 0,caption,path
0,Welcome to the third round,s3://asl-capstone/youtube-asl/test_sample/nump...
1,Welcome to the third round,s3://asl-capstone/youtube-asl/test_sample/nump...
2,of the Pearls announcement.,s3://asl-capstone/youtube-asl/test_sample/nump...
3,Today's category is called the,s3://asl-capstone/youtube-asl/test_sample/nump...
4,Hidden Pearls. What is it?,s3://asl-capstone/youtube-asl/test_sample/nump...
5,"Hidden Pearls...in other words,",s3://asl-capstone/youtube-asl/test_sample/nump...
6,they are known as the dark horses.,s3://asl-capstone/youtube-asl/test_sample/nump...
7,"Every now and then, they are seldom recognized",s3://asl-capstone/youtube-asl/test_sample/nump...
8,for their work or performance.,s3://asl-capstone/youtube-asl/test_sample/nump...
9,"Additionally, they are behind a big leader",s3://asl-capstone/youtube-asl/test_sample/nump...


In [15]:
# def data_generator(file_paths_df, n_frames=120):
#     for _, row in file_paths_df.iterrows():
#         with fs.open(row['path'], 'rb') as f:
#             video = np.load(f)

#         video = pad_video_frames(video, n_frames=n_frames)
#         yield video, row['caption']


# # Create TensorFlow Dataset for model input. 
# train_dataset = tf.data.Dataset.from_generator(
#     generator=lambda: data_generator(video_files, n_frames=120),
#     output_signature=(
#         tf.TensorSpec(shape=(120, 256, 256, 3), dtype=tf.float32),
#         tf.TensorSpec(shape=(), dtype=tf.string)
#     )
# )

In [20]:
with tf.device("/GPU:0"):
    videos = np.empty((1,120,256,256,3))
    captions = []
    
    for _, row in tqdm(video_files.head(30).iterrows()):
        with fs.open(row['path'], 'rb') as f:
            video = np.load(f)
    
        video = pad_video_frames(video, n_frames=120)
        video = tf.expand_dims(video, axis=0)
        videos = tf.concat([videos, video], axis=0)
        captions += [row['caption']]
    
    videos = videos[1:,:,:,:,:]

18it [05:30, 19.37s/it]2023-10-31 23:37:57.663286: W tensorflow/tsl/framework/bfc_allocator.cc:485] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.52GiB (rounded to 3774873600)requested by op ConcatV2
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2023-10-31 23:37:57.663337: I tensorflow/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for GPU_0_bfc
2023-10-31 23:37:57.663346: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 173, Chunks in use: 172. 43.2KiB allocated for chunks. 43.0KiB in use in bin. 17.6KiB client-requested in use in bin.
2023-10-31 23:37:57.663353: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (512): 	Total Chunks: 184, Chunks in use: 183. 117.8KiB allocated for chunks. 117.2KiB in use in bin. 91.3KiB client-requested in use in bin.
2023-10-31 23:37:57.663

ResourceExhaustedError: {{function_node __wrapped__ConcatV2_N_2_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[20,120,256,256,3] and type double on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:ConcatV2] name: concat

In [22]:
print(videos.shape)
print(len(captions))

(18, 120, 256, 256, 3)
18


In [23]:
# Run the model prediction.
with tf.device("/GPU:0"):
    output = model(videos)
    prediction = tf.argmax(output, -1)
    
print(labels_600[prediction.numpy()[0]])
print(labels_600[prediction.numpy()[1]])

2023-10-31 23:38:33.413246: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.12GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2023-10-31 23:38:33.414138: W tensorflow/core/framework/op_kernel.cc:1828] OP_REQUIRES failed at xla_ops.cc:503 : UNKNOWN: Failed to determine best cudnn convolution algorithm for:
%cudnn-conv.2 = (f32[18,16,120,128,128]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[18,3,120,257,257]{4,3,2,1,0} %transpose, f32[16,3,1,3,3]{4,3,2,1,0} %transpose.1), window={size=1x3x3 stride=1x2x2}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convForward", metadata={op_type="Conv3D" op_name="Conv3D" source_file="/opt/tensorflow/lib/python3.10/site-packages/keras/src/layers/convolutional/base_conv.py" source_line=262}, backend_config="{\"conv_result_scale\":1,\"activation_mode\":\"0\",\"side_input_sca

UnknownError: Exception encountered when calling layer 'conv3d' (type Conv3D).

Failed to determine best cudnn convolution algorithm for:
%cudnn-conv.2 = (f32[18,16,120,128,128]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[18,3,120,257,257]{4,3,2,1,0} %transpose, f32[16,3,1,3,3]{4,3,2,1,0} %transpose.1), window={size=1x3x3 stride=1x2x2}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convForward", metadata={op_type="Conv3D" op_name="Conv3D" source_file="/opt/tensorflow/lib/python3.10/site-packages/keras/src/layers/convolutional/base_conv.py" source_line=262}, backend_config="{\"conv_result_scale\":1,\"activation_mode\":\"0\",\"side_input_scale\":0}"

Original error: RESOURCE_EXHAUSTED: Out of memory while trying to allocate 2281701376 bytes.

To ignore this failure and try to use a fallback algorithm (which may have suboptimal performance), use XLA_FLAGS=--xla_gpu_strict_conv_algorithm_picker=false.  Please also file a bug for the root cause of failing autotuning. [Op:__inference_call_21536]

Call arguments received by layer 'conv3d' (type Conv3D):
  • inputs=tf.Tensor(shape=(18, 120, 256, 256, 3), dtype=float32)

In [None]:
new_model = model.backbone

In [None]:
movinet_output = new_model(videos)


In [None]:
print(movinet_output[0]['block4_layer3'].shape)
print(movinet_output[1]['state_block4_layer3_pool_buffer'].shape)
print(movinet_output[1]['state_block4_layer3_pool_frame_count'].shape)
vid_embedding = movinet_output[0]['block4_layer3']

In [None]:
# tf.keras.layers.GlobalAveragePooling3D(keepdims=True)(vid_embedding)
# tf.keras.layers.Flatten()(vid_embedding)

vid_embedding_flatten = tf.keras.layers.Reshape((vid_embedding.get_shape()[1], 8 * 8 * 168), name='flatten')(vid_embedding)

vid_embedding_flatten = tf.keras.layers.Dense(1024, activation='linear', name='hidden_layer')(vid_embedding_flatten)

# vid_embedding_flatten = tf.keras.layers.Normalization(axis=2)(vid_embedding_flatten)

print(vid_embedding_flatten.shape)

inputs = [batch, frames, height, width, channels]

vid_embeddings = model_backbone(inputs)[0]['block4_layer3']

hidden = tf.keras.layer(vid_embeddings)

dropout

transformer

decoder

In [None]:
# Load model directly
from transformers import AutoTokenizer, TFT5ForConditionalGeneration, T5EncoderModel

with tf.device("/GPU:0"):
    tokenizer = AutoTokenizer.from_pretrained("google/t5-v1_1-large")
    model = TFT5ForConditionalGeneration.from_pretrained("google/t5-v1_1-large")
    # encoder_model = T5EncoderModel.from_pretrained("google/t5-v1_1-large")

# tokenizer = AutoTokenizer.from_pretrained("t5-large")
# model = TFT5ForConditionalGeneration.from_pretrained("t5-large")
# encoder_model = T5EncoderModel.from_pretrained("t5-large")



In [None]:
# # training
# inputs = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="tf").input_ids
# labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="tf").input_ids
# outputs = model(inputs, labels=labels)
# loss = outputs.loss
# logits = outputs.logits
# print(loss)
# print(logits)

# # inference
# inputs = tokenizer(
#    [ "studies have shown that owning a cat is good for you", "i have always wanted to have a pet"], return_tensors="pt", padding=True
# ).input_ids  # Batch size 1

# # Convert from tokens to embeddings
# encoder_outputs = encoder_model(input_ids=inputs)
# encoder_outputs['last_hidden_state'] = encoder_outputs['last_hidden_state'].detach().numpy()
# print(encoder_outputs['last_hidden_state'].shape)

# # Generate text from embeddings
# outputs = model.generate(encoder_outputs=encoder_outputs)
# # outputs = model.generate(inputs)


# print("Response: \n")
# print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
# # studies have shown that owning a dog is good for you

In [None]:
from transformers.modeling_tf_outputs import TFBaseModelOutputWithPastAndCrossAttentions

In [None]:
# training
input_mask = np.ones((2,100,1024))
encoder_outputs = TFBaseModelOutputWithPastAndCrossAttentions(last_hidden_state=vid_embedding_flatten)


labels = tokenizer(captions, return_tensors="tf", padding=True).input_ids
outputs = model(input_ids=None, encoder_outputs=encoder_outputs, labels=labels)
print(outputs.loss)
# loss = outputs.loss
# logits = outputs.logits
# print(loss)
# print(logits)

In [None]:
# model.generate(encoder_outputs=encoder_outputs)
tokenizer.decode(model.generate(encoder_outputs=encoder_outputs)[0], skip_special_tokens=True)

In [None]:
def movinet_T5_model(video_base_model, language_base_model,
                      max_sequence_length=120,
                      hidden_size = 1024,
                      dropout=0.1,
                      learning_rate=0.00005):
    """
    Builds a translation model that accepts videos, creates video embeddings using a MoViNet model,
    and then passes the video embeddings into an encoder-decoder to generate text translation
    """

    # Set all models to trainable
    video_base_model.trainable = True
    language_base_model.trainable = True

    # Set input structure for video inputs
    video_inputs = tf.keras.layers.Input(shape=(max_sequence_length, 256, 256, 3), dtype=tf.int64, name='input_layer')
    # token_type_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='attention_mask_layer')

    # Get output embeddings from video model
    movinet_out = video_base_model(video_inputs)
    vid_embedding = movinet_out[0]['block4_layer3']

    # Layer to flatten w x h x c dimensions
    vid_embedding_flatten = tf.keras.layers.Reshape((120, 8 * 8 * 168), name='flatten')(vid_embedding)

    # Hidden layer to force embeddings into correct shape for language model
    vid_embedding_flatten = tf.keras.layers.Dense(1024, activation='linear', name='hidden_layer')(vid_embedding_flatten)

    # # Pass embeddings into T5 language model
    # language_output = language_base_model.generate(encoder_outputs=encoder_outputs, attention_mask=attention_mask)

    # classification = tf.keras.layers.Dense(1, activation='sigmoid',name='classification_layer')(hidden)

    # classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])

    # classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    #                              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    #                              metrics='accuracy')



    ### END YOUR CODE

    model = tf.keras.Model(inputs=inputs, outputs=vid_embedding_flatten)

    return model