In [None]:
import glob
import json
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, LSTM, TimeDistributed
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

In [None]:
# Read the metadata tables for the main and the supplemental landmark sets
train_metadata = pd.read_csv('/kaggle/input/asl-fingerspelling/train.csv')
supplemental_metadata = pd.read_csv('/kaggle/input/asl-fingerspelling/supplemental_metadata.csv')

# Character -> prediction-index lookup stored next to the data
with open('/kaggle/input/asl-fingerspelling/character_to_prediction_index.json', 'r') as mapping_file:
    char_to_index = json.load(mapping_file)

# Reverse lookup: prediction index -> character
index_to_char = dict(zip(char_to_index.values(), char_to_index.keys()))

# Length (in characters) of the longest phrase across both metadata tables.
# This is only reported here; no padding happens in this cell.
longest_train_phrase = max(map(len, train_metadata['phrase']))
longest_supplemental_phrase = max(map(len, supplemental_metadata['phrase']))
max_sequence_len = max(longest_train_phrase, longest_supplemental_phrase)
print(max_sequence_len)

In [None]:
# Assemble the network one layer at a time.
# Input: one sample is a length-1630 vector with a single channel.
# With Keras' default 'valid' padding the convolution leaves 1630 - 32 + 1 = 1599
# steps, and pooling by 32 reduces that to 49 steps, so the model emits 49
# per-step class distributions.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Conv1D(filters=1024, kernel_size=32, activation='relu', input_shape=(1630, 1)))
model.add(tf.keras.layers.MaxPooling1D(pool_size=32))
model.add(tf.keras.layers.LSTM(units=128, return_sequences=True))
# One softmax over the character vocabulary at every remaining time step
model.add(
    tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(units=len(char_to_index), activation='softmax')
    )
)

# One-hot targets are expected by this loss
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:
def preprocess_and_train(model, filepath, metadata, sequence_len=49, n_features=1630,
                         rounds=10, epochs_per_round=10):
    """Fit ``model`` on the landmark rows of a single parquet file.

    The landmark table is joined with ``metadata`` on ``sequence_id``, rows
    containing any NaN are discarded, each phrase is encoded through the
    module-level ``char_to_index`` mapping and padded to ``sequence_len``,
    and the model is trained on an 80/20 train/validation split of the rows.

    Args:
        model: Compiled Keras model; it is trained in place.
        filepath: Path of the landmark parquet file to read.
        metadata: DataFrame with at least ``sequence_id`` and ``phrase`` columns.
        sequence_len: Length every encoded phrase is padded/truncated to. It has
            to match the number of time steps the model outputs (49 for the
            Conv1D/MaxPooling1D stack defined in this notebook).
        n_features: Number of feature columns expected per row; must match the
            model's input length.
        rounds: How many fit/evaluate rounds to run.
        epochs_per_round: Epochs passed to ``model.fit`` in each round.

    Returns:
        None. Progress and validation metrics are printed.

    Raises:
        ValueError: If the number of feature columns differs from ``n_features``.
        KeyError: If a phrase contains a character missing from ``char_to_index``.
    """
    # Load landmarks
    landmarks = pd.read_parquet(filepath)

    # Merge with metadata; the raw landmark frame is not needed afterwards, so
    # release it now instead of keeping it alive for the whole training run.
    merged = landmarks.merge(metadata, on=["sequence_id"])
    del landmarks
    print('  Finish Merge')

    # Drop NaN
    # NOTE(review): this removes every row with at least one missing value --
    # confirm that enough rows survive for the landmark files in use.
    merged = merged.dropna()
    print('  Finish Drop NaN')

    # train_test_split cannot build a non-empty train AND validation set from
    # fewer than two rows; skip the file instead of aborting the whole run.
    if len(merged) < 2:
        print(f'  Skip {filepath}: only {len(merged)} usable row(s) after dropping NaN')
        return

    # Convert each phrase to a sequence of integers
    y = [[char_to_index[char] for char in phrase] for phrase in merged['phrase']]

    # Everything that is not a label/identifier column is treated as a feature.
    # NOTE(review): any other non-landmark column that survives the drop is fed
    # to the model as a feature as well -- confirm against the parquet schema.
    x = merged.drop(columns=['phrase', 'participant_id', 'sequence_id', 'path', 'file_id'], errors='ignore')
    del merged  # features and labels are extracted; free the joined frame
    print('  Finish Label Getting')

    # A wrong column count would either fail inside reshape with an opaque
    # message or, worse, silently mix values of different rows.
    if x.shape[1] != n_features:
        raise ValueError(
            f'{filepath}: expected {n_features} feature columns, found {x.shape[1]}'
        )

    # Pad sequences for uniform input length. Truncate at the end as well so
    # that position i of the target always corresponds to character i of the
    # phrase (the default truncating='pre' would cut the start instead).
    # NOTE(review): the pad value is 0 -- confirm that index 0 is not also a
    # real character in char_to_index.
    y = pad_sequences(y, maxlen=sequence_len, padding='post', truncating='post')

    # Split into train and validation set
    # NOTE(review): the split is per row; if several rows share a sequence_id
    # they can end up on both sides -- confirm this is acceptable.
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)

    # Transform DataFrames into a suitable input format for the model
    x_train = x_train.values.reshape((-1, n_features, 1))
    x_val = x_val.values.reshape((-1, n_features, 1))

    # Convert y to one-hot encoding
    num_classes = len(char_to_index)
    y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
    y_val = tf.keras.utils.to_categorical(y_val, num_classes=num_classes)
    print('  Finish one-hot encoding')

    # Train the model, reporting validation metrics after every round
    for _ in range(rounds):
        model.fit(x_train, y_train, epochs=epochs_per_round, validation_data=(x_val, y_val), verbose=0)
        loss, acc = model.evaluate(x_val, y_val, verbose=2)
        print(f'  loss: {loss}, acc: {acc}')
    print('  Finish Training')

In [None]:
# Train on every landmark file: the main training set first, then the
# supplemental set, each paired with its own metadata table.
# glob.glob() returns matches in arbitrary order; since the model is trained
# incrementally file by file, the paths are sorted so that the training order
# is the same on every run.
landmark_sources = [
    ('/kaggle/input/asl-fingerspelling/train_landmarks/*.parquet', train_metadata),
    ('/kaggle/input/asl-fingerspelling/supplemental_landmarks/*.parquet', supplemental_metadata),
]

for landmark_pattern, source_metadata in landmark_sources:
    for filepath in sorted(glob.glob(landmark_pattern)):
        print(f'Start {filepath} file')
        preprocess_and_train(model, filepath, source_metadata)

In [None]:
# Build a TFLite converter straight from the in-memory Keras model
converter = tf.lite.TFLiteConverter.from_keras_model(model)

# Allow the converted graph to use the built-in TFLite ops and, where needed,
# TensorFlow ops (SELECT_TF_OPS)
allowed_op_sets = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]
converter.target_spec.supported_ops = allowed_op_sets

# Turn off lowering of tensor-list ops; must be set before convert() is called
converter._experimental_lower_tensor_list_ops = False

# Run the conversion and persist the serialized model
tflite_model = converter.convert()
with open('model.tflite', 'wb') as model_file:
    model_file.write(tflite_model)

In [None]:
# Shell escape: add ./model.tflite (written by the conversion cell) to submission.zip
!zip submission.zip  './model.tflite'