In [1]:
import torch
import pickle
from transformers import MCTCForCTC, MCTCConfig

# Build a randomly initialised model from the default config and round-trip it
# through disk to obtain a plain state dict with the target parameter names
# and shapes.
config = MCTCConfig()

model = MCTCForCTC(config)
model.save_pretrained(save_directory="./")
# presumably the file written by save_pretrained just above - TODO confirm for
# the installed transformers version
model_dict = torch.load("pytorch_model.bin")

# Source weights to port, keyed by their original parameter names.
# NOTE(review): pickle can execute arbitrary code on load; only use this with a
# weights file you produced yourself or otherwise trust.
with open("./weights_dict.pkl", "rb") as weights_file:
    weights_dict = pickle.load(weights_file)

# Last "_"-separated field of a `Conv2D_*` source key -> name of the matching
# parameter in the model's input layer norm / first convolution.
conv_mapper = {
    "normw": "mctc.encoder.layer_norm.singleton_weight",
    "normb": "mctc.encoder.layer_norm.singleton_bias",
    "filter": "mctc.encoder.conv.conv_layers.0.weight",
    "bias": "mctc.encoder.conv.conv_layers.0.bias",
}

# Third "_"-separated field of a `TRF_<layer>_<name>...` source key -> parameter
# path relative to `mctc.encoder.layers.<layer>.`.
mapper = {
    "pos": "attention.self.distance_embedding.weight",
    "w1": "intermediate.dense.weight",
    "wq": "attention.self.query.weight",
    "wk": "attention.self.key.weight",
    "wv": "attention.self.value.weight",
    "wf": "attention.output.dense.weight",
    "w2": "output.dense.weight",
    "norm1w": "attention.output.LayerNorm.weight",
    "norm1b": "attention.output.LayerNorm.bias",
    "norm2w": "output.LayerNorm.weight",
    "norm2b": "output.LayerNorm.bias",
}

# Complete source key of a CTC-head parameter -> model parameter name.
ctc_mapper = {
    "_CTC_head_w": "ctc_head.weight",
    "_CTC_head_b": "ctc_head.bias",
}


def fl_key_to_model_key(idx, fl_key):
    """Translate a source-checkpoint key into the model's state-dict key.

    The lookup table is selected purely by position, so the result is only
    meaningful when ``idx`` is the position of ``fl_key`` in the source
    checkpoint's key order.

    Args:
        idx: Position of the key. ``idx < 4`` selects the convolution table,
            ``4 <= idx < 400`` the per-layer transformer table, anything else
            the CTC-head table. (The 400 cut-off presumably corresponds to
            4 + 36 layers x 11 parameters - TODO confirm.)
        fl_key: Source key, e.g. ``"Conv2D_bias"``, ``"TRF_0_pos_emb"`` or
            ``"_CTC_head_w"``.

    Returns:
        The corresponding model state-dict key.

    Raises:
        KeyError: If the relevant field of ``fl_key`` is not in the selected
            table.
        IndexError: If a transformer key has fewer than three "_"-separated
            fields.
    """
    if idx < 4:
        # Only the suffix after the final underscore identifies the parameter.
        return conv_mapper[fl_key.rsplit("_", 1)[-1]]

    if idx < 400:
        fields = fl_key.split("_")
        layer_no = fields[1]
        param_path = mapper[fields[2]]
        return f"mctc.encoder.layers.{layer_no}.{param_path}"

    return ctc_mapper[fl_key]

def af_fix_then_tensor(param_name, af_array, conv_shape=(3072, 80, 7), hidden_size=1536):
    """Convert one source-checkpoint array into a tensor laid out for the model.

    Most parameters are passed through unchanged; three cases need a layout fix:

    * ``"Conv2D_filter"``: the source array is 4-D with a singleton axis 1,
      e.g. ``(7, 1, 80, 3072)``. That axis is dropped and the first and last
      axes are swapped, giving ``conv_shape`` (default ``(3072, 80, 7)``).
    * ``"Conv2D_bias"``: e.g. ``(1, 1, 3072)`` is flattened to
      ``(conv_shape[0],)``.
    * Any name containing ``"norm"`` but not ``"Conv2D"`` (the per-layer
      ``norm1w``/``norm1b``/``norm2w``/``norm2b`` entries): the source holds a
      single value of shape ``(1,)``, which is repeated to ``(hidden_size,)``.
      The ``Conv2D_norm*`` entries are deliberately left at shape ``(1,)``.

    Args:
        param_name: Key of the parameter in the source checkpoint.
        af_array: Array-like holding the source values (anything accepted by
            ``torch.Tensor(...)``).
        conv_shape: Expected ``(out_channels, in_channels, kernel)`` shape of
            the converted convolution filter.
        hidden_size: Length of the per-layer LayerNorm weight/bias vectors.

    Returns:
        A ``torch.Tensor`` with the layout described above.

    Raises:
        AssertionError: If a converted tensor does not have the expected shape.
        RuntimeError: If the bias cannot be reshaped to ``(conv_shape[0],)``.
    """
    tensor = torch.Tensor(af_array)

    if param_name == "Conv2D_filter":
        # (kernel, 1, in, out) -> (kernel, in, out) -> (out, in, kernel)
        tensor = tensor.squeeze(1).transpose(0, 2)
        assert tensor.shape == tuple(conv_shape)
    elif param_name == "Conv2D_bias":
        # (1, 1, out) -> (out,)
        tensor = tensor.reshape(conv_shape[0])
        assert tensor.shape == (conv_shape[0],)
    elif "norm" in param_name and "Conv2D" not in param_name:
        # (1,) -> (hidden_size,): broadcast the single stored value to a full vector
        tensor = tensor.tile((hidden_size,))
        assert tensor.shape == (hidden_size,)

    return tensor
    


In [2]:
# Pass 1 - dry run: map every source key, convert its array and check that the
# result has the shape the freshly initialised model expects. Nothing in
# model_dict is modified here.
for position, (source_key, source_array) in enumerate(weights_dict.items()):
    if "LID" in source_key:
        # Stop at the first key containing "LID"; it and everything after it is skipped.
        break

    target_key = fl_key_to_model_key(position, source_key)
    print(source_key, "->", target_key)

    expected = model_dict[target_key]
    converted = af_fix_then_tensor(source_key, source_array)
    assert converted.shape == expected.shape
    print("ArrayFire numpy:         ", source_array.shape)
    print("ArrayFire Tensor(fixed): ", converted.shape)
    print("Model Tensor:            ", expected.shape)
    print()

# Pass 2 - overwrite the model's state dict with the converted source weights.
# The key list is snapshotted first so coverage can be verified afterwards.
model_dict_keys = list(model_dict)
mapped_keys = []
for position, (source_key, source_array) in enumerate(weights_dict.items()):
    if "LID" in source_key:
        break

    target_key = fl_key_to_model_key(position, source_key)
    model_dict[target_key] = af_fix_then_tensor(source_key, source_array)
    mapped_keys.append(target_key)

# Same count and same set: every model key was written exactly once and no
# unknown key was introduced.
assert len(mapped_keys) == len(model_dict_keys)
assert set(mapped_keys) == set(model_dict_keys)
torch.save(model_dict, "ported_pytorch_model.bin")

Conv2D_normw -> mctc.encoder.layer_norm.singleton_weight
ArrayFire numpy:          (1,)
ArrayFire Tensor(fixed):  torch.Size([1])
Model Tensor:             torch.Size([1])

Conv2D_normb -> mctc.encoder.layer_norm.singleton_bias
ArrayFire numpy:          (1,)
ArrayFire Tensor(fixed):  torch.Size([1])
Model Tensor:             torch.Size([1])

Conv2D_filter -> mctc.encoder.conv.conv_layers.0.weight
ArrayFire numpy:          (7, 1, 80, 3072)
ArrayFire Tensor(fixed):  torch.Size([3072, 80, 7])
Model Tensor:             torch.Size([3072, 80, 7])

Conv2D_bias -> mctc.encoder.conv.conv_layers.0.bias
ArrayFire numpy:          (1, 1, 3072)
ArrayFire Tensor(fixed):  torch.Size([3072])
Model Tensor:             torch.Size([3072])

TRF_0_pos_emb -> mctc.encoder.layers.0.attention.self.distance_embedding.weight
ArrayFire numpy:          (1839, 384)
ArrayFire Tensor(fixed):  torch.Size([1839, 384])
Model Tensor:             torch.Size([1839, 384])

TRF_0_w1_ -> mctc.encoder.layers.0.intermediate.den

In [9]:
import arrayfire as af

# in order
# Names of the arrays to read from OUTPUT.arr, kept in order: model input,
# convolution output and pad mask, then the input/output of each of the 36
# transformer blocks, and finally the CTC head's input/output.
keys = ["model_input", "conv_output", "misc_padMask"]
for i in range(36):
    keys += [f"trf_input_{i}", f"trf_output_{i}"]
keys += ["ctc_head_input", "ctc_head_output"]

# Read every named array and convert it to a NumPy ndarray.
lookup = {}
for key in keys:
    lookup[key] = af.array.read_array("OUTPUT.arr", key=key).to_ndarray()


In [10]:
# Inspect the reference model input: print its shape, then the first three
# values of five rows taken from the start, the middle (400) and near the
# end (900) of the sequence.
model_input = lookup["model_input"]
print("model_input", model_input.shape)
model_input = model_input.tolist()  # plain nested lists from here on
for start in (0, 400, 900):
    for row in model_input[start:start + 5]:
        print(row[:3])


model_input (961, 80)
[-2.0350985527038574, -2.0350985527038574, -2.0350985527038574]
[-2.0350985527038574, -2.0350985527038574, -2.0350985527038574]
[-2.0350985527038574, -2.0350985527038574, -2.0350985527038574]
[-2.0350985527038574, -2.0350985527038574, -2.0350985527038574]
[-2.0350985527038574, -2.0350985527038574, -2.0350985527038574]
[-0.8471124172210693, -0.38888654112815857, -0.11196883767843246]
[-0.6358304619789124, -0.6489655375480652, -0.6837743520736694]
[-0.2124035656452179, -0.21033309400081635, -0.21287468075752258]
[-0.7262757420539856, -0.5959850549697876, -0.4422873258590698]
[-0.5001829266548157, -0.5467087030410767, -0.6760406494140625]
[-0.3086141049861908, -0.2965729832649231, -0.28046396374702454]
[-0.4022613763809204, -0.45817720890045166, -0.6230398416519165]
[-0.3147038519382477, -0.23073029518127441, -0.11785335838794708]
[-0.3470221757888794, -0.28721654415130615, -0.2011248767375946]
[-0.8904579281806946, -0.53175288438797, -0.27561914920806885]


In [25]:
from transformers import MCTCForCTC, MCTCProcessor

# Load the model and its processor from the same pretrained checkpoint.
checkpoint = "cwkeam/mctc-large"
model = MCTCForCTC.from_pretrained(checkpoint)
processor = MCTCProcessor.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
from datasets import load_dataset, load_metric, Audio, Dataset

# Wrap the single local audio file in a Dataset so the Audio feature decodes
# it at a 16 kHz sampling rate.
my_dataset = Dataset.from_dict({"audio": ["./audio/audio.flac"]})
audio_sample = my_dataset.cast_column("audio", Audio(sampling_rate=16_000))

# Pull out the decoded waveform of every example.
audio_arrays = []
for example in audio_sample["audio"]:
    audio_arrays.append(example["array"])

# Run the processor on the waveforms, truncating to max_length.
inputs = processor(
    audio_arrays,
    sampling_rate=16_000,
    max_length=16000,
    truncation=True,
)
hf_inputs = torch.Tensor(inputs["input_features"])

In [35]:
import torch
# Put both inputs into tensors and compare their shapes: the reference input
# first, then the features produced by the processor.
real_inputs = torch.Tensor(inputs["input_features"])
model_input = torch.Tensor(model_input)
print(model_input.shape, real_inputs.shape, sep="\n")


torch.Size([961, 80])
torch.Size([1, 15401, 1])


  real_inputs = torch.Tensor(inputs["input_features"])


In [21]:
import torch
from transformers import MCTCForCTC, MCTCProcessor
from datasets import load_dataset, load_metric, Audio, Dataset

# Decode the single local audio file at a 16 kHz sampling rate.
my_dataset = Dataset.from_dict({"audio": ["./audio/audio.flac"]})
audio_sample = my_dataset.cast_column("audio", Audio(sampling_rate=16_000))

# model = MCTCForCTC.from_pretrained("cwkeam/mctc-large")
processor = MCTCProcessor.from_pretrained("cwkeam/mctc-large")

# Show the feature-extractor settings, each labelled with its own attribute name.
print("feature_size", processor.feature_extractor.feature_size)
print("hop_length", processor.feature_extractor.hop_length)
print("win_length", processor.feature_extractor.win_length)
# processor.feature_extractor.normalize_means=False

# Extract features for the decoded waveform(s), truncating to max_length.
audio_arrays = [x["array"] for x in audio_sample["audio"]]
inputs = processor(
    audio_arrays, sampling_rate=16_000, max_length=16_000, truncation=True
)
# Keep only the first (and only) example's feature matrix.
real_inputs = torch.Tensor(inputs["input_features"][0])
print(real_inputs.shape)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


feature_size 80
feature_size 10
feature_size 25
mel_specgram torch.Size([80, 15401])
torch.Size([15401, 80])


In [20]:
# Side-by-side look at the first five values of the first five rows:
# processor features first, reference input second, then a blank line.
model_input = torch.Tensor(model_input)
for i in range(5):
    print(real_inputs[i][:5], model_input[i][:5], "", sep="\n")

tensor([-0.3214, -0.3214, -0.3238, -0.3238, -0.3277])
tensor([-2.0351, -2.0351, -2.0351, -2.0351, -2.0351])

tensor([-0.3214, -0.3214, -0.3238, -0.3238, -0.3277])
tensor([-2.0351, -2.0351, -2.0351, -2.0351, -2.0351])

tensor([-0.3214, -0.3214, -0.3238, -0.3238, -0.3277])
tensor([-2.0351, -2.0351, -2.0351, -2.0351, -2.0351])

tensor([-0.3214, -0.3214, -0.3238, -0.3238, -0.3277])
tensor([-2.0351, -2.0351, -2.0351, -2.0351, -2.0351])

tensor([-0.3214, -0.3214, -0.3238, -0.3238, -0.3277])
tensor([-2.0351, -2.0351, -2.0351, -2.0351, -2.0351])

