In [3]:
# some standard imports
import torch
from torch.autograd import Variable



In [4]:
# import the checkpoint API
from torch.utils.checkpoint import checkpoint_sequential
import torch.nn as nn

# create a simple Sequential model
model = nn.Sequential(
    nn.Linear(100, 50),
    nn.ReLU(),
    nn.Linear(50, 20),
    nn.ReLU(),
    nn.Linear(20, 5),
    nn.ReLU()
)


In [8]:

# create the model inputs
input_var = Variable(torch.randn(1, 100), requires_grad=True)
print(input_var.shape)
# set the number of checkpoint segments
segments = 2

torch.Size([1, 100])


In [6]:
model._modules.items()

odict_items([('0', Linear(in_features=100, out_features=50, bias=True)), ('1', ReLU()), ('2', Linear(in_features=50, out_features=20, bias=True)), ('3', ReLU()), ('4', Linear(in_features=20, out_features=5, bias=True)), ('5', ReLU())])

In [16]:


# get the modules in the model. These modules should be in the order
# the model should be executed
modules = [module for k, module in model._modules.items()]
#print(modules)

# now call the checkpoint API and get the output
out = checkpoint_sequential(modules, segments, input_var)
print(out)
# run the backwards pass on the model. For backwards pass, for simplicity purpose, 
# we won't calculate the loss and rather backprop on out.sum()
model.zero_grad()
print("out.sum() = ", out.sum())
out.sum().backward()

# now we save the output and parameter gradients that we will use for comparison purposes with
# the non-checkpointed run.
output_checkpointed = out.data.clone()
grad_checkpointed = {}
for name, param in model.named_parameters():
#     print("name = ", name)
#     print("param = ", param)
    grad_checkpointed[name] = param.grad.data.clone()

#print("grad_checkpointed = ",grad_checkpointed)

tensor([[0.0000, 0.0000, 0.0000, 0.1500, 0.0000]], grad_fn=<ReluBackward0>)
out.sum() =  tensor(0.1500, grad_fn=<SumBackward0>)


## RNN

In [17]:

# define the model 
class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)
        self.nhid = nhid
        self.nlayers = nlayers

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

### Potential Useful ex

In [None]:
import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, 10)

    def forward(self, x):
        def lstm_forward(x):
            h0 = torch.zeros(self.num_layers, x.size(1), self.hidden_size).to(x.device)
            c0 = torch.zeros(self.num_layers, x.size(1), self.hidden_size).to(x.device)
            out, _ = self.lstm(x, (h0, c0))
            return out
        out = checkpoint.checkpoint(lstm_forward, x)
        out = self.fc(out[-1, :, :])
        return out


In [18]:
import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint
import torchaudio

class Wav2Vec_1(nn.Module):
    def __init__(self, feature_extractor, classifier):
        super(Wav2Vec, self).__init__()
        self.feature_extractor = feature_extractor
        self.classifier = classifier

    def forward(self, x):
        def feature_extractor_forward(x):
            features = self.feature_extractor(x)
            return features
        features = checkpoint.checkpoint(feature_extractor_forward, x)
        out = self.classifier(features)
        return out

feature_extractor = torchaudio.models.wav2vec2.base(pretrained=True)
classifier = nn.Sequential(
    nn.Linear(1024, 512),
    nn.ReLU(),
    nn.Linear(512, 10)
)
model = Wav2Vec(feature_extractor, classifier)


AttributeError: module 'torchaudio.models.wav2vec2' has no attribute 'base'

In [1]:
import torch.nn as nn
import torch.utils.checkpoint as checkpoint
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
from transformers import Wav2Vec2Processor, Wav2Vec2ConformerForCTC
from transformers import Wav2Vec2ConformerForSequenceClassification, Wav2Vec2Tokenizer
from transformers import AutoProcessor
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

class Wav2Vec_2(nn.Module):
    def __init__(self, feature_extractor, classifier):
        super(Wav2Vec, self).__init__()
        self.feature_extractor = feature_extractor
        self.classifier = classifier

    def forward(self, x):
        def feature_extractor_forward(x):
            features = self.feature_extractor(x).last_hidden_state
            return features
        features = checkpoint.checkpoint(feature_extractor_forward, x)
        out = self.classifier(features)
        return out

#feature_extractor = transformers.Wav2Vec2Model.from_pretrained('facebook/wav2vec2-large-960h-lv60-self')
classifier = nn.Sequential(
    nn.Linear(1024, 512),
    nn.ReLU(),
    nn.Linear(512, 10)
)
pretrained_model = Wav2Vec2ConformerForSequenceClassification.from_pretrained('facebook/wav2vec2-conformer-rope-large-960h-ft',output_hidden_states=False ,output_attentions=False,return_dict=True,num_labels = 8)
feature_extractor = AutoProcessor.from_pretrained("facebook/wav2vec2-conformer-rope-large-960h-ft",sampling_rate = 16000,return_tensors="pt",padding= "longest")
model_test = Wav2Vec_2(feature_extractor, classifier)

Some weights of the model checkpoint at facebook/wav2vec2-conformer-rope-large-960h-ft were not used when initializing Wav2Vec2ConformerForSequenceClassification: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2ConformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ConformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ConformerForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-conformer-rope-large-960h-ft and are newly initialized: ['classifier.weight', 'classifier.bias', 'projector.bias', 'projector.weight']
You should probably T

NameError: name 'Wav2Vec' is not defined

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting fairseq
  Downloading fairseq-0.12.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (11.0 MB)
[K     |████████████████████████████████| 11.0 MB 8.1 MB/s eta 0:00:01
Collecting omegaconf<2.1
  Downloading omegaconf-2.0.6-py3-none-any.whl (36 kB)
Collecting hydra-core<1.1,>=1.0.7
  Downloading hydra_core-1.0.7-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 120.6 MB/s eta 0:00:01
[?25hCollecting sacrebleu>=1.4.12
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 116.4 MB/s eta 0:00:01
[?25hCollecting bitarray
  Downloading bitarray-2.7.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (271 kB)
[K     |████████████████████████████████| 271 kB 119.8 MB/s eta 0:00:01
Collecting antlr4-python3-runtime==4.8
  Downloading antlr4-python3-runtime-4.8.tar.gz (112 kB)
[K     |████████████████████████████████| 112

In [6]:
import torch
import torch.nn as nn
from transformers import BertModel 
from transformers import Wav2Vec2ConformerConfig, Wav2Vec2ConformerModel
configuration = Wav2Vec2ConformerConfig()
print(configuration)

Wav2Vec2ConformerConfig {
  "activation_dropout": 0.1,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "conformer_conv_dropout": 0.1,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_depthwise_kernel_size": 31,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_norm": "group",
  "feat_proj_dropout": 0.0,
  "feat_quantizer_dropout": 0.0,
  "final_dropout": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "inte

In [1]:
import torch

# Create a tensor
tensor = torch.randn(4, 47, 1024)

# Extract the tensors from the first and second dimension
tensor_2d = tensor.reshape(4, -1)[:, :1024]

# Print the shape of the new tensor
print(tensor_2d.shape) # Output: torch.Size([4, 1024])


torch.Size([4, 1024])


In [2]:
import torch
import torch.nn as nn
from transformers import BertModel 
from transformers import Wav2Vec2ConformerConfig, Wav2Vec2ConformerModel
batch_size = 4
pr_tr_model = Wav2Vec2ConformerModel.from_pretrained("facebook/wav2vec2-conformer-rope-large-960h-ft")

Some weights of the model checkpoint at facebook/wav2vec2-conformer-rope-large-960h-ft were not used when initializing Wav2Vec2ConformerModel: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing Wav2Vec2ConformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ConformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:


class MyModel(nn.Module):
    def __init__(self, pre_tr_model ,input_size = batch_size, hidden_size = 768 , num_classes = 8):
        super(MyModel, self).__init__()

        self.backbone = pre_tr_model
        #self.linear = nn.Linear(hidden_size , 1024)
        self.output = nn.Linear(1024, num_classes)

    def forward(self, input_ids, attention_mask = False):
        backbone_op = self.backbone(input_ids)
        print(backbone_op[0].shape)
        #backbone_op = self.backbone(input_ids, attention_mask=False).last_hidden_state
        backbone_op_reshp  = backbone_op[0].reshape(batch_size, -1)[:, :backbone_op[0].shape[2]]
        print(backbone_op_reshp.shape)
        
        #linear_output = self.linear(backbone_op_reshp)
        #print(linear_output.shape)
        output = self.output(backbone_op_reshp)
        print(output)
        print("output shape = " , output.shape)
        out_smax = nn.Softmax(dim = 1)
        out = out_smax(output)
        print("out = ",out)
        
        return output


In [12]:
t_model = MyModel(pr_tr_model).to('cuda')

In [13]:
x = torch.rand(4,15360).to('cuda')

In [14]:
o = t_model(x)

torch.Size([4, 47, 1024])
torch.Size([4, 1024])
tensor([[-0.0710, -0.0052, -0.2053,  0.0306, -0.1015,  0.1254,  0.0235,  0.1185],
        [-0.0797, -0.0488, -0.1649,  0.0861, -0.1126,  0.0800,  0.0008,  0.1160],
        [-0.0924, -0.0546, -0.1958,  0.0443, -0.1407,  0.0581,  0.0326,  0.0323],
        [-0.0916, -0.1021, -0.1760,  0.0835, -0.0999,  0.0666,  0.0530,  0.1047]],
       device='cuda:0', grad_fn=<AddmmBackward0>)
output shape =  torch.Size([4, 8])


TypeError: __init__() got multiple values for argument 'dim'