In [45]:
import os
import pandas as pd
import sys
sys.path.insert(0, os.path.abspath('ComParE2022_VecNet/src'))
#import config,config_pytorch
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score ,confusion_matrix, classification_report

import math
import pickle

from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
import torch.nn as nn
import torch
import torch.optim as optim
from datetime import datetime
import time


import matplotlib
import matplotlib.pyplot as plt



from tqdm.notebook import tqdm

import random
import torchaudio
import torchaudio.transforms as AT
import torchvision.transforms as VT
from torch.cuda.amp import autocast, GradScaler
#from timm.scheduler.cosine_lr import CosineLRScheduler
import timm
import timm.optim
from timm.models import model_parameters
from glob import glob
## nnAudio
from nnAudio import features
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import Dataset, DataLoader
import argparse
## DDp Import
import torch.distributed as dist
import warnings
warnings.filterwarnings("ignore")

import argparse
import deepspeed
from torch.utils.tensorboard import SummaryWriter 
import torch.profiler
from contextlib import ExitStack

In [54]:
class Expert(nn.Module):
    def __init__(self, input_size =8, hidden_size =8, output_size=8):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

class Gate(nn.Module):
    def __init__(self, input_size=8, num_experts = 8):
        super().__init__()
        self.fc1 = nn.Linear(input_size, num_experts)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.softmax(x, dim=1)
        return x


In [58]:
class MyModel(nn.Module):
    def __init__(self, model_name, image_size,input_size =8, hidden_size=8, num_classes=8,num_experts =8):
        super().__init__()
        # num_classes=0 removes the pretrained head
        self.backbone = timm.create_model(model_name,
                        pretrained=True, num_classes=8, in_chans=1, 
                        drop_path_rate=0.2, global_pool='max',
                        drop_rate=0.25)
        #####  This section is model specific
        #### It freezes some fo the layers by name
        #### you'll have to inspect the model to see the names
                #### end layer freezing
        self.out = nn.Linear(self.backbone.num_features, 1)
        self.sizer = VT.Resize((image_size,image_size),antialias = True)
        self.spec_layer = features.STFT(n_fft=int(2048), freq_bins=None, hop_length=int(128),
                              window='hann', freq_scale='linear', center=True, pad_mode='reflect',
                           sr=8000, output_format="Magnitude", trainable=False,verbose = False)
        self.batch_norm = nn.BatchNorm2d(num_features= 1)
        self.experts = nn.ModuleList([Expert(input_size, hidden_size, num_classes) for _ in range(num_experts)])
        self.gate = Gate(input_size, num_experts)
        
        
        
    def forward(self, x,train = True):
        # first compute spectrogram
        warnings.filterwarnings("ignore", category=DeprecationWarning, module="nnAudio")
        spec_gram = self.spec_layer(x)
        print("post spec gram shape = ",spec_gram.shape)
        spec_gram = self.batch_norm(spec_gram.unsqueeze(dim = 1))
        print("post norm shape = ",spec_gram.shape)
        spec_gram_nan_check = torch.isnan(spec_gram).any().item()
        assert not (spec_gram_nan_check) ,"Tensor contains NaN values after spec gram creation."
        
        with torch.no_grad():
            if train == True:
                #generate a random number and if condition is met apply aug
                ta_transformations_rndm_choice = VT.RandomChoice([AT.FrequencyMasking(freq_mask_param=100),AT.TimeMasking(time_mask_param=50)], p=[.4, .4])
                ta_transformations_rndm_apply = VT.RandomApply([AT.FrequencyMasking(freq_mask_param=50),AT.TimeMasking(time_mask_param=25)],p = .15)
                spec_gram = ta_transformations_rndm_choice(spec_gram)
                spec_gram = ta_transformations_rndm_apply(spec_gram)
                spec_gram_nan_check = torch.isnan(spec_gram).any().item()
                assert not (spec_gram_nan_check) ,"Tensor contains NaN values after augmentations  "
                    
            
        
        x = self.sizer(spec_gram.squeeze(dim = 1))
        print("post sizer shape = ",x.shape)
        x = x.unsqueeze(dim = 1)
        print("post unsqueeze shape = ",x.shape)
        
        # then repeat channels
        del spec_gram,spec_gram_nan_check
                  
        x = self.backbone(x)
        backbone_op_nan_check = torch.isnan(x).any().item()
        assert not (backbone_op_nan_check) ,"Tensor contains NaN values in the backbone OP "
        print("x shape = " + str(x.shape))
        #print("x = " +str(x))
        #pred = nn.Softmax(x)
        expert_outputs = [expert(x) for expert in self.experts]
        print("expert_outputs = ",expert_outputs)
        expert_outputs = torch.stack(expert_outputs, dim=1)
        print("expert_outputs post stack = ",expert_outputs)
        print("post stack shape  = ",expert_outputs.shape)
        gate_outputs = self.gate(x)
        print(" gate_outputs= ",gate_outputs)
        print(" gate_outputs shape = ",gate_outputs.shape)
        
        gate_outputs = gate_outputs.unsqueeze(2)
        print("post unsqueeze gate_outputs =  ",gate_outputs.shape)
        weighted_sum = torch.bmm(expert_outputs, gate_outputs)
        print("weighted_sum = ",weighted_sum)
        
        #print(np.argmax(pred.detach().cpu().numpy()))
        #print(pred)
        
        #print(output)
        del x , backbone_op_nan_check,expert_outputs,gate_outputs
        return weighted_sum

In [59]:
x = torch.rand(1,15360)

In [60]:
model_b =MyModel('convnext_xlarge_in22k',224)

In [61]:
num_elements = [p.numel() for p in model_b.parameters()]

In [62]:
total_params = sum(num_elements)

In [63]:
total_params

348159443

In [64]:
o = model_b(x)

post spec gram shape =  torch.Size([1, 1025, 121])
post norm shape =  torch.Size([1, 1, 1025, 121])
post sizer shape =  torch.Size([1, 224, 224])
post unsqueeze shape =  torch.Size([1, 1, 224, 224])
x shape = torch.Size([1, 8])
expert_outputs =  [tensor([[-0.1601, -0.4489, -0.0433,  0.2854,  0.0722, -0.0035, -0.0689, -0.1482]],
       grad_fn=<AddmmBackward0>), tensor([[-0.1712, -0.2729, -0.3395,  0.2567, -0.1832, -0.0094, -0.2059, -0.2965]],
       grad_fn=<AddmmBackward0>), tensor([[-0.3129,  0.0536, -0.1088, -0.4036,  0.0566,  0.0638,  0.1453, -0.1662]],
       grad_fn=<AddmmBackward0>), tensor([[ 0.2269,  0.0128, -0.3687,  0.2167,  0.2184,  0.2583, -0.1777, -0.1274]],
       grad_fn=<AddmmBackward0>), tensor([[ 0.0029, -0.2734,  0.1919, -0.0383,  0.1120,  0.1098, -0.1129, -0.2748]],
       grad_fn=<AddmmBackward0>), tensor([[ 0.0019,  0.2014, -0.2976, -0.1182,  0.1901,  0.2917, -0.2344,  0.3787]],
       grad_fn=<AddmmBackward0>), tensor([[-0.0019,  0.2192,  0.1563,  0.2827,  0.284

In [65]:
print(o)

tensor([[[-0.0905],
         [-0.1756],
         [-0.0890],
         [ 0.0043],
         [-0.0442],
         [ 0.0535],
         [ 0.1331],
         [-0.0163]]], grad_fn=<BmmBackward0>)


In [39]:

class MoE(nn.Module):
    def __init__(self, input_size, hidden_size , num_classes , num_experts):
        super().__init__()
        self.experts = nn.ModuleList([Expert(input_size, hidden_size, num_classes) for _ in range(num_experts)])
        self.gate = Gate(input_size, num_experts)
        self.input = MyModel('convnext_xlarge_in22k',224)
        

    def forward(self, x):
        x = self.input(x)
        print("shape post cnn = ",x.shape)
        expert_outputs = [expert(x) for expert in self.experts]
        print("expert_outputs = ",expert_outputs)
        expert_outputs = torch.stack(expert_outputs, dim=1)
        print("expert_outputs post stack = ",expert_outputs)
        print("post stack shape  = ",expert_outputs.shape)
        
        gate_outputs = self.gate(x)
        print(" gate_outputs= ",gate_outputs)
        print(" gate_outputs shape = ",gate_outputs.shape)
        
        gate_outputs = gate_outputs.unsqueeze(2)
        print("post unsqueeze gate_outputs =  ",gate_outputs.shape)
        weighted_sum = torch.bmm(expert_outputs, gate_outputs)
        print("weighted_sum = ",weighted_sum)
        
        return weighted_sum.squeeze(2)

# Example usage



In [42]:
model = MoE(input_size=8, hidden_size=8, num_classes=8, num_experts=8)
input_data = torch.randn(1, 15360)
output = model(input_data)
print("output = ",output)  # (32, 5)
pred = torch.argmax(output, dim = 1)


post spec gram shape =  torch.Size([1, 1025, 121])
post norm shape =  torch.Size([1, 1, 1025, 121])
post sizer shape =  torch.Size([1, 224, 224])
post unsqueeze shape =  torch.Size([1, 1, 224, 224])
x shape = torch.Size([1, 8])
shape post cnn =  torch.Size([1, 8])
expert_outputs =  [tensor([[ 0.2929,  0.0893, -0.2214, -0.0515,  0.0937,  0.0526,  0.2732, -0.0616]],
       grad_fn=<AddmmBackward0>), tensor([[ 0.1425, -0.2046,  0.1789,  0.2661,  0.3432,  0.1691,  0.1548, -0.0121]],
       grad_fn=<AddmmBackward0>), tensor([[ 0.4719, -0.1176, -0.2985, -0.2853,  0.0560, -0.2683,  0.6084, -0.0822]],
       grad_fn=<AddmmBackward0>), tensor([[-0.0644, -0.1544, -0.1961,  0.0087,  0.0336, -0.2573,  0.2496,  0.1691]],
       grad_fn=<AddmmBackward0>), tensor([[-0.4718,  0.4125, -0.2638, -0.1281,  0.3371, -0.2907,  0.1157,  0.3006]],
       grad_fn=<AddmmBackward0>), tensor([[-0.3383, -0.2543,  0.2033,  0.1455,  0.0376,  0.0285, -0.2432, -0.3004]],
       grad_fn=<AddmmBackward0>), tensor([[-0.32

In [43]:
print(pred)

tensor([1])


In [36]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Expert(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Expert, self).__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, 1)
        
    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = self.linear2(x)
        return x

class Gate(nn.Module):
    def __init__(self, input_size, num_experts):
        super(Gate, self).__init__()
        self.linear1 = nn.Linear(input_size, num_experts)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        x = self.linear1(x)
        x = self.softmax(x)
        return x

class MixtureOfExperts(nn.Module):
    def __init__(self, input_size =8 , hidden_size = 8, num_experts = 8):
        super(MixtureOfExperts, self).__init__()
        self.experts = nn.ModuleList([Expert(input_size, hidden_size) for i in range(num_experts)])
        self.gate = Gate(input_size, num_experts)
        self.input = MyModel('convnext_xlarge_in22k',224)
        
    def forward(self, x):
        x = self.input(x)
        gates = self.gate(x)
        print("shape after gates is = ",gates.shape)
        print("output of gates = ",gates)
        expert_outputs = [expert(x) for expert in self.experts]
        print("expert_outputs = ",expert_outputs)
        output = torch.stack(expert_outputs, dim=1)
        print("output post stack  = ",output)
        print("post stack shape   = ",output.shape)
        output = torch.bmm(gates.unsqueeze(1), output).squeeze(1)
        print("Final output = ",output)
        return output


In [37]:
moe_model = MixtureOfExperts()

In [38]:
moe_model(x)

post spec gram shape =  torch.Size([1, 1025, 121])
post norm shape =  torch.Size([1, 1, 1025, 121])
post sizer shape =  torch.Size([1, 224, 224])
post unsqueeze shape =  torch.Size([1, 1, 224, 224])
x shape = torch.Size([1, 8])
shape after gates is =  torch.Size([1, 8])
output of gates =  tensor([[0.1725, 0.0945, 0.0841, 0.1015, 0.1362, 0.1459, 0.1654, 0.0999]],
       grad_fn=<SoftmaxBackward0>)
expert_outputs =  [tensor([[0.0326]], grad_fn=<AddmmBackward0>), tensor([[-0.1055]], grad_fn=<AddmmBackward0>), tensor([[0.4171]], grad_fn=<AddmmBackward0>), tensor([[-0.0510]], grad_fn=<AddmmBackward0>), tensor([[0.2768]], grad_fn=<AddmmBackward0>), tensor([[0.2766]], grad_fn=<AddmmBackward0>), tensor([[0.6636]], grad_fn=<AddmmBackward0>), tensor([[-0.1350]], grad_fn=<AddmmBackward0>)]
output post stack  =  tensor([[[ 0.0326],
         [-0.1055],
         [ 0.4171],
         [-0.0510],
         [ 0.2768],
         [ 0.2766],
         [ 0.6636],
         [-0.1350]]], grad_fn=<StackBackward0>)


tensor([[0.1999]], grad_fn=<SqueezeBackward1>)