In [1]:
import os
import sys
import pandas as pd
import torch
import torchaudio
import torch.nn.functional as F
import torchaudio.transforms as T
import librosa
import matplotlib.pyplot as plt
from IPython.display import Audio, display
import json
import numpy as np
import math
import random
import seaborn as sns
sns.set_theme()

from typing import Optional

from datetime import timedelta

from src.utils import (
    create_dataset, plot_spectrogram,
    RandomClip, extract_logmel
)
from src.datasets import VoxCelebDataModule
from src.models import (
    SEBlock, SpeakerRecognitionModel, ResNetBlock, build_efficientnetv2,
    SEResNetBlock, conv1x1, conv3x3, ResNet34SE, ResNet20
)
from torch import nn
from sklearn.decomposition import PCA

from src.resnetse import ResNetSE, SEBasicBlock, ResNetSEV2

from src.losses import SubCenterAAMSoftmaxLoss
from sklearn.cluster import KMeans
from sklearn.metrics import roc_curve, accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

In [None]:
class ResNet20Seq(nn.Module):
    """Stride-2 stem convolution followed by a residual stack of 3x3 convolutions.

    ``seq1`` is one 3x3, stride-2 convolution followed by ``PReLU`` that maps
    ``in_channels`` to ``out_channels``. ``seq2`` holds ``num_blocks - 1``
    further 3x3, stride-1 convolution + ``PReLU`` pairs that keep the channel
    count at ``out_channels``. The forward pass returns
    ``seq1(x) + seq2(seq1(x))``.

    Args:
        in_channels: Number of channels of the input tensor.
        out_channels: Number of channels produced by every convolution.
        num_blocks: Total number of convolutions, counting the stem; ``seq2``
            receives ``num_blocks - 1`` of them.
        residual_init_std: Standard deviation of the zero-mean normal
            distribution used to initialise the ``seq2`` convolution weights.

    Note:
        When ``num_blocks <= 1`` the residual branch is an empty
        ``nn.Sequential``, which passes its input through unchanged, so the
        module returns ``2 * seq1(x)``.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        num_blocks: int,
        residual_init_std: float = 0.01,
    ) -> None:
        super().__init__()

        # Stem: the only strided convolution of the stage.
        self.seq1 = nn.Sequential(
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=3,
                stride=2,
                padding=1,
            ),
            nn.PReLU(),
        )

        # Residual branch: shape-preserving conv + PReLU pairs.
        layers = []
        for _ in range(1, num_blocks):
            layers.append(
                nn.Conv2d(
                    in_channels=out_channels,
                    out_channels=out_channels,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                )
            )
            layers.append(nn.PReLU())
        self.seq2 = nn.Sequential(*layers)

        # The stem uses Xavier initialisation with zero bias.
        for m in self.seq1.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_normal_(m.weight)
                nn.init.constant_(m.bias, 0)

        # The residual convolutions use a narrow Gaussian. nn.init.normal_
        # defaults to std=1.0, which makes every 3x3 layer scale the
        # activation variance by roughly its fan-in, so the std is passed
        # explicitly here.
        for m in self.seq2.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight, mean=0.0, std=residual_init_std)
                nn.init.constant_(m.bias, 0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the stem output plus the residual branch applied to it."""
        out1 = self.seq1(x)
        out2 = self.seq2(out1)
        return out1 + out2

In [None]:
# Build the ResNet20 imported from src.models with num_classes=8.
# NOTE(review): the variable is named `seq`, but this instantiates ResNet20,
# not the ResNet20Seq class defined in this notebook — confirm which one was
# meant to be smoke-tested.
seq = ResNet20(num_classes=8)

In [None]:
# Random stand-in input of shape (4, 1, 80, 301) for the smoke test.
a = torch.randn(4, 1, 80, 301)

In [None]:
# Smoke test: run the random batch through the model and display the shape
# of what comes back.
seq(a).shape

In [None]:
# Scratch check of a random crop offset: draw a start position in
# [0, audiosize - max_audio) and wrap it in a one-element int64 array.
audiosize = 1500
max_audio = 500

np.array([np.int64((audiosize - max_audio) * random.random())])

In [2]:
# Instantiate ResNet34SE (imported from src.models) with n_mels=40 and
# num_classes=5.
# NOTE(review): n_mels presumably has to match the mel-bin axis of the input
# fed to the model — confirm against src.models.
resnet34 = ResNet34SE(n_mels=40, num_classes=5)

In [3]:
# Random stand-in input of shape (4, 1, 40, 301); this rebinds `a`, which an
# earlier cell used for an 80-row tensor.
a = torch.randn(4, 1, 40, 301)

In [4]:
# Smoke test: forward the random batch and display the output shape.
# The recorded output is torch.Size([4, 512]): one 512-wide vector per batch
# item rather than num_classes=5 values.
# NOTE(review): presumably the model returns an embedding and classification
# happens elsewhere (e.g. in the loss) — confirm against src.models.
resnet34(a).shape

torch.Size([4, 512])