In [17]:
import os
import pandas as pd
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import specgram
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as cp
from torch import Tensor
import torchvision
from torchvision import datasets, transforms, models
import torch.optim as optim
from tqdm import tqdm
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from PIL import Image
import cv2
# import torchaudio

In [2]:
mylist_1= os.listdir('data/')
type(mylist_1)
mylist=[]
for item in mylist_1:
    if item[-3:]=='wav':
        mylist.append(item)

In [4]:
#wav2spec
class ETRIDataset_spectram():
    def __init__(self, file_list, frame_length=0.025, frame_stride=0.010):
        self.file_list = file_list
        self.frame_length = frame_length
        self.frame_stride = frame_stride
    
    def __len__(self):
        return len(self.file_list)

    def __getitem__(self,index):
        audio_path = self.file_list[index]
        X, sample_rate = librosa.load('data/'+audio_path, res_type='kaiser_fast',duration=2.5,sr=16000,offset=0.0)
        signal = np.zeros((int(sample_rate *3,)))
        signal[:len(X)] = X
        sample_rate = sample_rate
        input_nfft = int(round(sample_rate*self.frame_length))
        input_stride = int(round(sample_rate*self.frame_stride))

        S = librosa.feature.melspectrogram(y=X, n_mels=64, n_fft=input_nfft, hop_length=input_stride)
        P = librosa.power_to_db(S, ref=np.max)
        

        ## get label
        if audio_path[-3:] == 'wav':
            if audio_path[7:8] =='a':
                label = 0
            elif audio_path[7:8] =='n':
                label = 1
            elif audio_path[7:8] =='s':
                label = 2
            elif audio_path[7:8] =='h':
                label = 3
        else:
            label=None
        return P, label

In [5]:
# train,test,val split
train_size = int(0.7*len(mylist))

val_size = int(0.2*len(mylist))
test_size = int(len(mylist)-train_size-val_size)
train_set, val_set, test_set = torch.utils.data.random_split(ETRIDataset_spectram(mylist),[train_size,val_size,test_size])
print(train_size)
print(val_size)
print(test_size)

2015
576
289


In [32]:
# spec2img
def getimg(dataset,feature):
        img_path=[]
        labels=[]
        for i in range(len(dataset)):
                fig = plt.figure()
                ax = fig.add_subplot(111)
                p = librosa.display.specshow(dataset[i][0],ax=ax, sr=16000, hop_length=int(round(16000*0.025)), x_axis='time',y_axis='linear')
                extent = ax.get_window_extent().transformed(fig.dpi_scale_trans.inverted())

                fig.savefig('./image/%s_%d_%s.jpg' % (dataset[i][1],i,feature), bbox_inches=extent)
                img_path.append('./image/%s_%d_%s.jpg' % (dataset[i][1],i,feature))
                labels.append(dataset[i][1])
                plt.ioff()
                plt.close()
        return img_path , labels


In [33]:
train_path , train_labels = getimg(train_set,'train')
val_path , val_labels = getimg(val_set,'val')
test_path , test_labels = getimg(test_set,'test')

In [35]:
train_transforms = transforms.Compose([transforms.RandomRotation(30),
                                       transforms.RandomResizedCrop(224),
                                       transforms.RandomHorizontalFlip(),
                                       transforms.ToTensor(),
                                       transforms.Normalize([0.485, 0.456, 0.406],
                                                            [0.229, 0.224, 0.225])])
test_transforms = transforms.Compose([transforms.Resize(255),
                                      transforms.CenterCrop(224),
                                      transforms.ToTensor(),
                                      transforms.Normalize([0.485, 0.456, 0.406],
                                                           [0.229, 0.224, 0.225])])
                                                           

In [47]:
class img2tensor():
    def __init__(self,data_path,labels,transforms):
        self.data_path = data_path
        self.labels = labels
        self.transforms = transforms

    def __len__(self):
        return len(self.data_path)

        
    def __getitem__(self, index):
        img_path = self.data_path[index]
        image = Image.open(img_path)
        I = train_transforms(image)
        label = self.labels[index]

        return I, label
        
        

In [48]:
# set batch_size
batch_size = 16
# dataloader
train_dataloader = torch.utils.data.DataLoader(img2tensor(train_path,train_labels,train_transforms), batch_size=batch_size, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(img2tensor(val_path,val_labels,test_transforms), batch_size=batch_size, shuffle=False)
dataloaders_dict ={'train':train_dataloader, 'val': val_dataloader}
# test
batch_iterator = iter(dataloaders_dict['train'])
inputs, labels = next(batch_iterator)
print(inputs.size())
print(labels)

torch.Size([16, 3, 224, 224])
tensor([2, 1, 1, 2, 0, 2, 0, 3, 0, 3, 2, 1, 1, 2, 3, 3])


In [49]:
model = torchvision.models.densenet121(pretrained=True)
model.to(torch.device('cuda'))
model.classifier = nn.Linear(in_features=1024, out_features=4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(model.parameters() ,lr=0.00001, weight_decay=1e-6, momentum=0.9)
model.train()

DenseNet(
  (features): Sequential(
    (conv0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (norm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu0): ReLU(inplace=True)
    (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (denseblock1): _DenseBlock(
      (denselayer1): _DenseLayer(
        (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
        (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace=True)
        (conv2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
      (denselayer2): _DenseLayer(
        (norm1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu

In [50]:
def train(net, dataloader, criterion, optimizer, num_epochs):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(device)
    model.to(device)
    torch.backends.cudnn.benchmark = True

    
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch+1,num_epochs))
        print('-------------------------------')

        for phase in ['train','val']:
            if phase == 'train':
                net.train()
            else:
                net.eval()
            epoch_loss = 0.0
            epoch_corrects = 0

            if (epoch == 0) and(phase == 'train'):
                continue
            for inputs, labels in tqdm(dataloader[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(inputs)
                    loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs,1)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                    epoch_loss += loss.item() *inputs.size(0)

                    epoch_corrects += torch.sum(preds == labels.data)

                    epoch_loss = epoch_loss / len(dataloader[phase].dataset)
                    epoch_acc = epoch_corrects.double() / len(dataloader[phase].dataset)


            print('{} Loss: {:.4f} ACC {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
                

In [51]:
train(model, dataloaders_dict, criterion,optimizer, num_epochs=200)

cuda:0
Epoch 1/200
-------------------------------


100%|██████████| 36/36 [00:04<00:00,  8.18it/s]


val Loss: 0.0386 ACC 0.2413
Epoch 2/200
-------------------------------


  0%|          | 0/126 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 5.93 GiB total capacity; 1.84 GiB already allocated; 35.25 MiB free; 1.86 GiB reserved in total by PyTorch)

In [21]:
# save weight
save_path = './saved_models/DenseNet121.pth'
torch.save(model.state_dict(),save_path)

In [76]:
# load weight
load_path = './saved_models/DenseNet121.pth'
load_weights = torch.load(load_path, map_location={'cuda:0': 'cpu'})
model = torchvision.models.densenet121(pretrained=False)
first_conv_layer = [nn.Conv2d(1, 3, kernel_size=3, stride=1, padding=1, dilation=1, groups=1, bias=True)]
first_conv_layer.extend(list(model.features))  
model.features= nn.Sequential(*first_conv_layer )  
model.classifier = nn.Linear(in_features=1024, out_features=4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(model.parameters() ,lr=0.00001, weight_decay=1e-6, momentum=0.9)
model.eval()
model.load_state_dict(load_weights)

<All keys matched successfully>

In [77]:
class Predictor(object):
    def __init__(self, model, device ='cpu',  fp16=False ):
        self.model = model
        
        self.cls_name = {0:'angry', 1:'happy', 2:'sad', 3:'neutral'}
        self.device = device
        

    def predict(self, audio):
        
        audio_info = audio
        outputs = self.model(audio_info)
        probability = torch.softmax(outputs,1)
        probability = probability.squeeze()
        proba, idx = torch.max(probability, dim=0)
        emo_proba = proba.item()
        print(emo_proba)
        idx = idx.item()
        emo_label = self.cls_name[idx]
        print(emo_label)
        return emo_label



In [82]:
predictor = Predictor(model)
device='cpu'
a=[]
b=[]
for i in range(len(test_set)):
    a.append(predictor.predict(test_set[i][0].unsqueeze(0)))
    b.append(test_set[i][1])


0.967531144618988
happy
1.0
angry
1.0
angry
0.9585205912590027
happy
0.9955653548240662
neutral
1.0
angry
0.9999994039535522
sad
0.7945557832717896
happy
0.9910010695457458
angry
0.9856774806976318
happy
0.5194590091705322
angry
0.7366258502006531
neutral
0.9998733997344971
happy
0.9997814297676086
angry
1.0
sad
0.9999902248382568
sad
0.989194393157959
happy
1.0
sad
0.7373937964439392
neutral
0.9781458377838135
sad
0.9999630451202393
angry
0.9894214868545532
angry
0.9972381591796875
neutral
0.9613296985626221
angry
0.9229999780654907
angry
0.6239652037620544
happy
0.9993322491645813
neutral
0.9997630715370178
sad
0.9759746193885803
neutral
1.0
sad
0.7348594069480896
angry
0.9998894929885864
sad
0.9999997615814209
angry
0.5181540846824646
angry
0.9999192953109741
sad
0.999998927116394
sad
0.6212567687034607
neutral
0.9015434980392456
happy
1.0
angry
0.8217726945877075
happy
0.9999555349349976
sad
0.9998633861541748
neutral
0.9997188448905945
neutral
0.9999921321868896
angry
0.9999991655

In [123]:
df = pd.DataFrame(a,columns=['predict'])
df['original']=b
for i in range(len(df)):
    if df['original'][i] == 0:
        df['original'][i] = 'angry'
    elif df['original'][i] == 1:
        df['original'][i] = 'happy'
    elif df['original'][i] == 2:
        df['original'][i] = 'sad'
    elif df['original'][i] == 3:
        df['original'][i] = 'neutral'
print('accuracy={}'.format((df['predict']==df['original']).sum()/len(df)))

accuracy=0.7508650519031141


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['original'][i] = 'happy'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [120]:
df.to_csv('predict_result.csv',index=False)

In [None]:
1764345206