In [2]:
import os
import pandas as pd
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import specgram
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as cp
from torch import Tensor
import torchvision
from torchvision import datasets, transforms, models
import torch.optim as optim
from tqdm import tqdm
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from PIL import Image
import cv2
import random
# import torchaudio

In [2]:
mylist_1= os.listdir('data/')
type(mylist_1)
mylist=[]
for item in mylist_1:
    if item[-3:]=='wav':
        mylist.append(item)

In [3]:
#wav2spec
class ETRIDataset_spectram():
    def __init__(self, file_list, frame_length=0.025, frame_stride=0.010):
        self.file_list = file_list
        self.frame_length = frame_length
        self.frame_stride = frame_stride
    
    def __len__(self):
        return len(self.file_list)

    def __getitem__(self,index):
        audio_path = self.file_list[index]
        X, sample_rate = librosa.load('data/'+audio_path, res_type='kaiser_fast',duration=2.5,sr=16000,offset=0.0)
        signal = np.zeros((int(sample_rate *3,)))
        signal[:len(X)] = X
        sample_rate = sample_rate
        input_nfft = int(round(sample_rate*self.frame_length))
        input_stride = int(round(sample_rate*self.frame_stride))

        S = librosa.feature.melspectrogram(y=X, n_mels=64, n_fft=input_nfft, hop_length=input_stride)
        P = librosa.power_to_db(S, ref=np.max)
        

        ## get label
        if audio_path[-3:] == 'wav':
            if audio_path[7:8] =='a':
                label = 0
            elif audio_path[7:8] =='n':
                label = 1
            elif audio_path[7:8] =='s':
                label = 2
            elif audio_path[7:8] =='h':
                label = 3
        else:
            label=None
        return P, label

In [4]:
# train,test,val split
train_size = int(0.7*len(mylist))

val_size = int(0.2*len(mylist))
test_size = int(len(mylist)-train_size-val_size)
train_set, val_set, test_set = torch.utils.data.random_split(ETRIDataset_spectram(mylist),[train_size,val_size,test_size])
print(train_size)
print(val_size)
print(test_size)

2015
576
289


In [5]:
# spec2img
def getimg(dataset,feature):
        img_path=[]
        labels=[]
        for i in range(len(dataset)):
                fig = plt.figure()
                ax = fig.add_subplot(111)
                p = librosa.display.specshow(dataset[i][0],ax=ax, sr=16000, hop_length=int(round(16000*0.025)), x_axis='time',y_axis='linear')
                extent = ax.get_window_extent().transformed(fig.dpi_scale_trans.inverted())

                fig.savefig('./image/%s_%d_%s.jpg' % (dataset[i][1],i,feature), bbox_inches=extent)
                img_path.append('./image/%s_%d_%s.jpg' % (dataset[i][1],i,feature))
                labels.append(dataset[i][1])
                plt.ioff()
                plt.close()
        return img_path , labels


In [6]:
train_path , train_labels = getimg(train_set,'train')
val_path , val_labels = getimg(val_set,'val')
test_path , test_labels = getimg(test_set,'test')

In [7]:
train_transforms = transforms.Compose([transforms.RandomRotation(30),
                                       transforms.RandomResizedCrop(224),
                                       transforms.RandomHorizontalFlip(),
                                       transforms.ToTensor(),
                                       transforms.Normalize([0.485, 0.456, 0.406],
                                                            [0.229, 0.224, 0.225])])
test_transforms = transforms.Compose([transforms.Resize(255),
                                      transforms.CenterCrop(224),
                                      transforms.ToTensor(),
                                      transforms.Normalize([0.485, 0.456, 0.406],
                                                           [0.229, 0.224, 0.225])])
                                                           

In [8]:
class img2tensor():
    def __init__(self,data_path,labels,transforms):
        self.data_path = data_path
        self.labels = labels
        self.transforms = transforms

    def __len__(self):
        return len(self.data_path)

        
    def __getitem__(self, index):
        img_path = self.data_path[index]
        image = Image.open(img_path)
        I = train_transforms(image)
        label = self.labels[index]

        return I, label
        
        

In [9]:
# set batch_size
batch_size = 16
# dataloader
train_dataloader = torch.utils.data.DataLoader(img2tensor(train_path,train_labels,train_transforms), batch_size=batch_size, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(img2tensor(val_path,val_labels,test_transforms), batch_size=batch_size, shuffle=False)
dataloaders_dict ={'train':train_dataloader, 'val': val_dataloader}
# test
batch_iterator = iter(dataloaders_dict['train'])
inputs, labels = next(batch_iterator)
print(inputs.size())
print(labels)

torch.Size([16, 3, 224, 224])
tensor([1, 0, 3, 1, 1, 2, 3, 2, 2, 2, 2, 3, 3, 1, 2, 3])


In [4]:
# model = torchvision.models.densenet121(pretrained=True)
model = torchvision.models.resnet34(pretrained=True)
model.to(torch.device('cuda'))
model.classifier = nn.Linear(in_features=1000, out_features=4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(model.parameters() ,lr=0.00001, weight_decay=1e-6, momentum=0.9)
model.train()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [5]:


def train(net, dataloader, criterion, optimizer, num_epochs):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(device)
    model.to(device)
    torch.backends.cudnn.benchmark = True

    
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch+1,num_epochs))
        print('-------------------------------')

        for phase in ['train','val']:
            if phase == 'train':
                net.train()
            else:
                net.eval()
            epoch_loss = 0.0
            epoch_corrects = 0

            if (epoch == 0) and(phase == 'train'):
                continue
            for inputs, labels in tqdm(dataloader[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(inputs)
                    loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs,1)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                    epoch_loss += loss.item() *inputs.size(0)

                    epoch_corrects += torch.sum(preds == labels.data)

                    epoch_loss = epoch_loss / len(dataloader[phase].dataset)
                    epoch_acc = epoch_corrects.double() / len(dataloader[phase].dataset)


            print('{} Loss: {:.4f} ACC {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
                

In [6]:
train(model, dataloaders_dict, criterion,optimizer, num_epochs=200)

NameError: name 'dataloaders_dict' is not defined

In [13]:
# save weight
save_path = './saved_models/DenseNet121_img.pth'
torch.save(model.state_dict(),save_path)

In [14]:
# load weight
load_path = './saved_models/DenseNet121_img.pth'
load_weights = torch.load(load_path, map_location={'cuda:0': 'cpu'})
model = torchvision.models.densenet121(pretrained=True)
# first_conv_layer = [nn.Conv2d(1, 3, kernel_size=3, stride=1, padding=1, dilation=1, groups=1, bias=True)]
# first_conv_layer.extend(list(model.features))  
# model.features= nn.Sequential(*first_conv_layer )  
model.classifier = nn.Linear(in_features=1024, out_features=4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(model.parameters() ,lr=0.0001, weight_decay=1e-6, momentum=0.9)
model.eval()
model.load_state_dict(load_weights)

<All keys matched successfully>

In [15]:
class Predictor(object):
    def __init__(self, model, device ='cpu',  fp16=False ):
        self.model = model
        
        self.cls_name = {0:'angry', 1:'happy', 2:'sad', 3:'neutral'}
        self.device = device
        

    def predict(self, audio):
        
        audio_info = audio
        
        outputs = self.model(audio_info)
        probability = torch.softmax(outputs,1)
        probability = probability.squeeze()
        proba, idx = torch.max(probability, dim=0)
        emo_proba = proba.item()
        print(emo_proba)
        idx = idx.item()
        emo_label = self.cls_name[idx]
        print(emo_label)
        return emo_label



In [36]:
def swdata(emotion):
    swlist_1 = os.listdir('swdata/%s/' % emotion)
    type(swlist_1)
    swlist = []
    for item in swlist_1:
        if item[-3:] == 'wav':
            swlist.append('./swdata/%s/' % emotion + item)
    return swlist

In [40]:
sw_list = swdata('angry') + swdata('happy') + swdata('sad') + swdata('neutral')

random.shuffle(sw_list)

In [53]:
#wav2spec
class swdata_spectram():
    def __init__(self, file_list, frame_length=0.025, frame_stride=0.010):
        self.file_list = file_list
        self.frame_length = frame_length
        self.frame_stride = frame_stride
    
    def __len__(self):
        return len(self.file_list)

    def __getitem__(self,index):
        audio_path = self.file_list[index]
        X, sample_rate = librosa.load(audio_path, res_type='kaiser_fast',duration=2.5,sr=16000,offset=0.0)
        signal = np.zeros((int(sample_rate *3,)))
        signal[:len(X)] = X
        sample_rate = sample_rate
        input_nfft = int(round(sample_rate*self.frame_length))
        input_stride = int(round(sample_rate*self.frame_stride))

        S = librosa.feature.melspectrogram(y=X, n_mels=64, n_fft=input_nfft, hop_length=input_stride)
        P = librosa.power_to_db(S, ref=np.max)
        

        ## get label
        if audio_path[-3:] == 'wav':
            if audio_path[9:10] =='a':
                label = 0
            elif audio_path[9:10] =='n':
                label = 1
            elif audio_path[9:10] =='s':
                label = 2
            elif audio_path[9:10] =='h':
                label = 3
        else:
            label=None
        return P, label

In [57]:
test_swdata = swdata_spectram(sw_list)
test_path , test_labels = getimg(test_swdata,'test_sw')


In [58]:
test_path

['./image/0_0_test_sw.jpg',
 './image/1_1_test_sw.jpg',
 './image/0_2_test_sw.jpg',
 './image/3_3_test_sw.jpg',
 './image/0_4_test_sw.jpg',
 './image/2_5_test_sw.jpg',
 './image/0_6_test_sw.jpg',
 './image/1_7_test_sw.jpg',
 './image/0_8_test_sw.jpg',
 './image/1_9_test_sw.jpg',
 './image/1_10_test_sw.jpg',
 './image/0_11_test_sw.jpg',
 './image/0_12_test_sw.jpg',
 './image/1_13_test_sw.jpg',
 './image/0_14_test_sw.jpg',
 './image/0_15_test_sw.jpg',
 './image/2_16_test_sw.jpg',
 './image/2_17_test_sw.jpg',
 './image/1_18_test_sw.jpg',
 './image/3_19_test_sw.jpg',
 './image/2_20_test_sw.jpg',
 './image/0_21_test_sw.jpg',
 './image/1_22_test_sw.jpg',
 './image/1_23_test_sw.jpg',
 './image/2_24_test_sw.jpg',
 './image/1_25_test_sw.jpg',
 './image/1_26_test_sw.jpg',
 './image/2_27_test_sw.jpg',
 './image/1_28_test_sw.jpg',
 './image/2_29_test_sw.jpg',
 './image/2_30_test_sw.jpg',
 './image/1_31_test_sw.jpg',
 './image/0_32_test_sw.jpg',
 './image/1_33_test_sw.jpg',
 './image/2_34_test_sw.j

In [60]:
img2tensor(test_path,test_labels,test_transforms)[1][0].unsqueeze(0).shape

torch.Size([1, 3, 224, 224])

In [61]:
predictor = Predictor(model)
device='cpu'
a=[]
b=[]
for i in range(len(img2tensor(test_path,test_labels,test_transforms))):
    a.append(predictor.predict(img2tensor(test_path,test_labels,test_transforms)[i][0].unsqueeze(0)))
    b.append(img2tensor(test_path,test_labels,test_transforms)[i][1])


0.97287917137146
angry
0.47407740354537964
sad
0.6134857535362244
angry
0.9966476559638977
sad
0.8344489932060242
angry
0.98235023021698
sad
0.9849558472633362
angry
0.9999333620071411
angry
0.6899652481079102
happy
0.9631986618041992
angry
0.7478828430175781
happy
0.5698075890541077
neutral
0.7549975514411926
sad
0.622043251991272
angry
0.9752891063690186
sad
0.574131965637207
angry
0.4281877279281616
angry
0.9951044321060181
sad
0.5460681915283203
neutral
0.9759037494659424
angry
0.9977781176567078
sad
0.8374400734901428
happy
0.6592076420783997
happy
0.46127983927726746
sad
0.7378534078598022
angry
0.5901894569396973
happy
0.4655432105064392
happy
0.8803954124450684
angry
0.6913800835609436
angry
0.997877836227417
sad
0.8824412226676941
sad
0.6370543241500854
neutral
0.988271176815033
angry
0.9982762336730957
angry
0.9966848492622375
sad
0.9987173080444336
sad
0.9990296363830566
angry
0.5206223726272583
happy
0.5789490938186646
angry
0.955120861530304
angry
0.5529403686523438
angry


In [62]:
df = pd.DataFrame(a,columns=['predict'])
df['original']=b
for i in range(len(df)):
    if df['original'][i] == 0:
        df['original'][i] = 'angry'
    elif df['original'][i] == 1:
        df['original'][i] = 'happy'
    elif df['original'][i] == 2:
        df['original'][i] = 'sad'
    elif df['original'][i] == 3:
        df['original'][i] = 'neutral'
print('accuracy={}'.format((df['predict']==df['original']).sum()/len(df)))

accuracy=0.29213483146067415


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['original'][i] = 'angry'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [63]:
df.to_csv('predict_result_img_sw.csv',index=False)

In [64]:
import seaborn as sns


Unnamed: 0,predict,original
0,angry,angry
1,sad,happy
2,angry,angry
3,sad,neutral
4,angry,angry
...,...,...
84,angry,neutral
85,sad,sad
86,sad,neutral
87,sad,angry
