In [1]:
import os,sys
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
from PIL import Image, ImageOps
from skimage import io
from skimage import color
from skimage import transform
from torchvision import transforms
import torch
import torch.nn as nn
import torch.nn.functional as F
from numpy import ndarray
from sklearn.model_selection import train_test_split 
import torch.utils.data as data_utils
import torchvision
import torch.optim as optim


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
#veri bilgilerinin bulunduğu csv dosyasını okuma
csv_path = r"C:\Users\emirh\Desktop\ai_hub_proje\data\UrbanSound8K\metadata\UrbanSound8K.csv"
csv_data = pd.read_csv(csv_path)
csv_data.tail(5)

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
8727,99812-1-2-0.wav,99812,159.522205,163.522205,2,7,1,car_horn
8728,99812-1-3-0.wav,99812,181.142431,183.284976,2,7,1,car_horn
8729,99812-1-4-0.wav,99812,242.691902,246.197885,2,7,1,car_horn
8730,99812-1-5-0.wav,99812,253.20985,255.741948,2,7,1,car_horn
8731,99812-1-6-0.wav,99812,332.289233,334.821332,2,7,1,car_horn


In [4]:
#görüntünün veri yolunu input olarak alır ve sırasıyla;
#grayscale dönüşümü, resizing, normalization, [görüntü, etiket] formatına yazma
#işlemlerini yapar.
#[görüntü,etiket] formatında DataFrame'i oluşturur ve csv dosyası olarak kaydını yapar 


def preprocessor(directory):
    images = []
    labels = []

    for foldername in os.listdir(directory): #spectrograms klasöründeki classID pathlerini döndür
        f = os.path.join(directory, foldername) #classID klasörlerinin pathleri
        for imagename in os.listdir(f):
            imagepath = os.path.join(f, imagename)
            rgb_img = io.imread(imagepath)[:,:,:3]
            gray_img = color.rgb2gray(rgb_img)
            gray_img_resized = transform.resize(gray_img, (128,128)) #grayscaled fotoğraf
            image = np.array(gray_img_resized, 'float32')
            image = (image-image.mean())/image.std()
            label = int(foldername)    
            images.append(image)
            labels.append(label)
    return images, labels
    

In [5]:
directory = r'C:\Users\emirh\Desktop\ai_hub_proje\data\spectrograms'

images, labels = preprocessor(directory)

In [6]:
len(images), len(labels)

(8732, 8732)

In [7]:
images[0]

array([[ 3.6713247 , -0.28672266, -1.8196371 , ..., -1.8227298 ,
        -1.8227298 , -1.8227298 ],
       [ 3.677114  , -0.27035457, -1.8173009 , ..., -1.8204195 ,
        -1.8204175 , -1.8204162 ],
       [ 3.6771352 , -0.27029043, -1.8172846 , ..., -1.8203738 ,
        -1.8129799 , -1.8074372 ],
       ...,
       [ 3.6768723 , -0.20892416, -0.86748713, ...,  1.884726  ,
         1.6924425 ,  0.9927972 ],
       [ 3.7136507 ,  1.0344989 ,  0.2612872 , ...,  0.9487419 ,
         0.8893153 ,  0.67563593],
       [ 3.8196363 ,  3.7934604 ,  3.7889116 , ...,  3.7891667 ,
         3.7891512 ,  3.7890959 ]], dtype=float32)

In [8]:
images[0]

array([[ 3.6713247 , -0.28672266, -1.8196371 , ..., -1.8227298 ,
        -1.8227298 , -1.8227298 ],
       [ 3.677114  , -0.27035457, -1.8173009 , ..., -1.8204195 ,
        -1.8204175 , -1.8204162 ],
       [ 3.6771352 , -0.27029043, -1.8172846 , ..., -1.8203738 ,
        -1.8129799 , -1.8074372 ],
       ...,
       [ 3.6768723 , -0.20892416, -0.86748713, ...,  1.884726  ,
         1.6924425 ,  0.9927972 ],
       [ 3.7136507 ,  1.0344989 ,  0.2612872 , ...,  0.9487419 ,
         0.8893153 ,  0.67563593],
       [ 3.8196363 ,  3.7934604 ,  3.7889116 , ...,  3.7891667 ,
         3.7891512 ,  3.7890959 ]], dtype=float32)

In [9]:
labels[0]

0

In [10]:
X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.2, random_state=42)

In [11]:
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)
y_train_np = np.array(y_train)
y_test_np = np.array(y_test)

In [12]:
print(X_train_np.shape)
print(X_test_np.shape)
print(y_train_np.shape)
print(y_test_np.shape)

(6985, 128, 128)
(1747, 128, 128)
(6985,)
(1747,)


In [13]:
X_train_t = torch.from_numpy(X_train_np).unsqueeze(-1).permute(0,3,1,2)
X_test_t = torch.from_numpy(X_test_np).unsqueeze(-1).permute(0,3,1,2)
y_train_t = torch.from_numpy(y_train_np)
y_test_t = torch.from_numpy(y_test_np)

In [14]:
X_train_t.shape, X_test_t.shape

(torch.Size([6985, 1, 128, 128]), torch.Size([1747, 1, 128, 128]))

In [15]:
y_train_t.shape, y_test_t.shape

(torch.Size([6985]), torch.Size([1747]))

In [16]:
train = data_utils.TensorDataset(X_train_t, y_train_t)
train_loader = data_utils.DataLoader(train, batch_size=128, shuffle=True)
test = data_utils.TensorDataset(X_test_t, y_test_t)
test_loader = data_utils.DataLoader(test, batch_size=128, shuffle=True)

In [17]:
#CNN modelini tanımlama
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=2)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=2)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=2)
        self.conv4 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=2)
        self.fc1 = nn.Linear(in_features=128*9*9, out_features=10)
        self.softmax = nn.Softmax(dim=1)
    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = self.pool(F.relu(self.conv4(x)))
        x = torch.flatten(x,1)
        x = self.fc1(x)
        x = self.softmax(x)
        return x
net = Net()
net = net.to(device)



In [18]:
#Loss function ve optimizer tanımla
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

In [22]:
#Modelin eğitimi
epochs = 400

for epoch in range(epochs):
    running_loss = 0.0
    for i, data in enumerate(train_loader,0):
        inputs, labelss = data
        labelss = labelss.type(torch.LongTensor)
        inputs = inputs.to(device)
        labelss = labelss.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labelss)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    if (epoch + 1) % 2 == 0:
        print(f'epoch{epoch+1}/{epochs}, loss={loss.item():.4f}')    
print('Finished Training')


epoch2/400, loss=1.7524
epoch4/400, loss=1.7048
epoch6/400, loss=1.6566
epoch8/400, loss=1.7315
epoch10/400, loss=1.7131
epoch12/400, loss=1.6800
epoch14/400, loss=1.6791
epoch16/400, loss=1.6526
epoch18/400, loss=1.5708
epoch20/400, loss=1.6140
epoch22/400, loss=1.6547
epoch24/400, loss=1.5988
epoch26/400, loss=1.5306
epoch28/400, loss=1.5843
epoch30/400, loss=1.6429
epoch32/400, loss=1.5588
epoch34/400, loss=1.5570
epoch36/400, loss=1.6382
epoch38/400, loss=1.5979
epoch40/400, loss=1.5681
epoch42/400, loss=1.5576
epoch44/400, loss=1.6247
epoch46/400, loss=1.5303
epoch48/400, loss=1.7082
epoch50/400, loss=1.5726
epoch52/400, loss=1.5563
epoch54/400, loss=1.5436
epoch56/400, loss=1.6296
epoch58/400, loss=1.5427
epoch60/400, loss=1.6251
epoch62/400, loss=1.5701
epoch64/400, loss=1.6115
epoch66/400, loss=1.6239
epoch68/400, loss=1.5703
epoch70/400, loss=1.6372
epoch72/400, loss=1.6252
epoch74/400, loss=1.5297
epoch76/400, loss=1.6253
epoch78/400, loss=1.5692
epoch80/400, loss=1.5564
epoc

In [40]:
PATH = r'C:\Users\emirh\Desktop\ai_hub_proje\urbansounds_net.pth'
torch.save(net.state_dict(), PATH)

In [45]:
#modelin test edilmesi
classes = ["air_conditioner", "car_horn", "children_playing", "dog_bark", "drilling", "engine_idling", "gun_shot", "jackhammer", "siren", "street_music"]
dataiter = iter(test_loader)
images, labels = dataiter.next()
print('GroundTruth: ', ' // '.join(f'{classes[labels[j]]:5s}' for j in range(10)))

GroundTruth:  drilling // jackhammer // siren // siren // engine_idling // car_horn // drilling // siren // siren // car_horn


In [46]:
net = Net()
net.load_state_dict(torch.load(PATH))

<All keys matched successfully>

In [47]:
outputs = net(images)

In [48]:
_, predicted = torch.max(outputs, 1)

print('Predicted: ', ' // '.join(f'{classes[predicted[j]]:5s}'
                              for j in range(10)))

Predicted:  drilling // jackhammer // siren // siren // engine_idling // street_music // drilling // siren // siren // street_music


In [29]:
# prepare to count predictions for each class
correct_pred = {classname: 0 for classname in classes}
total_pred = {classname: 0 for classname in classes}

# again no gradients needed
with torch.no_grad():
    for data in test_loader:
        images, labels = data
        outputs = net(images)
        _, predictions = torch.max(outputs, 1)
        # collect the correct predictions for each class
        for label, prediction in zip(labels, predictions):
            if label == prediction:
                correct_pred[classes[label]] += 1
            total_pred[classes[label]] += 1


# print accuracy for each class
for classname, correct_count in correct_pred.items():
    accuracy = 100 * float(correct_count) / total_pred[classname]
    print(f'Accuracy for class: {classname:5s} is {accuracy:.1f} %')

Accuracy for class: air_conditioner is 83.6 %
Accuracy for class: car_horn is 0.0 %
Accuracy for class: children_playing is 80.7 %
Accuracy for class: dog_bark is 86.0 %
Accuracy for class: drilling is 92.4 %
Accuracy for class: engine_idling is 86.0 %
Accuracy for class: gun_shot is 96.1 %
Accuracy for class: jackhammer is 94.8 %
Accuracy for class: siren is 89.7 %
Accuracy for class: street_music is 80.6 %
