In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import timm
from torch.cuda import amp
import sklearn
import numpy as np

import glob
import pandas as pd
from tqdm import tqdm
import soundfile as sf

# metric

In [2]:
def roc_auc_multi_label(output, target, label = 1):
    """Return the ROC AUC of one class from per-class scores.

    Args:
        output: CPU tensor of per-class scores; converted with ``.numpy()``.
        target: CPU tensor of ground truth. A 1-D tensor is treated as integer
            class ids and one-hot encoded (width ``max(target) + 1``); any
            other shape is passed to scikit-learn unchanged.
        label: Index of the class whose AUC is returned. Defaults to 1, which
            is what the function returned before this parameter was honoured.

    Returns:
        The entry at position ``label`` of the per-class AUC array produced by
        ``roc_auc_score(..., average=None)``.
    """
    # `import sklearn` alone does not guarantee that the `metrics` submodule
    # has been loaded, so import it explicitly where it is used.
    import sklearn.metrics

    output = output.numpy()
    target = target.numpy()

    if len(target.shape) == 1:
        target = target.astype(int)
        n_values = np.max(target) + 1
        # Row i of the identity matrix is the one-hot vector of class i.
        target = np.eye(n_values)[target]

    per_class_auc = sklearn.metrics.roc_auc_score(target, output, multi_class = 'ovr', average=None)
    return per_class_auc[label]

# preprocess

In [3]:
import librosa
import numpy as np

def width_padding(array, desired_w):
    """Force a 2-D array to exactly ``desired_w`` columns.

    Inputs wider than ``desired_w`` keep only their first ``desired_w``
    columns. Narrower (or equal-width) inputs are zero-padded on both sides;
    when the number of missing columns is odd, the extra column goes on the
    right.
    """
    current_w = array.shape[1]
    if current_w > desired_w:
        return array[:, :desired_w]

    missing = desired_w - current_w
    left = missing // 2
    right = missing - left
    return np.pad(array, pad_width=((0, 0), (left, right)), mode='constant')

def crop_or_pad(y, length, is_train=True, start=None):
    """Return ``y`` with exactly ``length`` samples.

    Shorter signals are extended by repeating themselves (whole copies plus a
    leading fragment). Longer signals are cropped to a window of ``length``
    samples beginning at ``start``.

    Args:
        y: 1-D signal.
        length: Desired number of samples.
        is_train: When ``start`` is not given, pick a random window if True,
            otherwise use the beginning of the signal.
        start: Explicit first sample of the crop window. ``0`` is a valid
            explicit value and is honoured in both modes.

    Returns:
        The cropped/extended signal, or ``y`` itself when it already has
        ``length`` samples.

    Raises:
        ValueError: if ``y`` is empty and ``length`` is positive (there is
            nothing to repeat).
    """
    n_samples = len(y)

    if n_samples < length:
        if n_samples == 0:
            raise ValueError("cannot extend an empty signal to length {}".format(length))
        n_repeats, epsilon = divmod(length, n_samples)
        y = np.concatenate([y] * n_repeats + [y[:epsilon]])

    elif n_samples > length:
        # `start or ...` would treat an explicit start=0 as "not provided",
        # so test against None instead.
        if start is None:
            # +1 so that the last valid window (ending at the final sample)
            # can be drawn as well.
            start = np.random.randint(n_samples - length + 1) if is_train else 0
        y = y[start:start + length]

    return y

def trim_and_pad(audio, max_samples):
    """Centre-crop ``audio`` to at most ``max_samples`` samples.

    Clips that are already short enough are returned unchanged. Despite the
    name, no padding is applied: every padding variant in the previous
    version was commented out, which left an ``else:`` branch with no
    statements (a syntax error). That empty branch is removed here.

    Args:
        audio: Array whose first axis is time.
        max_samples: Maximum number of samples to keep.

    Returns:
        The middle ``max_samples`` samples of ``audio`` when it is longer than
        that, otherwise ``audio`` itself.
    """
    audio_length = audio.shape[0]
    if audio_length > max_samples:
        # Drop roughly half of the excess from each end.
        trim_length = audio_length - max_samples
        audio = audio[int(trim_length//2):int(max_samples+trim_length//2)]
    return audio


def segments(audio, fs, segment_size_t=0.05):
    """Split ``audio`` into consecutive chunks of ``segment_size_t`` seconds.

    Args:
        audio: Signal to split along its first axis.
        fs: Sample rate in Hz.
        segment_size_t: Chunk duration in seconds.

    Returns:
        A regular array with one chunk per row when every chunk has the same
        length; otherwise a 1-D object array holding one array per chunk (the
        last chunk is shorter when the length is not a multiple of the chunk
        size).

    Raises:
        ValueError: if the chunk size rounds down to zero samples or less.
    """
    segment_size = int(segment_size_t * fs)  # segment size in samples
    if segment_size <= 0:
        raise ValueError(
            "segment_size_t * fs must be at least one sample, got {}".format(segment_size))

    chunks = [audio[begin:begin + segment_size]
              for begin in range(0, len(audio), segment_size)]

    if len({len(chunk) for chunk in chunks}) <= 1:
        return np.array(chunks)

    # Recent NumPy refuses to build an array from a ragged list implicitly,
    # so fill an object array one slot at a time.
    ragged = np.empty(len(chunks), dtype=object)
    for position, chunk in enumerate(chunks):
        ragged[position] = chunk
    return ragged

def remove_silent(audio, fs, segment_size_t, v2=False):
    """Drop low-energy chunks from ``audio``.

    The signal is cut into chunks of ``segment_size_t`` seconds with
    ``segments``; a chunk is kept when its mean squared amplitude exceeds
    0.4 times the median chunk energy.

    Args:
        audio: Signal to filter.
        fs: Sample rate in Hz.
        segment_size_t: Chunk duration in seconds.
        v2: Unused; kept so existing calls stay valid.

    Returns:
        The kept chunks concatenated in their original order, or ``audio``
        unchanged when no chunk passes the threshold.
    """
    chunks = segments(audio, fs, segment_size_t)
    energies = np.array([(chunk**2).sum() / len(chunk) for chunk in chunks])
    threshold = 0.4 * np.median(energies)
    keep = np.where(energies > threshold)[0]

    # Explicit fallback instead of a bare `except:` around np.concatenate:
    # with nothing to keep (e.g. an all-zero clip) return the input as is.
    if keep.size == 0:
        return audio

    # list(...) so that both a 2-D array of equal-length chunks and an object
    # array of ragged chunks are handled as a plain sequence of arrays.
    return np.concatenate(list(chunks[keep]))

def extract_mfcc_feature(audio, fs, mfcc_config, audio_transforms=None, for_test=False):
    """Compute an MFCC "image" for one clip.

    Args:
        audio: Waveform samples.
        fs: Sample rate of ``audio`` in Hz.
        mfcc_config: Mapping with optional keys ``n_mfcc`` (15), ``n_fft``
            (1024), ``hop_length`` (256), ``max_duration`` in seconds (15) and
            ``target_sr`` (48000); defaults in parentheses. A
            ``do_remove_silent`` key is not read by this function.
        audio_transforms: Optional callable. It is first called as
            ``f(audio, fs)`` expecting ``(audio, fs)`` back; if that fails it
            is called as ``f(samples=audio, sample_rate=fs)`` expecting only
            the audio back.
        for_test: When True, silence is removed and the clip is resampled to
            ``target_sr`` before anything else.

    Returns:
        The MFCC matrix with a new leading axis of size 1, as ``float64``.
    """
    n_mfcc = mfcc_config.get("n_mfcc", 15)
    n_fft = mfcc_config.get("n_fft", 1024)
    hop_length = mfcc_config.get("hop_length", 256)
    max_duration = mfcc_config.get("max_duration", 15)
    target_sr = mfcc_config.get("target_sr", 48000)
    max_samples = int(max_duration * target_sr)

    if for_test:
        # Test clips are raw: strip silence and bring them to the target rate.
        audio = remove_silent(audio, fs, segment_size_t=0.025)
        # Keyword arguments: positional sample rates are rejected by
        # librosa >= 0.10, while the keyword form works on older releases too.
        audio = librosa.resample(audio, orig_sr=fs, target_sr=target_sr)
        fs = target_sr

    if audio_transforms is not None:
        try:
            audio, fs = audio_transforms(audio, fs)
        except Exception:
            # Fall back to the keyword calling convention. `Exception` rather
            # than a bare except so Ctrl-C / SystemExit are not swallowed.
            audio = audio_transforms(samples=audio, sample_rate=fs)

    audio = trim_and_pad(audio, max_samples)
    mfcc = librosa.feature.mfcc(y=audio, sr=fs, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
    return mfcc[None, ...].astype(np.float64)

def extract_feature(audio, fs, segment_size_t=0.025, n_mfcc=26, n_fft=256, hop_length=40, audio_transfroms=None):
    """Stack several frame-level librosa features into one array.

    The blocks are concatenated along the first axis in this order: MFCCs,
    their delta, ``delta(order=2)`` of that delta, zero-crossing rate, the
    STFT and the chroma-STFT.

    Args:
        audio: Waveform samples.
        fs: Sample rate of ``audio`` in Hz.
        segment_size_t: Unused; only the disabled silence removal needed it.
        n_mfcc: Number of MFCC coefficients.
        n_fft: FFT size (also the frame length for the zero-crossing rate).
        hop_length: Hop between frames, in samples.
        audio_transfroms: Optional callable (parameter spelling kept for
            keyword callers). It is first called as ``f(audio, fs)`` expecting
            ``(audio, fs)`` back; if that fails it is called as
            ``f(samples=audio, sample_rate=fs)`` expecting only the audio.

    Returns:
        The concatenated feature array.
    """
    if audio_transfroms is not None:
        try:
            audio, fs = audio_transfroms(audio, fs)
        except Exception:
            # Fall back to the keyword calling convention. `Exception` rather
            # than a bare except so Ctrl-C / SystemExit are not swallowed.
            audio = audio_transfroms(samples=audio, sample_rate=fs)

    mfcc = librosa.feature.mfcc(y=audio, sr=fs, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
    mfcc_delta = librosa.feature.delta(mfcc)
    # NOTE(review): this is an order-2 delta of the first delta, not of the
    # MFCCs themselves — confirm that is intended before changing it.
    mfcc_delta2 = librosa.feature.delta(mfcc_delta, order=2)
    # zero crossing rate
    zcr = librosa.feature.zero_crossing_rate(y=audio, frame_length=n_fft, hop_length=hop_length)
    # NOTE(review): the raw STFT is presumably complex-valued, which would make
    # the whole concatenated result complex — confirm whether a magnitude
    # was intended.
    stft = librosa.stft(y=audio, n_fft=n_fft, hop_length= hop_length)
    # NOTE(review): no `sr` is passed here, so librosa's default sample rate
    # is used instead of `fs` — confirm.
    chroma_stft = librosa.feature.chroma_stft(y=audio, n_fft=n_fft, hop_length=hop_length)
    return np.concatenate([mfcc, mfcc_delta, mfcc_delta2, zcr, stft, chroma_stft])


def mfcc_feature(audio, fs, audio_transforms=None):
    """Build a fixed-width feature map for one clip via ``extract_feature``.

    The hop length is chosen so the clip yields roughly ``num_seg`` (128)
    frames; the result is then truncated or zero-padded to exactly 128
    columns by ``width_padding``.

    Args:
        audio: Waveform samples.
        fs: Sample rate of ``audio`` in Hz.
        audio_transforms: Optional transform forwarded to ``extract_feature``.

    Returns:
        The feature map with a new leading axis of size 1, cast to
        ``float64``.
    """
    segment_size_t = 0.025
    n_mfcc = 39
    n_fft = 256
    num_seg = 128
    # Clamp to 1: a clip shorter than num_seg samples would otherwise produce
    # hop_length == 0.
    hop_length = max(1, len(audio) // num_seg)

    feature = extract_feature(audio,
                              fs,
                              segment_size_t,
                              n_mfcc,
                              n_fft,
                              hop_length,
                              audio_transfroms=audio_transforms)
    # pad or truncate to the same width
    feature = width_padding(feature, num_seg)
    return feature[None, ...].astype(np.float64)

In [10]:
# Settings passed to extract_mfcc_feature. That function reads every key
# except "do_remove_silent", whose lookup is commented out there.
mfcc_config = dict(
    do_remove_silent=True,
    n_mfcc=15,
    n_fft=1024,
    hop_length=256,
    target_sr=48000,
    max_duration=15,
)

# No waveform-level augmentation at inference time.
audio_transforms = None

# model

In [11]:
class TimmBackbone(nn.Module):
    """A timm feature extractor followed by a two-layer classification head.

    The backbone's feature map is average-pooled to one vector per sample,
    passed through ``fc1`` + ReLU + dropout, and projected to ``num_classes``
    logits by ``fc2``.
    """

    def __init__(self, model_name, inchannels=3, num_classes=1, pretrained=True):
        """
        Args:
            model_name: Name passed to ``timm.create_model``.
            inchannels: Number of input channels (``in_chans`` in timm).
            num_classes: Number of output logits.
            pretrained: Whether timm should load pretrained backbone weights.
        """
        super().__init__()
        self.backbone = timm.create_model(model_name, in_chans=inchannels, pretrained=pretrained)
        n_features = self.backbone.num_features
        self.drop = nn.Dropout(0.5)
        self.fc1 = nn.Linear(n_features, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.pool = nn.AdaptiveAvgPool2d((1, 1))

    def _set_backbone_trainable(self, trainable):
        """Switch gradient tracking on or off for every backbone parameter."""
        for param in self.backbone.parameters():
            # The attribute is `requires_grad`; assigning `require_grad` only
            # creates an unrelated attribute and leaves the parameter trainable.
            param.requires_grad = trainable

    def freeze(self):
        """Stop gradients for the backbone; the head stays trainable."""
        self._set_backbone_trainable(False)

    def unfreeze(self):
        """Re-enable gradients for the backbone."""
        self._set_backbone_trainable(True)

    def forward(self, x, fp16=False):
        """Return logits for a batch.

        Args:
            x: Input batch; cast to float32 before entering the backbone.
            fp16: Run the forward pass under CUDA autocast when True.

        Returns:
            Tensor of logits with one row per sample.
        """
        with amp.autocast(enabled=fp16):
            x = x.float()
            feats = self.backbone.forward_features(x)
            # Global average pool, then flatten to (batch, features).
            x = self.pool(feats).view(x.size(0), -1)
            x = F.relu(self.fc1(x))
            x = self.drop(x)
            x = self.fc2(x)
        return x

In [12]:
# Architecture settings for the network that receives the checkpoint weights:
# single-channel input, two output logits, no pretrained download.
model_name = 'tf_efficientnet_b0_ns'
inchannels = 1
num_classes = 2
pretrained = False

model = TimmBackbone(
    model_name,
    inchannels=inchannels,
    num_classes=num_classes,
    pretrained=pretrained,
)

In [13]:
# NOTE(review): `checkpoint_paths` is not used by any cell visible here.
checkpoint_paths = []
# Weights file loaded into `model` by the next cell.
checkpoint_path = '/home/hana/sonnh/Covid19_Cough_Classification/saved/models/11-Covid19-B0/0803_175037/model_best_fold1.pt'

In [14]:
# Load the weights onto the CPU first so the checkpoint can be read no matter
# which device it was saved from; the model is moved to its target device in a
# later cell.
state_dict = torch.load(checkpoint_path, map_location='cpu')
model.load_state_dict(state_dict)
# Inference mode: disables dropout and uses running batch-norm statistics.
model = model.eval()

In [15]:
# Use the first GPU when one is available; otherwise fall back to the CPU so
# the notebook still runs on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# train

In [4]:
# Metadata table: one row per clip with its `uuid` and `assessment_result`.
df_train = pd.read_csv('/home/hana/sonnh/data/AICovidVN/coswara/info.csv')
# Row count first, then a preview of the first rows.
print(len(df_train), df_train.head(), sep='\n')

1806
                           uuid  assessment_result
0  iV3Db6t1T8b7c5HQY2TwxIhjbzD3                  0
1  AxuYWBN0jFVLINCBqIW5aZmGCdu1                  0
2  C5eIsssb9GSkaAgIfsHMHeR6fSh1                  0
3  YjbEAECMBIaZKyfqOvWy5DDImUb2                  0
4  aGOvk4ji0cVqIzCs1jHnzlw2UEy2                  0


In [5]:
# Directory holding the `<uuid>.wav` files read by the inference loops below.
# The two commented-out assignments are alternative input directories.
# audio_dir = '/home/hana/sonnh/data/AICovidVN/aicv115m_public_train_full/aicv115m_final_public_train/train_115M_final_rm_silent_2/'
# audio_dir = '/home/hana/sonnh/data/AICovidVN/aicv115m_public_train_full/aicv115m_final_public_train/public_train_audio_files/'
audio_dir = '/home/hana/sonnh/data/AICovidVN/coswara/audio_rm_slient/'

## mfcc

In [179]:
# Score every training clip with the MFCC pipeline; one softmax row per clip.
train_predicts = []
target_sr = mfcc_config.get("target_sr", 48000)
for uuid in tqdm(list(df_train['uuid'])):
    with torch.no_grad():
        audio_path = '{}/{}.wav'.format(audio_dir, uuid)
        audio, fs  = sf.read(audio_path, dtype="float32")
        audio = remove_silent(audio, fs, segment_size_t=0.025)
        # Keyword arguments: positional sample rates are rejected by
        # librosa >= 0.10, while the keyword form works on older releases too.
        audio = librosa.resample(audio, orig_sr=fs, target_sr=target_sr)
        # The waveform is now at target_sr. Without this update the MFCCs
        # below were computed with the file's original rate, because the disk
        # round trip that used to refresh `fs` is disabled in this cell.
        fs = target_sr
        audio = np.array(audio)

        image = extract_mfcc_feature(audio, fs, mfcc_config, audio_transforms, for_test=False)
        # Add the batch axis expected by the model.
        image = torch.tensor(image).unsqueeze(0)
        image = image.to(device)
        predict = model(image)
        predict = torch.softmax(predict, dim = 1)
    train_predicts.append(predict)

  "Empty filters detected in mel frequency basis. "
100%|██████████| 900/900 [03:02<00:00,  4.94it/s]


## melspec

In [25]:
import cv2
import torch
import numpy as np
import random
import sys

import librosa
import matplotlib.pyplot as plt
import IPython.display as ipd
import librosa.display
import numpy as np
import soundfile as sf
from  soundfile import SoundFile
import glob 
import cv2
from pathlib import Path
from skimage.transform import rescale, resize, downscale_local_mean

In [63]:
def audio2melspec(audio, sr, melspec_config):
    """Render a waveform as a min-max normalised mel-spectrogram image.

    The STFT hop length is derived from the clip length so that the
    spectrogram has about ``width`` frames; the mel spectrogram is converted
    to dB, resized to ``(height, width)`` and scaled to the range [0, 1].

    Args:
        audio: 1-D waveform.
        sr: Sample rate of ``audio`` in Hz.
        melspec_config: Mapping with optional keys ``width`` (448),
            ``height`` (448), ``n_fft`` (2048) and ``hop_length`` ('auto');
            defaults in parentheses.

    Returns:
        2-D array of shape ``(height, width)``; all zeros when the dB range of
        the spectrogram is below 0.001.

    Raises:
        ValueError: if ``hop_length`` is anything other than ``'auto'``.
    """
    image_width = melspec_config.get("width", 448)
    image_height = melspec_config.get("height", 448)
    n_fft = melspec_config.get("n_fft", 2048)
    # The default used to be misspelled 'audo', which sent every config
    # without an explicit hop_length straight into the error branch.
    hop_length = melspec_config.get("hop_length", 'auto')
    win_length = n_fft

    if hop_length != 'auto':
        # Raising a plain string is itself a TypeError in Python 3; raise a
        # real exception with an actionable message instead.
        raise ValueError(
            "melspec_config['hop_length'] must be 'auto' (it is derived from "
            "the image width); got {!r}".format(hop_length))
    hop_length = int((len(audio) - win_length + n_fft) / image_width) + 1

    spect = np.abs(librosa.stft(y=audio, n_fft=n_fft, hop_length=hop_length, win_length=win_length))
    if spect.shape[1] < image_width and hop_length > 1:
        # Hop was one sample too large to reach the requested width; retry
        # with a smaller hop, but never go down to an invalid hop of 0.
        hop_length = hop_length - 1
        spect = np.abs(librosa.stft(y=audio, n_fft=n_fft, hop_length=hop_length, win_length=win_length))
    if spect.shape[1] > image_width:
        spect = spect[:, :image_width]

    n_mels = image_height
    # NOTE(review): `S` is a magnitude (not squared) spectrogram here while
    # power_to_db is applied afterwards; left unchanged so features stay
    # consistent with existing checkpoints — confirm intent.
    spect = librosa.feature.melspectrogram(S=spect, sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=300, fmax=sr//2)
    spect = librosa.power_to_db(spect)
    spect = resize(spect, (image_height, image_width), preserve_range=True, anti_aliasing=True)

    # Min-max normalise; a (nearly) constant image becomes all zeros.
    spect = spect - spect.min()
    smax = spect.max()
    if smax >= 0.001:
        spect = spect / smax
    else:
        spect[...] = 0

    return spect

In [57]:
# Settings passed to audio2melspec, which reads "width", "height", "n_fft"
# and "hop_length"; the remaining keys are not read by that function.
melspec_config = dict(
    width=448,
    height=448,
    n_fft=2048,
    hop_length="auto",
    target_sr=48000,
    max_duration=15,
)

In [32]:
df_train

Unnamed: 0,uuid,subject_age,subject_gender,audio_noise_note,cough_intervals,assessment_result,sample_rate,duration,duration_type,subject_age_type,noise_type,cough_interval_type,labels,fold
1,9341db3f-049a-4ceb-8438-87ca1618a18a,group_34_48,male,,"[{'start': 1.9051594202898552, 'end': 2.414901...",0,22050,6.741361,5,3,1,2,0_5_1_3_2,1.0
3,ff8c21a8-4d05-43d8-96ee-dd33bcd6461e,group_19_33,male,,"[{'end': 1.516455486542442, 'start': 1.1739296...",0,22050,4.010703,1,2,1,2,0_1_1_2_2,1.0
7,9a55aef7-ed77-45ab-976e-411aade1c783,group_19_33,male,"tiếng nói chuyện, trao đổi","[{'start': 2.393298134459697, 'end': 2.7450841...",0,48000,5.461333,3,2,0,2,0_3_0_2_2,1.0
21,bf19a7f1-fdff-42c7-a99d-40546b39f745,group_65_78,male,,"[{'end': 1.47575373406193, 'start': 0.69777486...",1,22050,7.338685,7,5,1,2,1_7_1_5_2,1.0
29,1c2e03d2-1285-42ed-bca2-b3f1ab94aa6b,group_65_78,male,,"[{'end': 1.262585034013605, 'start': 0.9351079...",0,48000,2.133333,0,5,1,2,0_0_1_5_2,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4468,8e98b0ef-d164-4fc3-b28b-24f5b70b33c4,group_14_18,female,,"[{'start': 0.8291162349457271, 'end': 1.325240...",0,48000,3.669333,1,1,1,2,0_1_1_1_2,1.0
4481,e7a7732c-0085-499c-9bd8-13c3de35741e,group_19_33,male,,"[{'start': 1.034520547945205, 'end': 1.2332420...",0,48000,5.546667,3,2,1,2,0_3_1_2_2,1.0
4494,24ac8a4c-b2a2-4dd0-89ac-7ff8801e9558,group_19_33,male,,"[{'end': 2.5490455813953483, 'start': 1.668881...",0,48000,6.144000,4,2,1,2,0_4_1_2_2,1.0
4496,f744cd1c-8f0d-4043-b800-19e39d443161,group_19_33,male,,"[{'start': 1.9232463768115942, 'end': 2.529590...",0,48000,12.202667,7,2,1,2,0_7_1_2_2,1.0


In [58]:
import albumentations as albu
from albumentations.pytorch.transforms import ToTensorV2
# The only image-level step is the conversion to a torch tensor.
image_transform = albu.Compose([ToTensorV2(p=1.0)], p=1.)

# Longest clip kept by trim_and_pad: 15 seconds at 48 kHz.
max_samples = 15 * 48000

In [64]:
# Score every training clip with the mel-spectrogram pipeline; one softmax
# row per clip.
train_predicts = []
list_uuid = list(df_train['uuid'])
for uuid in tqdm(list_uuid):
    with torch.no_grad():
        audio_path = '{}/{}.wav'.format(audio_dir, uuid)
        audio, fs  = sf.read(audio_path, dtype="float32")
        audio = remove_silent(audio, fs, segment_size_t=0.025)
        # Keyword arguments: positional sample rates are rejected by
        # librosa >= 0.10, while the keyword form works on older releases too.
        audio = librosa.resample(audio, orig_sr=fs, target_sr=48000)
        audio = np.array(audio)

        # Round trip through a WAV file: after the re-read, `fs` is 48000.
        # Presumably this also reproduces the on-disk sample format of the
        # preprocessed training clips — TODO confirm.
        sf.write('temp.wav', audio, 48000)
        audio, fs  = sf.read('temp.wav', dtype="float32")

        audio = trim_and_pad(audio, max_samples)
        image = audio2melspec(audio, fs, melspec_config)
        image = image_transform(image = image)['image']
        # The transform already returns a tensor; as_tensor avoids the copy
        # (and the UserWarning) that torch.tensor(tensor) triggers.
        image = torch.as_tensor(image).unsqueeze(0)
        image = image.to(device)
        predict = model(image)
        predict = torch.softmax(predict, dim = 1)
    train_predicts.append(predict)

  "Empty filters detected in mel frequency basis. "
100%|██████████| 901/901 [03:02<00:00,  4.93it/s]


## metric

In [65]:
# Ground-truth labels, in the same row order the clips were scored in.
labels = torch.tensor(df_train['assessment_result'].tolist())
# Stack the per-clip softmax rows into one (n_clips, n_classes) tensor.
train_predicts = torch.cat(train_predicts, dim=0)

In [66]:
train_predicts = train_predicts.cpu()

In [67]:
roc_auc_multi_label(train_predicts, labels)

0.9846658004356632

In [129]:
train_predicts_1 = train_predicts[:, 1].tolist()

In [130]:
sklearn.metrics.roc_auc_score(labels, train_predicts_1)

0.9153680064323205

In [56]:
sum(train_predicts>0.5)

tensor([758, 143])

In [57]:
train_predicts_1_r = np.array(train_predicts_1) >= 0.5

In [58]:
sklearn.metrics.recall_score(labels, train_predicts_1_r)

0.888

In [59]:
list(labels).count(1)

125

In [60]:
list(labels).count(0)

776

In [61]:
# True positives: clips whose label is 1 and whose thresholded prediction is
# also positive.
tp = sum(np.bitwise_and(labels, train_predicts_1_r))
# True negatives: every clip minus those where the label or the thresholded
# prediction (or both) is positive.
tn = len(train_predicts_1_r) - sum(np.bitwise_or(labels, train_predicts_1_r))

In [62]:
tp

tensor(111)

In [33]:
sum(train_predicts_1_r)

141

In [38]:
tn

tensor(753)

In [32]:
sklearn.metrics.precision_score(labels, train_predicts_1_r)

0.9148936170212766

# test


In [69]:
audio_test_dir = '/home/hana/sonnh/data/AICovidVN/aicv115m_public_train_full/aicv115m_final_public_test/public_test_audio_files/'

In [70]:
df_test = pd.read_csv('/home/hana/sonnh/data/AICovidVN/aicv115m_public_train_full/aicv115m_final_public_test/public_test_sample_submission.csv')
df_test

Unnamed: 0,uuid,assessment_result
0,7b3797b0-3b7e-41e3-8b28-e2717eb55f8b,0
1,f0c466b3-7bf2-47e4-9e7f-f8cfc1783764,0
2,a2d668e9-d876-4bf6-bcb3-0cc32ba20c84,0
3,0edbea61-da70-44a4-8ee8-3681027944a6,0
4,1bcee200-1c33-4293-b1e9-5854210d92e8,0
...,...,...
1228,ab3f935b-3056-4a28-aa88-5823cfb0d30d,0
1229,bd8f4a34-33aa-40ad-a390-eb6bc9f04475,0
1230,ba5f136a-6c8e-4671-8d1a-5aeaf217bbc7,0
1231,634af752-a1ae-424d-b14c-cb2950950cac,0


In [158]:
# Score every public-test clip with the MFCC pipeline; one softmax row per clip.
predicts = []
for uuid in tqdm(list(df_test['uuid'])):
    with torch.no_grad():
        audio_path = '{}/{}.wav'.format(audio_test_dir, uuid)
        audio, fs  = sf.read(audio_path, dtype="float32")
        audio = remove_silent(audio, fs, segment_size_t=0.025)
        # Keyword arguments: positional sample rates are rejected by
        # librosa >= 0.10, while the keyword form works on older releases too.
        audio = librosa.resample(audio, orig_sr=fs, target_sr=48000)

        # Round trip through a WAV file: after the re-read, `fs` is 48000, so
        # the MFCCs below use the resampled rate.
        sf.write('temp.wav', audio, 48000)
        audio, fs  = sf.read('temp.wav', dtype="float32")

        image = extract_mfcc_feature(audio, fs, mfcc_config, audio_transforms, for_test=False)
        # Add the batch axis expected by the model.
        image = torch.tensor(image).unsqueeze(0)
        image = image.to(device)
        predict = model(image)
        predict = torch.softmax(predict, dim = 1)
    predicts.append(predict)

100%|██████████| 1233/1233 [04:50<00:00,  4.25it/s]


## melspec

In [71]:
# Score every public-test clip with the mel-spectrogram pipeline; one softmax
# row per clip.
predicts = []
for uuid in tqdm(list(df_test['uuid'])):
    with torch.no_grad():
        audio_path = '{}/{}.wav'.format(audio_test_dir, uuid)
        audio, fs  = sf.read(audio_path, dtype="float32")
        audio = remove_silent(audio, fs, segment_size_t=0.025)
        # Keyword arguments: positional sample rates are rejected by
        # librosa >= 0.10, while the keyword form works on older releases too.
        audio = librosa.resample(audio, orig_sr=fs, target_sr=48000)
        audio = np.array(audio)

        # Round trip through a WAV file: after the re-read, `fs` is 48000.
        # Presumably this also reproduces the on-disk sample format of the
        # preprocessed training clips — TODO confirm.
        sf.write('temp.wav', audio, 48000)
        audio, fs  = sf.read('temp.wav', dtype="float32")

        audio = trim_and_pad(audio, max_samples)
        image = audio2melspec(audio, fs, melspec_config)
        image = image_transform(image = image)['image']
        # The transform already returns a tensor; as_tensor avoids the copy
        # (and the UserWarning) that torch.tensor(tensor) triggers.
        image = torch.as_tensor(image).unsqueeze(0)
        image = image.to(device)
        predict = model(image)
        predict = torch.softmax(predict, dim = 1)
    predicts.append(predict)

  "Empty filters detected in mel frequency basis. "
100%|██████████| 1233/1233 [04:28<00:00,  4.60it/s]


In [72]:
predicts = torch.cat(predicts).cpu()

In [73]:
final_predicts = predicts[:, 1].tolist()

In [74]:
df_test['assessment_result'] = final_predicts

In [75]:
df_test.to_csv('/home/hana/sonnh/covid19_res/11_b0_fold1_fix_result.csv')

In [85]:
!pwd

/home/asilla/sonnh/covid


In [163]:
final_predicts = np.array(final_predicts)


In [164]:
sum(final_predicts>0.5)

63

In [165]:
df_test.head()

Unnamed: 0,uuid,assessment_result
0,7b3797b0-3b7e-41e3-8b28-e2717eb55f8b,0.194483
1,f0c466b3-7bf2-47e4-9e7f-f8cfc1783764,0.020574
2,a2d668e9-d876-4bf6-bcb3-0cc32ba20c84,0.016501
3,0edbea61-da70-44a4-8ee8-3681027944a6,0.013703
4,1bcee200-1c33-4293-b1e9-5854210d92e8,0.022322


In [184]:
audio_path

'/home/hana/sonnh/data/AICovidVN/aicv115m_public_train_full/aicv115m_final_public_train/public_train_audio_files//f616cb8d-370f-43e6-b459-121a9b987c94.wav'

In [170]:
# Debug cell: re-run the preprocessing for a single clip and inspect the
# resampled waveform. `uuid` is whatever value the last inference loop left
# behind.
audio_path = '{}/{}.wav'.format(audio_test_dir, uuid)
audio, fs  = sf.read(audio_path, dtype="float32")
audio = remove_silent(audio, fs, segment_size_t=0.025)
# Keyword arguments: positional sample rates are rejected by librosa >= 0.10,
# while the keyword form works on older releases too.
audio = librosa.resample(audio, orig_sr=fs, target_sr=48000)

print(audio)

[0.0000000e+00 3.0517578e-05 3.0517578e-05 ... 5.7983398e-04 6.1035156e-04
 6.1035156e-04]


In [172]:
np.array(audio)

array([0.0000000e+00, 3.0517578e-05, 3.0517578e-05, ..., 5.7983398e-04,
       6.1035156e-04, 6.1035156e-04], dtype=float32)

In [168]:
sf.write('temp.wav', audio, 48000)
audio, fs  = sf.read('temp.wav', dtype="float32")

In [183]:
2 == 'd'

False