In [1]:
def download_audio_video(dest, filename, youtube_id, start_time, end_time):
    sr = 16000
    link = 'https://www.youtube.com/watch?v='+youtube_id
    
    # download video stream and split into frames
    command = 'cd %s;' % dest
    command += 'ffmpeg -i $(youtube-dl -f ”mp4“ --get-url ' + link + ') ' + '-c:v h264 -c:a copy -ss %s -to %s %s.mp4;' \
               % (start_time, end_time, filename)
    command += 'ffmpeg -i %s.mp4 -vf fps=25 %s-%%02d.jpg;' % (filename, filename)
    os.system(command)
    
    # download audio stream
    command = 'cd %s;' % dest
    command += 'youtube-dl -x --audio-format wav -o o' + filename + '.wav ' + link + ';'
    command += 'ffmpeg -i o%s.wav -ar %d -ac 1 %s.wav;' % (filename,sr,filename)
    command += 'rm o%s.wav' % filename
    os.system(command);
    
    # time crop
    length = end_time - start_time
    command = 'cd %s;' % dest
    command += 'sox %s.wav preprocessed_%s.wav trim %s %s;' % (filename,filename,start_time,length)
    command += 'rm %s.wav' % filename
    os.system(command);
    preprocessed_filename = dest + '/preprocessed_'+ filename + '.wav';
    
    # normalize
    audio,_= lr.load(preprocessed_filename,sr=16000)
    max_amplitude = np.max(np.abs(audio))
    norm_audio = np.divide(audio,max_amplitude)
    wavfile.write(preprocessed_filename, 16000, norm_audio)

In [2]:
def bounding_box_check(faces,x,y):
    # check the center
    for face in faces:
        bounding_box = face['box']
        if(bounding_box[1]<0):
            bounding_box[1] = 0
        if(bounding_box[0]<0):
            bounding_box[0] = 0
        if(bounding_box[0]-50>x or bounding_box[0]+bounding_box[2]+50<x):
            continue
        if (bounding_box[1]-50 > y or bounding_box[1] + bounding_box[3]+50 < y):
            continue
        return bounding_box

In [3]:
def stft(data, fft_size=512, step_size=160):
    pad = np.zeros(192,)
    data = np.concatenate((data,pad),axis=0)
    window = np.concatenate((np.zeros((56,)),np.hanning(fft_size-112),np.zeros((56,))),axis=0)
    win_num = (len(data) - fft_size) // step_size
    out = np.ndarray((win_num, fft_size), dtype=data.dtype)
    for i in range(win_num):
        left = int(i * step_size)
        right = int(left + fft_size)
        out[i] = data[left: right] * window
    F = np.fft.rfft(out, axis=1)
    D = np.zeros((F.shape[0],F.shape[1],2))
    D[:,:,0] = np.real(F)
    D[:,:,1] = np.imag(F)
    return D

In [4]:
def complex_ratio_mask(mix, isolated):
    epsilon = 1e-8
    Yr = mix[:,:,0];
    Yi = mix[:,:,1];
    Sr = isolated[:,:,0];
    Si = isolated[:,:,1]
    mask_num_real = Yr*Sr + Yi*Si
    mask_num_imag = Yr*Si - Yi*Sr
    mask_den = np.square(Yr)+np.square(Yi)
    
    mask = np.zeros(np.shape(mix))
    mask[:,:,0] = mask_num_real / (mask_den + epsilon)
    mask[:,:,1] = mask_num_imag / (mask_den + epsilon)
    
    C = 0.1
    K = 10
    
    num = 1-np.exp(-C*mask)
    den = 1+np.exp(-C*mask)
    num[num == np.inf] = 1
    num[num == -np.inf] = -1
    den[den == np.inf] = 1
    den[den == -np.inf] = -1
    
    return K*num/den;

In [5]:
def face_detect(file,detector,frame_path,x,y):
    name = file.replace('.jpg', '').split('-')
    img = cv2.imread('%s%s'%(frame_path,file))
    x = img.shape[1] * x
    y = img.shape[0] * y
    faces = detector.detect_faces(img)
    # check if detected faces
    if(len(faces)==0):
        return #no face
    bounding_box = bounding_box_check(faces,x,y)
    if(bounding_box == None):
        return
    crop_img = img[bounding_box[1]:bounding_box[1] + bounding_box[3],bounding_box[0]:bounding_box[0]+bounding_box[2]]
    crop_img = cv2.resize(crop_img,(160,160))
    return crop_img;

In [6]:
# prevent tensorflow from using GPU
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [7]:
import pandas as pd;
import numpy as np;
import librosa as lr;
import logging;
import cv2;
import scipy.io.wavfile as wavfile;
from mtcnn import MTCNN;
from keras.models import load_model;
from keras.models import Model;

Using TensorFlow backend.


In [8]:
# create a temporary directory for the downloaded files
os.system("rm -rf tmp2; mkdir tmp2");
dest = 'tmp2';

# choose the data index to start processing at
curr_ind = 1;

# load the AVspeech URLs
# csv_data = pd.read_csv('train.csv');
csv_data = pd.read_csv('test.csv')

# initialize models
detector = MTCNN()
model = load_model('facenet_keras.h5');
avgPool_layer_model = Model(inputs=model.input,outputs=model.get_layer('AvgPool').output)





In [None]:
for i_mixture in range(1000):
    data_features = {
        "train": 0,
        "test" : 1,
        "ind"  : [],
        "faces": [],
        "av_pl": [],
        "emb"  : [],
        "stft" : [],
        "mix"  : [],
        "mask" : [],
    }
    for i_file in range(curr_ind,curr_ind+20):
        youtube_id = csv_data.loc[i_file,'link']
        start_time = csv_data.loc[i_file,'start_time']
        end_time   = csv_data.loc[i_file,'end_time']
        x = csv_data.loc[i_file,'pos_x']
        y = csv_data.loc[i_file,'pos_y']
        try:
            download_audio_video(dest, str(i_file), youtube_id, start_time, end_time)
            face_emb_av_pl = np.zeros((75,1792));
            face_emb = np.zeros((75,512));
            video = np.zeros((75,160,160,3));
            for i_frame in range(0,75):
                filename = str(i_file)+"-%02d.jpg"%(i_frame+1);
                crop_img = face_detect(filename, detector, "tmp2/", x, y);
                assert(np.shape(crop_img) == (160,160,3));
                face = crop_img[np.newaxis,:,:,:];
                video[i_frame,:,:,:] = face;
                face_emb_av_pl[i_frame,:] = avgPool_layer_model.predict(face);
                face_emb[i_frame,:] = model.predict(face);
            filename = "tmp2/preprocessed_" +str(i_file) + ".wav";
            x,sr = lr.load(filename,sr=16000, duration=3);
            data_features["mix"].append(x);
            data_features["stft"].append(stft(x));
            data_features["faces"].append(video);
            data_features["av_pl"].append(face_emb_av_pl);
            data_features["emb"].append(face_emb);
            data_features["ind"].append(i_file);
        except:
            logging.exception("Exception occured for file #" + str(i_file));
            continue;
        if len(data_features["ind"]) == 2:
            curr_ind = i_file + 1;
            break;
    data_features["mix"] = stft(data_features["mix"][0]+data_features["mix"][1]);
    data_features["mask"].append(complex_ratio_mask(data_features["mix"],data_features["stft"][0]))
    data_features["mask"].append(complex_ratio_mask(data_features["mix"],data_features["stft"][1]))
    
    save_filename = "data/"
    save_filename += "train" if data_features["train"] else "test"
    save_filename += "_" + str(data_features["ind"][0])
    save_filename += "_" + str(data_features["ind"][1])
    np.save(save_filename,data_features,allow_pickle=True)



ERROR:root:Exception occured for file #1
Traceback (most recent call last):
  File "<ipython-input-9-2ba69d501024>", line 27, in <module>
    assert(np.shape(crop_img) == (160,160,3));
AssertionError












ERROR:root:Exception occured for file #6
Traceback (most recent call last):
  File "<ipython-input-9-2ba69d501024>", line 27, in <module>
    assert(np.shape(crop_img) == (160,160,3));
AssertionError
ERROR:root:Exception occured for file #10
Traceback (most recent call last):
  File "/home/dan/.local/lib/python3.8/site-packages/librosa/core/audio.py", line 146, in load
    with sf.SoundFile(path) as sf_desc:
  File "/home/dan/.local/lib/python3.8/site-packages/soundfile.py", line 629, in __init__
    self._file = self._open(file, mode_int, closefd)
  File "/home/dan/.local/lib/python3.8/site-packages/soundfile.py", line 1183, in _open
    _error_check(_snd.sf_error(file_ptr),
  File "/home/dan/.local/lib/python3.8/site-packages/soundfile.py", line 1357, in _error_check
    raise RuntimeError(prefix + _ffi.string(err_str).decode('utf-8', 'replace'))
RuntimeError: Error opening 'tmp2/preprocessed_10.wav': System error.

During handling of the above exception, another exception occurred:
























ERROR:root:Exception occured for file #25
Traceback (most recent call last):
  File "<ipython-input-9-2ba69d501024>", line 27, in <module>
    assert(np.shape(crop_img) == (160,160,3));
AssertionError








































ERROR:root:Exception occured for file #30
Traceback (most recent call last):
  File "<ipython-input-9-2ba69d501024>", line 27, in <module>
    assert(np.shape(crop_img) == (160,160,3));
AssertionError
ERROR:root:Exception occured for file #31
Traceback (most recent call last):
  File "<ipython-input-9-2ba69d501024>", line 27, in <module>
    assert(np.shape(crop_img) == (160,160,3));
AssertionError
ERROR:root:Exception occured for file #32
Traceback (most recent call last):
  File "/home/dan/.local/lib/python3.8/site-packages/librosa/core/audio.py", line 146, in load
    with sf.SoundFile(path) as sf_desc:
  File "/home/dan/.local/lib/python3.8/site-packages/soundfile.py", line 629, in __init__
    self._file = self._open(file, mode_int, closefd)
  File "/home/dan/.local/lib/python3.8/site-packages/soundfile.py", line 1183, in _open
    _error_check(_snd.sf_error(file_ptr),
  File "/home/dan/.local/lib/python3.8/site-packages/soundfile.py", line 1357, in _error_check
    raise RuntimeE































ERROR:root:Exception occured for file #52
Traceback (most recent call last):
  File "<ipython-input-9-2ba69d501024>", line 27, in <module>
    assert(np.shape(crop_img) == (160,160,3));
AssertionError
ERROR:root:Exception occured for file #53
Traceback (most recent call last):
  File "/home/dan/.local/lib/python3.8/site-packages/librosa/core/audio.py", line 146, in load
    with sf.SoundFile(path) as sf_desc:
  File "/home/dan/.local/lib/python3.8/site-packages/soundfile.py", line 629, in __init__
    self._file = self._open(file, mode_int, closefd)
  File "/home/dan/.local/lib/python3.8/site-packages/soundfile.py", line 1183, in _open
    _error_check(_snd.sf_error(file_ptr),
  File "/home/dan/.local/lib/python3.8/site-packages/soundfile.py", line 1357, in _error_check
    raise RuntimeError(prefix + _ffi.string(err_str).decode('utf-8', 'replace'))
RuntimeError: Error opening 'tmp2/preprocessed_53.wav': System error.

During handling of the above exception, another exception occurred:



































ERROR:root:Exception occured for file #55
Traceback (most recent call last):
  File "<ipython-input-9-2ba69d501024>", line 26, in <module>
    crop_img = face_detect(filename, detector, "tmp2/", x, y);
  File "<ipython-input-5-36d386d777aa>", line 4, in face_detect
    x = img.shape[1] * x
AttributeError: 'NoneType' object has no attribute 'shape'
ERROR:root:Exception occured for file #57
Traceback (most recent call last):
  File "<ipython-input-9-2ba69d501024>", line 27, in <module>
    assert(np.shape(crop_img) == (160,160,3));
AssertionError
ERROR:root:Exception occured for file #59
Traceback (most recent call last):
  File "<ipython-input-9-2ba69d501024>", line 27, in <module>
    assert(np.shape(crop_img) == (160,160,3));
AssertionError
ERROR:root:Exception occured for file #61
Traceback (most recent call last):
  File "<ipython-input-9-2ba69d501024>", line 27, in <module>
    assert(np.shape(crop_img) == (160,160,3));
AssertionError
ERROR:root:Exception occured for file #63
Trace



ERROR:root:Exception occured for file #66
Traceback (most recent call last):
  File "<ipython-input-9-2ba69d501024>", line 27, in <module>
    assert(np.shape(crop_img) == (160,160,3));
AssertionError
ERROR:root:Exception occured for file #69
Traceback (most recent call last):
  File "<ipython-input-9-2ba69d501024>", line 27, in <module>
    assert(np.shape(crop_img) == (160,160,3));
AssertionError
















ERROR:root:Exception occured for file #72
Traceback (most recent call last):
  File "<ipython-input-9-2ba69d501024>", line 27, in <module>
    assert(np.shape(crop_img) == (160,160,3));
AssertionError
ERROR:root:Exception occured for file #73
Traceback (most recent call last):
  File "<ipython-input-9-2ba69d501024>", line 27, in <module>
    assert(np.shape(crop_img) == (160,160,3));
AssertionError
ERROR:root:Exception occured for file #74
Traceback (most recent call last):
  File "/home/dan/.local/lib/python3.8/site-packages/librosa/core/audio.py", line 146, in load
    with sf.SoundFile(path) as sf_desc:
  File "/home/dan/.local/lib/python3.8/site-packages/soundfile.py", line 629, in __init__
    self._file = self._open(file, mode_int, closefd)
  File "/home/dan/.local/lib/python3.8/site-packages/soundfile.py", line 1183, in _open
    _error_check(_snd.sf_error(file_ptr),
  File "/home/dan/.local/lib/python3.8/site-packages/soundfile.py", line 1357, in _error_check
    raise RuntimeE



















ERROR:root:Exception occured for file #75
Traceback (most recent call last):
  File "<ipython-input-9-2ba69d501024>", line 27, in <module>
    assert(np.shape(crop_img) == (160,160,3));
AssertionError
ERROR:root:Exception occured for file #76
Traceback (most recent call last):
  File "<ipython-input-9-2ba69d501024>", line 26, in <module>
    crop_img = face_detect(filename, detector, "tmp2/", x, y);
  File "<ipython-input-5-36d386d777aa>", line 4, in face_detect
    x = img.shape[1] * x
AttributeError: 'NoneType' object has no attribute 'shape'
ERROR:root:Exception occured for file #78
Traceback (most recent call last):
  File "/home/dan/.local/lib/python3.8/site-packages/librosa/core/audio.py", line 146, in load
    with sf.SoundFile(path) as sf_desc:
  File "/home/dan/.local/lib/python3.8/site-packages/soundfile.py", line 629, in __init__
    self._file = self._open(file, mode_int, closefd)
  File "/home/dan/.local/lib/python3.8/site-packages/soundfile.py", line 1183, in _open
    _e