In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import itertools 
import os
from tqdm import tqdm
from subprocess import check_output
import cv2
import re
from PIL import Image
from scipy import ndimage
from pathlib import Path

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics

import keras
from keras.models import Model
from keras.models import Sequential
from keras.utils import np_utils, to_categorical
from keras.callbacks import Callback
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras import optimizers
from keras import layers
from keras.layers import Dense, Conv2D, MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dropout
from keras.regularizers import l2
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.vgg16 import VGG16

import IPython.display as ipd # Solo per Jupyter Notebook
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile as wav
from scipy.fftpack import rfft, fft, irfft, ifft, fftfreq
from scipy.signal import fftconvolve
import  tarfile
import urllib.request as request
import os, sys
from bs4 import BeautifulSoup
import random
import librosa
import sounddevice as sd

from keras.layers import LSTM, Input, Embedding, Conv1D, SpatialDropout1D
from keras.layers import MaxPooling1D, Dense, GRU, Flatten, Dropout, BatchNormalization, GlobalMaxPool1D
from keras.models import Model
from keras.optimizers import Adam, Adadelta
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, ReduceLROnPlateau
import keras.backend as K
from keras.models import load_model


from utils import plot_confusion_matrix, whiten, preprocess_instances
import time

%load_ext autoreload

%autoreload 2



# Sample rate (samples per second) used by the audio helpers to turn a
# duration in seconds into a number of samples.
rate = 4000.0

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Loading the image models

In [2]:
# Configuration for the image models.
im_size = 224    # side length, in pixels, of the square network input
channels = 3     # colour channels per pixel

# Training-time settings; not referenced by the inference cells visible here.
batch_size = 128
num_classes = 2
data_augmentation = True

# Shape of a one-image batch handed to `predict`.
target_shape = (1, im_size, im_size, channels)

In [3]:
# Gender classifier: VGG16 convolutional base followed by a dense head.
pretrained_model = VGG16(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3),
    pooling='max',
)
# Freeze the first 15 layers of the base; the remaining ones stay trainable.
for frozen_layer in pretrained_model.layers[:15]:
    frozen_layer.trainable = False

# The layer order must match the saved weights file exactly.
firstnet = Sequential([
    pretrained_model,
    Dense(1024, activation='relu'),
    Dense(512, activation='relu'),
    Dropout(0.1),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dropout(0.1),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dropout(0.1),
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),
    Dense(4, activation='relu'),
    Dense(2, activation='sigmoid'),
])

# Restore the trained weights, then compile so the model is ready for `predict`.
firstnet.load_weights('sex_weightsDUE2.h5')
firstnet.compile(
    loss='binary_crossentropy',
    optimizer=optimizers.Adam(lr=0.0001, decay=0.0000001),
    metrics=['accuracy'],
)

In [4]:
# Ethnicity classifier: a second VGG16 base with its own, wider dense head.
pretrained_model2 = VGG16(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3),
    pooling='max',
)
# Freeze the first 15 layers of the base; the remaining ones stay trainable.
for frozen_layer in pretrained_model2.layers[:15]:
    frozen_layer.trainable = False

# The layer order must match the saved weights file exactly.
secondnet = Sequential([
    pretrained_model2,
    Dense(2048, activation='relu'),
    Dense(1024, activation='relu'),
    Dropout(0.1),
    Dense(512, activation='relu'),
    Dense(256, activation='relu'),
    Dropout(0.1),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dropout(0.1),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(4, activation='softmax'),
])

# Restore the trained weights, then compile so the model is ready for `predict`.
secondnet.load_weights('4th_tranche_best_model4classi.h5')
secondnet.compile(
    loss='categorical_crossentropy',
    optimizer=optimizers.SGD(lr=0.0001, momentum=0.9, decay=0.000001),
    metrics=['accuracy'],
)

In [5]:
# Human-readable class labels; each list is indexed with the argmax of the
# matching network's prediction (firstnet -> gender, secondnet -> ethnicity).
gender = [
    'Male',
    'Female',
]
ethnicity = [
    'White',
    'Black',
    'Asian',
    'Indian',
]

## Loading audio model

In [6]:
# Language labels; a label's position is the class index it corresponds to in
# the audio model's averaged prediction.
langs = ['German', 'English', 'Spanish', 'French', 'Italian', 'Russian']

In [7]:
def pick_random_step(voice, dt, sample_rate=None):
    """Cut a randomly positioned window of `dt` seconds out of `voice`.

    Parameters
    ----------
    voice : sequence or 1-D array
        Audio samples to cut from.
    dt : float
        Window length in seconds.
    sample_rate : float, optional
        Samples per second of `voice`. Defaults to the module-level `rate`.

    Returns
    -------
    The slice ``voice[start:start + window]`` where ``window = int(dt * sample_rate)``
    and ``start`` is drawn uniformly from every position at which the window fits.

    Raises
    ------
    ValueError
        If `voice` holds fewer samples than one window.
    """
    if sample_rate is None:
        sample_rate = rate
    window = int(dt * sample_rate)
    n_samples = len(voice)
    if n_samples < window:
        raise ValueError(
            'voice has %d samples but a %d-sample window was requested'
            % (n_samples, window)
        )
    # randint's upper bound is exclusive: the +1 makes the last valid start
    # reachable and lets a clip of exactly one window length work.
    start = np.random.randint(0, n_samples - window + 1)
    return voice[start:start + window]

In [8]:
def prepro_audio_setting(langs, dt):
    """Build the preprocessing function applied to live audio before inference.

    Parameters
    ----------
    langs : list
        Not used by the returned function; kept so existing calls keep working.
    dt : float
        Length in seconds of each random window cut from the audio.

    Returns
    -------
    callable
        ``prepro_audio_streaming(live_audio)``, which cuts 10 random windows
        from `live_audio`, stacks them and passes them through
        ``preprocess_instances(1, whitening=True)``.
    """
    def prepro_audio_streaming(live_audio):
        # `np.float` was only an alias of the builtin `float` and was removed
        # in NumPy 1.24; `float` gives the same float64 dtype. Converting once
        # up front avoids copying the full clip for each of the 10 windows.
        samples = live_audio.astype(float)
        live_steps = np.array([pick_random_step(samples, dt) for _ in range(10)])
        prepro = preprocess_instances(1, whitening=True)
        # 16000 samples per window corresponds to dt=4 with rate=4000, the
        # configuration this notebook uses.
        X_norm = prepro(live_steps.reshape(-1, 16000, 1))
        return X_norm
    return prepro_audio_streaming

In [9]:
# File holding the saved Keras model whose predictions are mapped onto `langs`.
best_model_path = 'best_model_saved.hdf5'
# Restore the complete model from disk so it can be used for `predict`.
best_model = load_model(best_model_path)

In [10]:
# Module-level audio preprocessor. dt=4 seconds at rate=4000 gives windows of
# 16000 samples, the length the audio pipeline reshapes to.
prepro_audio = prepro_audio_setting(langs, 4)

### Functions for model inference

In [11]:
def process_frame(im, audio):
    """Run the image and audio models on one video frame and one audio clip.

    Parameters
    ----------
    im : array-like
        Video frame; it is resized to 224x224 and scaled to [0, 1].
    audio : 1-D array
        Recorded samples. Every second sample is kept before preprocessing.

    Returns
    -------
    tuple of str
        ``(ethnicity, gender, language)``. Ethnicity and gender are both
        ``'None'`` when the image pipeline fails for any reason (including an
        unusable frame). Language is ``'Say something!'`` when the clip is too
        quiet, otherwise the predicted entry of `langs`.
    """
    try:
        # Frame preparation sits inside the guarded region so an unusable frame
        # (for example None from a failed capture) gives the fallback labels.
        img = cv2.resize(np.asarray(im), (224, 224)) / 255.0
        batch = img.reshape(target_shape)
        # `predict` returns one row per image; take the single row's argmax
        # explicitly instead of calling int() on a 1-element array.
        gen = gender[int(np.argmax(firstnet.predict(batch), axis=1)[0])]
        ethn = ethnicity[int(np.argmax(secondnet.predict(batch), axis=1)[0])]
    except Exception:
        # Best effort: inference errors must not stop the caller, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        ethn, gen = 'None', 'None'

    # Keep every second sample, then average the per-window predictions.
    my_input = prepro_audio(audio[::2])
    my_preds = best_model.predict(my_input.reshape(-1, 16000, 1))
    pred = np.mean(my_preds, axis=0)
    my_lang = langs[int(np.argmax(pred))]

    # Treat the clip as silence when either the whole clip or its last 24000
    # samples have a very low median amplitude.
    amplitude = np.abs(audio)
    if np.median(amplitude) < 0.005 or np.median(amplitude[-24000:]) < 0.002:
        output_lang = 'Say something!'
    else:
        output_lang = my_lang
    return ethn, gen, output_lang

def write_frame(im, ethn, gen, lang):
    """Draw the three prediction labels onto `im` and return it.

    The captions are written in black down the left edge, in the order gender,
    ethnicity, language, at y = 50, 100 and 150. `im` is modified in place.
    """
    typeface = cv2.FONT_HERSHEY_SIMPLEX
    for row, caption in enumerate((gen, ethn, lang), start=1):
        cv2.putText(im, caption, (0, 50 * row), typeface, 1, (0, 0, 0), 2, cv2.LINE_AA)
    return im

In [12]:
def live_testing():
    """Run the live webcam + microphone demo until 'q' is pressed.

    Audio is recorded in the background at 8000 samples per second. Whenever
    more than 5.1 s have passed since the current recording started, the
    current frame and the finished recording go to `process_frame`, and a new
    5 s recording is started. Every frame is shown in a window named 'Video'
    with the most recent labels drawn on it.

    The loop also ends if the camera stops delivering frames. The camera, the
    audio stream and the display window are released however the loop ends.

    Raises
    ------
    RuntimeError
        If the default camera (device 0) cannot be opened.
    """
    sample_rate = 8000
    myrecording = sd.rec(int(8.1 * sample_rate), samplerate=sample_rate, channels=1)[:, 0]
    t_rec = time.time()

    cap = cv2.VideoCapture(0)
    try:
        if not cap.isOpened():
            raise RuntimeError('Could not open the default camera (device 0).')

        # `sd.rec` returns immediately and fills its buffer in the background.
        # Block until the first clip is complete instead of spinning on the
        # clock, so the buffer is never read while its tail is still unfilled.
        sd.wait()

        # Placeholders shown until the first prediction is available.
        ethn, gen, lang = 'None', 'None', 'None'
        while True:
            ok, frame = cap.read()
            if not ok:
                # No frame was delivered; stop instead of failing in the
                # drawing and display calls below.
                break
            if time.time() - t_rec > 5.1:
                ethn, gen, lang = process_frame(frame, myrecording)
                myrecording = sd.rec(int(5. * sample_rate), samplerate=sample_rate, channels=1)[:, 0]
                # Time the next read from the moment this recording started,
                # so inference time cannot cause a half-filled buffer.
                t_rec = time.time()

            frame = write_frame(frame, ethn, gen, lang)
            cv2.imshow('Video', frame)

            if cv2.waitKey(20) & 0xFF == ord('q'):
                break
    finally:
        sd.stop()
        cap.release()
        cv2.destroyAllWindows()

In [None]:
# Start the live demo; it keeps running until 'q' is pressed in the video window.
live_testing()

  r = func(a, **kwargs)
