In [1]:
import librosa
import IPython.display as ipd
from scipy import signal
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
from glob import glob
from scipy import signal
import random
import cv2
import os

## Dataset

In [16]:
def load_audio(filename=None, sr=1600, second=3, samples=None):
    """Return a waveform padded or trimmed to a fixed duration.

    Parameters
    ----------
    filename : str or None
        Path handed to ``librosa.load``. When given it takes precedence
        over ``samples``.
    sr : int
        Sample rate requested from ``librosa.load``; also used to convert
        ``second`` into a sample count when ``samples`` is supplied directly.
        NOTE(review): 1600 Hz is very low for speech (16000 is the usual
        choice) -- confirm this default is intentional.
    second : int, float or None
        Target duration in seconds. ``None`` disables padding and trimming.
    samples : array-like or None
        Already-decoded waveform, used only when ``filename`` is None.

    Returns
    -------
    numpy.ndarray
        The waveform; when ``second`` is set its length is exactly
        ``round(rate * second)`` samples (zero-padded at the end or cut).

    Raises
    ------
    ValueError
        If neither ``filename`` nor ``samples`` is provided.
    """
    if filename is not None:
        # Use the rate librosa reports so the length math is right even
        # if the caller passed sr=None (native rate).
        samples, rate = librosa.load(filename, sr=sr)
    elif samples is not None:
        samples, rate = np.asarray(samples), sr
    else:
        raise ValueError("load_audio needs either `filename` or `samples`")

    if second is None:
        return samples

    # An integer length keeps np.pad and slicing valid for fractional seconds.
    target_len = int(round(rate * second))
    if len(samples) < target_len:
        samples = np.pad(samples, (0, target_len - len(samples)), 'constant')
    elif len(samples) > target_len:
        samples = samples[:target_len]
    return samples


def mel_feature(sample, normalization=False, resize=None, sr=1600):
    """Compute a log-mel spectrogram of a waveform.

    Parameters
    ----------
    sample : numpy.ndarray
        Waveform passed to ``librosa.feature.melspectrogram``.
    normalization : bool
        When true, standardise the result to zero mean and unit standard
        deviation (the division is skipped for a constant spectrogram).
    resize : int or None
        When truthy, the result is resized to ``(resize, resize)`` with
        ``cv2.resize``.
    sr : int
        Sample rate of ``sample``; defaults to the 1600 used elsewhere in
        this notebook.
        NOTE(review): fmax=4000 is above sr / 2 for the default rate --
        confirm the intended sample rate / band limits.

    Returns
    -------
    numpy.ndarray
        float32 array with 40 mel bands on the first axis (before resizing).
    """
    # `y` must be passed by keyword: recent librosa releases reject a
    # positional waveform argument.
    spectrogram = librosa.feature.melspectrogram(
        y=sample, sr=sr, n_mels=40, hop_length=160, n_fft=480,
        fmin=20, fmax=4000)
    # astype() copies, so the in-place normalisation below never touches
    # the array returned by librosa.
    spectrogram = librosa.power_to_db(spectrogram).astype(np.float32)
    if normalization:
        spectrogram -= spectrogram.mean()
        spread = spectrogram.std()
        if spread > 0:
            spectrogram /= spread
    if resize:
        spectrogram = cv2.resize(spectrogram, (resize, resize))
    return spectrogram

"""

def plot_specgram(sample, window_size=20, step_size=10, eps=1e-10):
    def log_specgram(audio, sample_rate = 1600, window_size=20,
                     step_size=10, eps=1e-10):
        nperseg = int(round(window_size * sample_rate / 1e3))
        noverlap = int(round(step_size * sample_rate / 1e3))
        freqs, times, spec = signal.spectrogram(audio,
                                                fs=sample_rate,
                                                window='hann',
                                                nperseg=nperseg,
                                                noverlap=noverlap
                                                )
        return freqs, times, np.log(spec.T.astype(np.float32) + eps)

    freqs, times, spectrogram = log_specgram(sample)

    fig = plt.figure(figsize=(14, 8))
    ax1 = fig.add_subplot(211)
    ax1.set_title('Raw wave')
    ax1.set_ylabel('Amplitude')
    ax1.plot(np.linspace(0, self.sr / len(self.samples), self.sr), self.samples)

    ax2 = fig.add_subplot(212)
    ax2.imshow(spectrogram.T, aspect='auto', origin='lower',
               extent=[times.min(), times.max(), freqs.min(), freqs.max()])
    ax2.set_yticks(freqs[::16])
    ax2.set_xticks(times[::16])
    ax2.set_title('Spectrogram')
    ax2.set_ylabel('Freqs in Hz')
    ax2.set_xlabel('Seconds')
"""

def play(sample, sr=1600, autoplay=True):
    """Return an inline audio player for a waveform.

    Parameters
    ----------
    sample : array-like
        Audio samples handed to ``IPython.display.Audio`` as ``data``.
    sr : int
        Playback sample rate; defaults to the 1600 used by the loaders in
        this notebook.
    autoplay : bool
        Whether playback starts as soon as the widget is rendered.

    Returns
    -------
    IPython.display.Audio
    """
    # The rate must be passed by keyword: the second positional parameter
    # of Audio is `filename`, not `rate`.
    return ipd.Audio(sample, rate=sr, autoplay=autoplay)

def timeshift(wav, sr=1600, ms=100):
    """Shift a waveform by a random offset, keeping its length.

    The offset is drawn uniformly from ``[-sr*ms//1000, sr*ms//1000]``
    samples. A negative draw prepends zeros and drops the tail; a
    non-negative draw appends zeros and drops the head.
    """
    max_offset = (sr * ms) // 1000
    offset = random.randint(-max_offset, max_offset)

    if offset < 0:
        lead = -offset
        padded = np.pad(wav, (lead, 0), "constant")
        return padded[:len(padded) - lead]

    padded = np.pad(wav, (0, offset), "constant")
    return padded[offset:]


In [3]:
# Load every recording as a (waveform, label) pair.
# os.listdir returns entries in arbitrary order and includes hidden files and
# sub-directories, so filter and sort to keep positional indexing reproducible.
voice_dir = './voice/'
files = sorted(
    f for f in os.listdir(voice_dir)
    if not f.startswith('.') and os.path.isfile(os.path.join(voice_dir, f))
)
# Label = file name without extension and trailing take number
# (e.g. "angry1.wav" -> "angry").
audios = [
    (load_audio(os.path.join(voice_dir, f), 1600),
     os.path.splitext(f)[0].rstrip('0123456789'))
    for f in files
]


In [15]:
# Peek at the waveform (element 0 of the pair) of the second loaded clip.
audios[1][0]

array([ 0.00225449,  0.00589441,  0.00647261, ..., -0.00945333,
       -0.00452255,  0.00032612], dtype=float32)

In [21]:
# Pair each clip's mel feature matrix with its label.
mel_features = [(mel_feature(waveform), label) for waveform, label in audios]



In [25]:
# Inspect the feature matrix shape of the third clip.
mel_features[2][0].shape

(40, 31)

In [29]:
# Feature matrices of three clips: `a` is compared against `b` and `c`.
a, b, c = (mel_features[idx][0] for idx in (1, 2, 8))
# Norm of the element-wise difference between the feature matrices.
dist1, dist2 = np.linalg.norm(a - b), np.linalg.norm(a - c)
print(dist1, dist2, sep="\n")

606.21063
291.127


In [32]:
# Sum of squared differences between `a` and each of the other two clips.
print(*(((a - other) * (a - other)).sum() for other in (b, c)), sep="\n")

367491.3
84754.97


## 语谱图

In [None]:
# Plot the mel spectrogram (in dB) of one clip.
# `y` / `sr` were not defined anywhere in this notebook; the second loaded
# clip and the loader's sample rate are used here -- swap in any
# (waveform, rate) pair.
y, sr = audios[1][0], 1600
# Plot the mel power spectrogram itself: the previous version discarded it
# and sent a complex STFT through power_to_db while labelling the axis 'mel'.
mel_power = librosa.feature.melspectrogram(y=y, sr=sr)
mel_db = librosa.power_to_db(mel_power, ref=np.max)
librosa.display.specshow(mel_db, sr=sr, y_axis='mel', x_axis='time')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel spectrogram')
plt.tight_layout()
plt.show()

## 录音/保存

In [None]:
import pyaudio
import wave

CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 44100
RECORD_SECONDS = 3  # recording duration in seconds
WAVE_OUTPUT_FILENAME = "./voice/thinking5.wav"

# Record from the default input device, then write the frames to a WAV file.
# try/finally guarantees the stream and the PyAudio instance are released
# even if reading from the device fails part-way through.
p = pyaudio.PyAudio()
frames = []
try:
    # Query the sample width while the PyAudio instance is still alive.
    sample_width = p.get_sample_size(FORMAT)
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)
    try:
        print("* recording")
        for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
            data = stream.read(CHUNK)
            frames.append(data)
        print("* done recording")
    finally:
        stream.stop_stream()
        stream.close()
finally:
    p.terminate()

# The context manager closes the file even if a write fails.
with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(sample_width)
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))

## 画FFT图

In [None]:
from tkinter import *
import wave
from scipy.fftpack import fft,ifft
import matplotlib.pyplot as plt
import numpy as np
 
def data_fft(data, time, time_start, time_end, show_fft=False):
    """Plot the left channel inside a time window, optionally with its FFT.

    Parameters
    ----------
    data : sequence of array-like
        Per-channel samples; only ``data[0]`` (the left channel) is used.
    time : array-like
        Timestamp of every sample, aligned with ``data[0]``.
    time_start, time_end : number
        Inclusive bounds of the window to extract.
    show_fft : bool
        When false (default) only the windowed waveform is drawn. When true
        a 2x2 figure is drawn: waveform, full magnitude spectrum, spectrum
        normalised by the sample count, and the first half of the
        normalised spectrum.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Raises
    ------
    ValueError
        If no sample falls inside ``[time_start, time_end]``.
    """
    time = np.asarray(time)
    # Vectorised selection replaces a per-sample np.append loop, which
    # re-copied the growing arrays on every iteration.
    picked = np.flatnonzero((time >= time_start) & (time <= time_end))
    if picked.size == 0:
        raise ValueError(
            "no samples between %r and %r" % (time_start, time_end))

    t = time[picked].astype(np.float64)
    y = np.asarray(data[0], dtype=np.float64)[picked]  # left channel only

    if show_fft:
        yf = np.abs(fft(y))        # magnitude spectrum
        yf1 = yf / len(t)          # normalised by the number of samples
        half = len(t) // 2         # spectrum is symmetric: keep one half
        xf = np.arange(len(y))     # FFT bin index (not converted to Hz)

        plt.subplot(221)
        plt.plot(t, y)
        plt.title('Original wave')

        plt.subplot(222)
        plt.plot(xf, yf, 'r')
        plt.title('FFT of Mixed wave(two sides frequency range)',
                  fontsize=7, color='#7A378B')

        plt.subplot(223)
        plt.plot(xf, yf1, 'g')
        plt.title('FFT of Mixed wave(normalization)', fontsize=9, color='r')

        plt.subplot(224)
        plt.plot(xf[:half], yf1[:half], 'b')
        plt.title('FFT of Mixed wave)', fontsize=10, color='#F08080')
    else:
        plt.plot(t, y)
        plt.title('Original wave')

    plt.show()
        
        
def main():
    """Read one recording and plot its first two seconds via data_fft."""
    # NOTE(review): read_wave_data is not defined in the visible part of
    # this notebook -- confirm it is provided before running.
    channels, timestamps = read_wave_data('./voice/angry1.wav')
    data_fft(channels, timestamps, 0, 2)
	

# Run the demo when this cell is executed.
main()