# Imports e definições

In [1]:
import plotly.express as px
import plotly.graph_objects as go
# from sklearn.model_selection import train_test_split
# from tqdm import tqdm
# from torch.utils.data import Dataset, DataLoader
# import torch.optim as optim
# from torcheval.metrics.functional import multiclass_f1_score
# from torcheval.metrics.functional import binary_accuracy
# from torchinfo import summary

In [2]:
import numpy as np
import pandas as pd
from scipy import signal
from scipy.io import wavfile
from scipy.signal import find_peaks, peak_widths
import os
import pickle
import random
import torch
from torch import nn

In [3]:
# Pick the torch backend to run on: CUDA GPU first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
# print(f"Using {device} device")

In [4]:
def LPC3janelas(s, fa):
    """Fit linear-prediction coefficients on three windows of a signal segment.

    The segment is covered by three windows of one third of its length
    (start, middle, end). In each window every sample is predicted from the
    `ordem` samples that precede it, and the predictor is obtained by
    least squares through the pseudo-inverse.

    Parameters
    ----------
    s : array with a `.shape`; the segment samples along axis 0.
    fa : number; used to derive the accepted length range
        (0.02*fa .. 0.2*fa samples) and the predictor order (0.003*fa).

    Returns
    -------
    numpy.ndarray of shape (ordem, 3), one column per window, with the
    coefficient of the oldest sample in row 0; or an empty list when the
    segment length is outside the accepted range.
    """
    total = s.shape[0]
    # Guard clause: segments that are too short or too long are not analysed.
    if total < int(np.round(0.02*fa)) or total > int(np.round(0.2*fa)):
        return []

    largura = int(np.round(total/3))
    inicios = (0, int(np.round((total-largura)/2))-1, total-largura)
    ordem = int(np.round(0.003*fa))

    coefs = np.zeros((ordem, 3))
    # Column m holds the window delayed by m samples; the last column is the
    # prediction target and the others are the regressors.
    atrasos = np.zeros((largura-ordem-1, ordem+1))
    for coluna, inicio in enumerate(inicios):
        trecho = s[inicio:inicio+largura]
        for m in range(ordem+1):
            atrasos[:, m] = trecho[m:m-ordem-1]
        regressores, alvo = atrasos[:, :-1], atrasos[:, -1]
        coefs[:, coluna] = np.linalg.pinv(regressores).dot(alvo)
    return coefs

In [5]:
def CP2vec(CP, fa):
    """Turn per-window prediction coefficients into one normalised feature vector.

    For every column of `CP` the magnitude of 1/A(e^{jw}) is evaluated on a
    descending grid w = k*pi/fa, k = 5000, 4900, ..., 100, where
    A has coefficients [1, -CP[-1], ..., -CP[0]]. The magnitudes are scaled by
    their global maximum, log-compressed (log10(x + 0.01) + 2), concatenated
    column after column and divided by the Euclidean norm.

    Parameters
    ----------
    CP : 2-D array, one column of coefficients per window.
    fa : number; divides the frequency grid (presumably the sampling rate in
        Hz, which would make the grid span 2500 Hz down to 50 Hz - confirm).

    Returns
    -------
    1-D numpy.ndarray of length len(w) * CP.shape[1] with unit L2 norm.
    """
    w = np.arange(5000/fa*np.pi, 50/fa*np.pi, -100/fa*np.pi)
    n_janelas = CP.shape[1]
    P = np.zeros((len(w), n_janelas))
    for k in range(n_janelas):
        # Flip so the coefficient of the most recent sample comes right after the leading 1.
        h = np.hstack((1, -np.flipud(CP[:, k])))
        atrasos = np.arange(len(h))
        for i, wi in enumerate(w):
            denominador = np.sum(np.exp(-1j*wi*atrasos)*h)
            P[i, k] = abs(1/denominador)
    P = np.log10(P/P.max() + 0.01) + 2
    norma = np.sqrt(np.sum(P.flatten()**2))
    # Column-major flattening keeps each window's spectrum contiguous.
    return P.flatten(order='F')/norma

In [6]:
def lempelziv76(s):
    """Parse a symbol sequence into Lempel-Ziv words and score its complexity.

    Starting after the first symbol, a candidate word s[p:p+L] is grown one
    symbol at a time while it can still be found in the text that precedes
    its last symbol; as soon as it cannot, it is stored as a new word and
    parsing restarts right after it. Whatever is left at the end is stored
    as the final word.

    Parameters
    ----------
    s : sequence of strings (list, str or array of str); symbols are
        concatenated with ''.join for the substring search.

    Returns
    -------
    dic : list with the parsed words, in order of appearance.
    cLZ : len(dic) * (log2(len(dic)) + 1) / len(s); 0.0 for an empty input.
    """
    N = len(s)
    if N == 0:
        # Nothing to parse; the original code raised IndexError here.
        return [], 0.0

    dic = [s[0]]
    p = 1
    # NOTE(review): the very first candidate starts with two symbols (L = 2)
    # whereas later candidates restart at L = 1. Kept as in the original so
    # existing results do not change - confirm this is intended.
    L = 2
    while p+L < N:
        candidata = ''.join(s[p:p+L])
        if ''.join(s[:p+L-1]).find(candidata) == -1:
            dic.append(candidata)
            p = p+L
            L = 1
        else:
            L = L+1
    # Only a one-symbol input leaves no remainder; do not store an empty word then.
    if p < N:
        dic.append(''.join(s[p:]))
    # Lempel-Ziv complexity, in bits per symbol:
    cLZ = len(dic)*(np.log2(len(dic))+1)/N
    return dic, cLZ

In [7]:
def perfEner(s, fa):
    """Compute the short-time energy profile of a signal.

    The signal is cut into frames of round(0.1*fa) samples taken every
    round(0.03*fa) samples, and the sum of squared samples of each frame is
    returned.

    Parameters
    ----------
    s : array-like of samples along axis 0. Integer input (e.g. int16 PCM as
        returned by scipy.io.wavfile.read) is converted to float64 first so
        that squaring cannot wrap around.
    fa : number; converts the 0.1 and 0.03 factors into sample counts.

    Returns
    -------
    numpy.ndarray (float64) with (len(s) - frame) // step energies; empty when
    the signal is shorter than one frame.
    """
    s = np.asarray(s)
    if np.issubdtype(s.dtype, np.integer):
        # Squaring fixed-width integers overflows silently in numpy.
        s = s.astype(np.float64)
    janela = np.round(0.1*fa).astype(int)
    passo = np.round(0.03*fa).astype(int)
    # Clamp at zero: a signal shorter than one frame has no energy frames.
    N2 = max((s.shape[0]-janela)//passo, 0)
    E = np.zeros(N2)
    for i in range(N2):
        saux = s[i*passo:i*passo+janela]
        E[i] = (saux**2).sum()
    return E

In [8]:
def segmenta(E, pfala):
    """Split each speech interval of an energy profile into peak-centred segments.

    For every interval [pfala[0, i], pfala[1, i]) of `E`, energy peaks at
    least 4 frames apart are located, low-prominence peaks are discarded and
    each remaining peak is turned into a segment bounded by its width
    measured at 70 % of its prominence below the peak.

    Parameters
    ----------
    E : 1-D array holding the energy profile.
    pfala : integer array of shape (2, n); row 0 holds interval starts and
        row 1 interval ends, as indices into `E`.

    Returns
    -------
    segs : integer array of shape (2, m) with segment start (row 0) and end
        (row 1) indices expressed in `E` coordinates.
    qp : float array of length n with the number of peaks kept per interval
        (0 where no peak was found).
    """
    n_trechos = pfala.shape[1]
    qp = np.zeros(n_trechos)
    blocos = [np.empty((2, 0), dtype=int)]
    for i in range(n_trechos):
        inicio = pfala[0, i]
        trecho = E[inicio:pfala[1, i]]
        picos, _ = find_peaks(trecho, distance=4)
        if picos.size == 0:
            continue
        proem = signal.peak_prominences(trecho/max(trecho), picos)[0]
        # NOTE(review): the test is relative to the largest prominence, but the
        # filter compares the amplitude-normalised prominence itself with
        # 0.01 - confirm that this asymmetry is intended.
        if np.any(proem/max(proem) < 0.01):
            picos = picos[proem > 0.01]
        medidas = peak_widths(trecho, picos, rel_height=0.7)
        esquerda = np.round(medidas[2]).astype(int)
        direita = np.round(medidas[3]).astype(int)
        qp[i] = len(picos)
        # Shift from interval-local indices back to positions in E.
        blocos.append(np.stack((esquerda, direita)) + inicio)
    return np.hstack(blocos), qp

In [9]:
def wav2ener2fon2(s, fa):
    """Label every energy frame of a recording with a predicted phoneme symbol.

    Steps:
      1. perfEner gives the energy profile E.
      2. Speech intervals are delimited where E crosses 1 % of its maximum
         upwards (start) and downwards (end); unpaired edges are ignored.
      3. segmenta splits the intervals into peak-centred segments.
      4. For each segment, prediction coefficients are fitted on three
         windows (same procedure as LPC3janelas, without its length limits)
         and converted to a 150-value vector by CP2vec.
      5. The vectors are classified in one batch and the arg-max class index
         is looked up in F0 to get the symbol written over the segment frames.

    Relies on names defined elsewhere in the notebook: `device`, `model` (a
    torch module mapping (n, 150) inputs to class scores) and `F0` (object
    whose `.values` is indexed by class number). NOTE(review): `model` and
    `F0` are not defined in the cells above this function - make sure they
    exist before calling.

    Parameters
    ----------
    s : 1-D array of samples.
    fa : sampling rate used by perfEner / CP2vec and to convert frame indices
        to sample indices (0.03*fa samples per frame).

    Returns
    -------
    numpy string array with one entry per energy frame; '0' marks frames that
    belong to no classified segment. The item width grows to fit the longest
    label, so multi-character labels are not truncated.
    """
    E = perfEner(s, fa)
    fonemas = np.zeros(E.shape[0], dtype=np.str_)
    fonemas[:] = '0'
    if E.size == 0:
        return fonemas

    limiar = E.max()/100
    pz = np.nonzero(np.logical_and(E[1:] > limiar, E[:-1] < limiar))[0]
    nz = np.nonzero(np.logical_and(E[1:] < limiar, E[:-1] > limiar))[0] + 1
    # Drop a falling edge that precedes the first rising edge, and a rising
    # edge that is never closed; guard the indexing against empty edge lists.
    if pz.size and nz.size and nz[0] <= pz[0]:
        nz = nz[1:]
    if pz.size and nz.size and nz[-1] <= pz[-1]:
        pz = pz[:-1]
    if pz.size == 0 or nz.size == 0:
        return fonemas  # no complete speech interval
    pfala = np.stack((pz, nz))

    segs, _ = segmenta(E, pfala)
    n_segs = segs.shape[1]
    Ordem = int(np.round(0.003*fa))
    P2 = np.zeros((n_segs, 150))
    analisavel = np.zeros(n_segs, dtype=bool)
    for i in range(n_segs):
        # Frame indices -> sample indices.
        na = np.round(segs[0,i]*fa*0.03).astype(int)
        N = np.round((segs[1,i]-segs[0,i])*fa*0.03).astype(int)
        janela = int(np.round(N/3))
        if janela-Ordem-1 < 1:
            # Window too short to build even one prediction equation
            # (e.g. a zero-width segment); leave these frames as '0'.
            continue
        marcas = [0, int(np.round((N-janela)/2))-1, N-janela]
        CP = np.zeros((Ordem,3))
        S = np.zeros((janela-Ordem-1,Ordem+1))
        for cont, k in enumerate(marcas):
            saux = s[na+k:na+k+janela]
            for m in range(Ordem+1):
                S[:,m] = saux[m:m-Ordem-1]
            CP[:,cont] = np.linalg.pinv(S[:,:-1]).dot(S[:,-1])
        P2[i,] = CP2vec(CP, fa)
        analisavel[i] = True
    if not analisavel.any():
        return fonemas

    x1 = torch.tensor(P2[analisavel], dtype=torch.float32, device=device)
    with torch.no_grad():  # inference only: no autograd graph needed
        yp = model(x1)
    inds = yp.argmax(dim=1).cpu().numpy()
    rotulos = [str(r) for r in F0.values[inds].flatten()]

    # A '<U1' array would silently cut longer labels to their first character.
    largura = max(len(r) for r in rotulos)
    if largura > 1:
        fonemas = fonemas.astype(f'<U{largura}')
    for rotulo, (ini, fim) in zip(rotulos, segs[:, analisavel].T):
        fonemas[ini:fim] = rotulo
    return fonemas

In [49]:
def filtrarSilencios(s, fa):
    """Shorten over-long pauses between speech intervals of a recording.

    Speech intervals are delimited on the energy profile (perfEner) where it
    crosses 1 % of its maximum. With `fmed` the mean interval length in
    frames, every pause longer than 2*fmed frames keeps only its first
    2*int(fmed) frames; the samples from there up to the start frame of the
    next interval are removed.

    Parameters
    ----------
    s : 1-D numpy array of samples.
    fa : sampling rate; one energy frame step is round(0.03*fa) samples.

    Returns
    -------
    A new array with the removed samples dropped. When no complete speech
    interval is found, an unmodified copy of the input is returned.
    """
    flags = np.ones(len(s), dtype=bool)
    E = perfEner(s, fa)
    if E.size:
        passo = np.round(0.03*fa).astype(int)
        limiar = E.max()/100
        pz = np.nonzero(np.logical_and(E[1:] > limiar, E[:-1] < limiar))[0]
        nz = np.nonzero(np.logical_and(E[1:] < limiar, E[:-1] > limiar))[0] + 1
        # Drop unpaired edges; guard the indexing against empty edge lists.
        if pz.size and nz.size and nz[0]<=pz[0]: nz = nz[1:]
        if pz.size and nz.size and nz[-1]<=pz[-1]: pz = pz[:-1]
        if pz.size and nz.size:
            fmed = (nz-pz).mean()
            pausas = pz[1:] - nz[:-1]
            for i in np.nonzero(pausas>fmed*2)[0]:
                # Keep 2*int(fmed) frames of silence after the interval ends.
                ini = (nz[i]+2*int(fmed))*passo
                fin = int(pz[i+1])*passo
                flags[ini:fin] = False
    return s[flags]

In [10]:
class mlpFon(nn.Module):
    """One-hidden-layer perceptron ending in a softmax over `dim2` classes.

    Input of shape (batch, ...) is flattened to (batch, raw_dim), passed
    through Linear(raw_dim, dim1) + Tanh and Linear(dim1, dim2), then
    normalised with Softmax along dim 1.
    """

    def __init__(self, raw_dim, dim1, dim2):
        super().__init__()
        # The attribute name and the position of each layer define the
        # state_dict keys (all_layers.1.*, all_layers.3.*) expected by
        # saved checkpoints, so both must stay as they are.
        camadas = [
            nn.Flatten(start_dim=1),
            nn.Linear(raw_dim, dim1),
            nn.Tanh(),
            nn.Linear(dim1, dim2),
            nn.Softmax(dim=1),
        ]
        self.all_layers = nn.Sequential(*camadas)

    def forward(self, x):
        """Return the softmax outputs (each row sums to 1) for batch `x`."""
        return self.all_layers(x)

In [50]:
class mlpLetras(nn.Module):
    """One-hidden-layer perceptron ending in an element-wise sigmoid.

    Input of shape (batch, ...) is flattened to (batch, raw_dim), passed
    through Linear(raw_dim, dim1) + Tanh and Linear(dim1, dim2), then
    squashed to the range (0, 1) with Sigmoid.
    """

    def __init__(self, raw_dim, dim1, dim2):
        super().__init__()
        # The attribute name and the position of each layer define the
        # state_dict keys (all_layers.1.*, all_layers.3.*) expected by
        # saved checkpoints, so both must stay as they are.
        camadas = [
            nn.Flatten(start_dim=1),
            nn.Linear(raw_dim, dim1),
            nn.Tanh(),
            nn.Linear(dim1, dim2),
            nn.Sigmoid(),
        ]
        self.all_layers = nn.Sequential(*camadas)

    def forward(self, x):
        """Return the sigmoid outputs, in (0, 1), for batch `x`."""
        return self.all_layers(x)

In [11]:
# Phoneme classifier: 150 input features, 150 hidden units, 174 output classes.
raw_dim = 150
dim1 = 150
dim2 = 174
modelFon = mlpFon(raw_dim, dim1, dim2).to(device)
# map_location remaps the stored tensors to the device selected above, so a
# checkpoint saved on a GPU still loads on a CPU-only machine.
modelFon.load_state_dict(torch.load('modeloMSE.pht', map_location=device))
# Switch to inference mode; as last expression this also displays the model.
modelFon.eval()

mlpFon(
  (all_layers): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=150, out_features=150, bias=True)
    (2): Tanh()
    (3): Linear(in_features=150, out_features=174, bias=True)
    (4): Softmax(dim=1)
  )
)

In [51]:
# Network with a single sigmoid output: 150 input features, 30 hidden units.
raw_dim = 150
dim1 = 30
dim2 = 1
modelLet = mlpLetras(raw_dim, dim1, dim2).to(device)
# map_location remaps the stored tensors to the device selected above, so a
# checkpoint saved on a GPU still loads on a CPU-only machine.
modelLet.load_state_dict(torch.load('modeloLETRAS.pht', map_location=device))
# Switch to inference mode; as last expression this also displays the model.
modelLet.eval()

mlpLetras(
  (all_layers): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=150, out_features=30, bias=True)
    (2): Tanh()
    (3): Linear(in_features=30, out_features=1, bias=True)
    (4): Sigmoid()
  )
)

In [12]:
# Load the two objects stored in files.pkl into B and F (their contents are
# not shown in this notebook section).
# NOTE: pickle.load can execute arbitrary code - only open files you trust.
with open('files.pkl', 'rb') as f:
    B, F = pickle.load(f)

In [13]:
# Load the two objects stored in vects.pkl into vects and rotul (presumably
# feature vectors and their labels - confirm against the code that wrote them).
# NOTE: pickle.load can execute arbitrary code - only open files you trust.
with open('vects.pkl', 'rb') as f:
    vects, rotul = pickle.load(f)

# Linha de produção

In [14]:
# Sampling rate (Hz) that every recording is resampled to.
fa = 16000

# One example recording from each of the Fluente, Silabou and Soletrou folders.
# NOTE: absolute, machine-specific paths - adjust them to run elsewhere.
caminhos = [
    'C:\\Meu Drive\\Doutorado Unicamp\\Projeto\\github\\projeto-leitura\\Audios\\Fluente\\1 - Fluente.wav',
    'C:\\Meu Drive\\Doutorado Unicamp\\Projeto\\github\\projeto-leitura\\Audios\\Silabou\\1-Silabou.wav',
    'C:\\Meu Drive\\Doutorado Unicamp\\Projeto\\github\\projeto-leitura\\Audios\\Soletrou\\1 - Soletrou.wav',
]

sinais = []
for path in caminhos:
    fs, bruto = wavfile.read(path)
    # Original sampling rate and duration in seconds.
    print(fs, bruto.shape[0]/fs)
    # Resample with this file's own rate, so recordings with different
    # sampling rates are each converted correctly.
    sinais.append(signal.resample(bruto, int(bruto.shape[0]*fa/fs)))
s1, s2, s3 = sinais

48000 55.14
48000 60.0
48000 59.7


In [46]:
# Apply filtrarSilencios to the resampled third recording (s3) and keep the result in s.
s = filtrarSilencios(s3, fa)