In [1]:
import numpy as np
import pandas as pd
import scipy.io.wavfile as wavfile
import os
from sklearn import linear_model

In [2]:
#
# We will keep in 1/4th or 25% of the original audio file. The remaining 
# percent of the file will be generated via linear extrapolation.
#
Provided_Portion = 0.25



# List to append all the audio files
zero = []


#
# Looping through the dataset and loading up all 50 of the 0_jackson*.wav
# files 
# .read() returns a tuple 

for filename in os.listdir('free-spoken-digit-dataset-master/recordings'):
    if filename.startswith('0_jackson'):
        sample = os.path.join('free-spoken-digit-dataset-master/recordings', filename)
        sample_rate, audio_data = wavfile.read(sample)
        zero.append(audio_data)
# Understanding what zero looks like
print("Contents of list zero: ",zero)

Contents of list zero:  [array([-369, -431, -475, ...,  301,  324,  304], dtype=int16), array([-311,  -91, -140, ...,  378,  357,  333], dtype=int16), array([-314, -303, -332, ..., -355, -343, -322], dtype=int16), array([347, 351, 462, ..., 365, 338, 302], dtype=int16), array([-336,  160,   65, ..., -315, -343, -319], dtype=int16), array([ 354,  442,  610, ..., -312, -336, -333], dtype=int16), array([ 397,  531,  638, ..., -357, -386, -353], dtype=int16), array([ 382,  459,  530, ..., -254, -301, -309], dtype=int16), array([-393,   54,  -71, ...,  319,  340,  313], dtype=int16), array([-311, -363, -318, ..., -239, -305, -304], dtype=int16), array([-316, -336, -342, ..., -442, -424, -310], dtype=int16), array([ 335,  392,  481, ..., -302, -314, -304], dtype=int16), array([-361, -226, -238, ..., -286, -311, -343], dtype=int16), array([-309, -323, -333, ..., -301, -300, -304], dtype=int16), array([ 305,  305,  294, ..., -379, -342, -300], dtype=int16), array([ 342,  452,  546, ..., -356, 

In [3]:
print("Number of audio files in list zero: ", len(zero))
for i in range(len(zero)):
    print("Length of audio clip ",i," :" ,len(zero[i]))

Number of audio files in list zero:  50
Length of audio clip  0  : 5148
Length of audio clip  1  : 4261
Length of audio clip  2  : 5451
Length of audio clip  3  : 4914
Length of audio clip  4  : 4087
Length of audio clip  5  : 4716
Length of audio clip  6  : 4982
Length of audio clip  7  : 5110
Length of audio clip  8  : 4423
Length of audio clip  9  : 4237
Length of audio clip  10  : 5235
Length of audio clip  11  : 5103
Length of audio clip  12  : 4257
Length of audio clip  13  : 4970
Length of audio clip  14  : 4797
Length of audio clip  15  : 4826
Length of audio clip  16  : 4720
Length of audio clip  17  : 5136
Length of audio clip  18  : 4942
Length of audio clip  19  : 4663
Length of audio clip  20  : 5165
Length of audio clip  21  : 5144
Length of audio clip  22  : 4621
Length of audio clip  23  : 4788
Length of audio clip  24  : 5120
Length of audio clip  25  : 4571
Length of audio clip  26  : 4939
Length of audio clip  27  : 5318
Length of audio clip  28  : 5266
Length of aud

Since the lengths are different, we will hard chop all audio clips to be of the same length.

In [4]:
# Dropping all Nans on the Y axis here and converting the dataset into an
# NDArray 
zero = pd.DataFrame(data = zero, dtype = np.int16)
zero.dropna(axis=1, inplace = True)
zero = zero.values

# 'zero' is currently shaped [n_samples, n_audio_samples],
n_audio_samples = zero.shape[1]
print(n_audio_samples)
print(zero.shape)

4087
(50, 4087)


In [5]:
#
# Creating the linear regression model
model = linear_model.LinearRegression()

from sklearn.utils.validation import check_random_state
rng   = check_random_state(7)  
random_idx = rng.randint(zero.shape[0])
test  = zero[random_idx]
train = np.delete(zero, [random_idx], axis=0)

# 
# Printing out the shape of train, and the shape of test
print(train.shape, test.shape)

(49, 4087) (4087,)


In [6]:
#
# Saving the original 'test' clip part 
wavfile.write('Original Test Clip.wav', sample_rate, test)

In [7]:
# Grabbing the FIRST Provided_Portion * n_audio_samples audio features 
# from test
X_test = test[:int(Provided_Portion*n_audio_samples)]

# Grabbing the *remaining* audio features and storing it in y_test.
y_test = test[int(Provided_Portion*n_audio_samples):]


# 
# Duplicating the same above process for X_train, y_train.
X_train = train[:,:int(Provided_Portion*n_audio_samples)]
y_train = train[:,int(Provided_Portion*n_audio_samples):]


# .reshape(1, -1) turns [n_features] into [1, n_features].
# .reshape(-1, 1) turns [n_samples] into [n_samples, 1].
X_test = X_test.reshape(1,-1)
y_test = y_test.reshape(1,-1)

In [8]:
#
# Fitting model using training data and label
model.fit(X_train, y_train)

# 
# Using the model to predict the 'label' of X_test. 
y_test_prediction = model.predict(X_test)


y_test_prediction = y_test_prediction.astype(dtype=np.int16)


# Checking the accuracy score
score = model.score(X_test, y_test)
print ("Extrapolation R^2 Score: ", score)


#
# Taking the first Provided_Portion portion of the test clip and stitching that
# together with the abomination the predictor model generated
# and then saving the completed audio clip
completed_clip = np.hstack((X_test, y_test_prediction))
wavfile.write('Extrapolated Clip.wav', sample_rate, completed_clip[0])

Extrapolation R^2 Score:  0.0
