In [37]:
import os
import numpy as np
from scipy.io import wavfile
from python_speech_features import mfcc
import librosa
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean


data_dir = 'data'

digits = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
operators = ['plus', 'minus', 'multiply']


def _load_mfcc_features(folder):
    """Return one MFCC matrix per WAV file found directly inside ``folder``.

    Entries that do not end in ``.wav`` (case-insensitive) are skipped so that
    stray files in the data directory cannot break ``wavfile.read``. File names
    are processed in sorted order so the template list is the same on every run.

    NOTE(review): the sample rate returned by ``wavfile.read`` is discarded and
    ``mfcc`` is called with its default settings, exactly as the prediction
    code does. Confirm that every recording matches that default rate.
    """
    features = []
    for file_name in sorted(os.listdir(folder)):
        if not file_name.lower().endswith('.wav'):
            continue
        _, samples = wavfile.read(os.path.join(folder, file_name))
        features.append(mfcc(samples.astype(float)))
    return features


# Maps every label (digit or operator name) to its list of MFCC templates.
# Digits are inserted first, then operators, in the order listed above.
audio_data = {}
for category, labels in (('digits', digits), ('operators', operators)):
    for label in labels:
        # Expected layout: <data_dir>/<category>/<label>/*.wav
        audio_data[label] = _load_mfcc_features(os.path.join(data_dir, category, label))



In [2]:
# Display the loaded templates: a dict mapping each label to its list of MFCC
# matrices. NOTE: this echoes every matrix, so the rendered output is long.
audio_data

{'0': [array([[-3.60436534e+01,  0.00000000e+00,  0.00000000e+00, ...,
           6.74820084e-14,  0.00000000e+00,  3.50716881e-14],
         [-3.60436534e+01,  0.00000000e+00,  0.00000000e+00, ...,
           6.74820084e-14,  0.00000000e+00,  3.50716881e-14],
         [-3.60436534e+01,  0.00000000e+00,  0.00000000e+00, ...,
           6.74820084e-14,  0.00000000e+00,  3.50716881e-14],
         ...,
         [ 6.99002658e+00, -6.45910447e+00, -5.95592384e+00, ...,
           1.80692008e+01,  1.36503023e+00, -2.73777583e+00],
         [ 6.80995878e+00, -7.72330755e+00, -8.20001681e+00, ...,
           9.11799889e+00,  2.88767345e+00,  6.43496267e-01],
         [ 6.58270626e+00, -1.14270206e+01, -1.34104159e+01, ...,
           1.49708411e+01,  8.25923811e+00,  3.95900827e+00]]),
  array([[-3.60436534e+01,  0.00000000e+00,  0.00000000e+00, ...,
           6.74820084e-14,  0.00000000e+00,  3.50716881e-14],
         [-3.60436534e+01,  0.00000000e+00,  0.00000000e+00, ...,
           6.7482

In [3]:
def dtw_distance(seq1, seq2):
    """Return the FastDTW distance between two sequences.

    Individual elements are compared with the Euclidean metric. ``fastdtw``
    returns a ``(distance, path)`` pair; only the distance is kept.
    """
    return fastdtw(seq1, seq2, dist=euclidean)[0]

In [4]:

def predict_spoken_word(input_audio):
    """Return the label in ``audio_data`` whose template is closest to the clip.

    The clip is converted to float, turned into MFCC features, and compared
    against every stored template with FastDTW (Euclidean frame distance).
    The label of the template with the strictly smallest distance wins, so on
    a tie the template encountered first is kept.

    Args:
        input_audio: array of audio samples (must support ``astype``).

    Returns:
        The best-matching key of ``audio_data`` (a digit or an operator name),
        or ``None`` when no template produced a finite distance.
    """
    query = mfcc(input_audio.astype(float))

    best_label = None
    best_cost = float('inf')
    for label, templates in audio_data.items():
        for template in templates:
            cost = fastdtw(query, template, dist=euclidean)[0]
            if cost < best_cost:
                best_cost, best_label = cost, label
    return best_label


In [30]:
def predict(audio_path, top_db=30, min_segment_duration=0.2, max_segments=3):
    """Split a recording into spoken segments and label each one.

    The file is loaded at its native sample rate and cut into non-silent
    intervals with ``librosa.effects.split``. The ``max_segments`` longest
    intervals are kept, processed in their original order, and each one that
    lasts at least ``min_segment_duration`` seconds is classified with
    ``predict_spoken_word``.

    Side effect: every classified segment is also written to
    ``segment_<i>.wav`` in the current working directory, where ``<i>`` is the
    index of the interval in the split result.

    NOTE(review): the sample rate is not forwarded to ``predict_spoken_word``,
    whose MFCC extraction therefore runs with its default rate. Confirm that
    the recordings match it.

    Args:
        audio_path: path of the audio file to analyse.
        top_db: silence threshold passed to ``librosa.effects.split``.
        min_segment_duration: minimum length, in seconds, of a kept segment.
        max_segments: how many of the longest segments are considered.

    Returns:
        List of predicted labels in the order the segments occur in the
        recording. It may hold fewer than ``max_segments`` entries.
    """
    audio, sr = librosa.load(audio_path, sr=None)

    speech_segments = librosa.effects.split(audio, top_db=top_db)

    # Rank interval indices by length, longest first. sorted() is stable, so
    # equally long intervals keep their order of appearance.
    by_length = sorted(
        range(len(speech_segments)),
        key=lambda idx: speech_segments[idx][1] - speech_segments[idx][0],
        reverse=True,
    )
    # Back to chronological order for the intervals that made the cut.
    selected = sorted(by_length[:max_segments])

    labels = []
    for i in selected:
        start, end = speech_segments[i]
        segment_audio = audio[start:end]
        # Ignore segments too short to be a spoken word.
        if librosa.get_duration(y=segment_audio, sr=sr) < min_segment_duration:
            continue
        wavfile.write(f'segment_{i}.wav', sr, segment_audio)
        # Classify the in-memory samples directly; reading the file back would
        # only return the samples that were just written.
        labels.append(predict_spoken_word(segment_audio))
    return labels


In [9]:
# Run the recognizer on '1+1.wav' and evaluate the returned tokens.
# NOTE: calculate_result is defined further down in this notebook; that cell
# has to be executed before this one.
x= predict('1+1.wav')
calculate_result(x)

2

In [10]:
# Run the recognizer on '1+2.wav' and evaluate the returned tokens.
x=predict('1+2.wav')
calculate_result(x)

3

In [11]:
# Run the recognizer on '2+2.wav' and evaluate the returned tokens.
x=predict('2+2.wav')
calculate_result(x)

4

In [30]:
# Run the recognizer on '4mult2.wav' and evaluate the returned tokens.
# NOTE(review): the output saved under this cell is a token list rather than a
# number, so it looks stale; re-run the cell to confirm.
x=predict('4mult2.wav')
calculate_result(x)


['4', 'multiply', '2']

In [14]:
# Run the recognizer on '9+7.wav' and evaluate the returned tokens.
x=predict('9+7.wav')
calculate_result(x)

16

In [15]:
# Run the recognizer on '9-7.wav' and evaluate the returned tokens.
x=predict('9-7.wav')
calculate_result(x)

2

In [16]:
# Run the recognizer on '5-1.wav' and evaluate the returned tokens.
x=predict('5-1.wav')
calculate_result(x)

4

In [17]:
# Run the recognizer on '7+2.wav' and evaluate the returned tokens.
x=predict('7+2.wav')
calculate_result(x)

9

In [19]:
# Run the recognizer on '6mult5.wav' and evaluate the returned tokens.
x=predict('6mult5.wav')
calculate_result(x)

30

In [20]:
# Run the recognizer on '8+9.wav' and evaluate the returned tokens.
x=predict('8+9.wav')
calculate_result(x)

17

In [21]:
# Run the recognizer on '8+3.wav' and evaluate the returned tokens.
x=predict('8+3.wav')
calculate_result(x)

11

In [22]:
# Run the recognizer on '3-1.wav' and evaluate the returned tokens.
x=predict('3-1.wav')
calculate_result(x)

2

In [25]:
# Run the recognizer on '1+7.wav' and evaluate the returned tokens.
x=predict('1+7.wav')
calculate_result(x)

8

In [40]:
# Run the recognizer on '4-2.wav' and evaluate the returned tokens.
x=predict('4-2.wav')
calculate_result(x)


2

In [38]:
# Run the recognizer on '6-5.wav' and evaluate the returned tokens.
x=predict('6-5.wav')
calculate_result(x)


1

In [27]:
# Run the recognizer on 'Speech1+1.wav' and evaluate the returned tokens.
x= predict('Speech1+1.wav')
calculate_result(x)

2

In [31]:
# Run the recognizer on '3mult4.wav' and evaluate the returned tokens.
x =predict('3mult4.wav')
calculate_result(x)

12

In [32]:
# Display the token list currently held in x.
x

['3', 'multiply', '4']

In [33]:
# Run the recognizer on '4mult3.wav' and evaluate the returned tokens.
# NOTE: no output was saved for this cell. calculate_result returns None when
# the middle token is not one of its operator names, and the token list shown
# in the following cell has a digit in that position.
x= predict('4mult3.wav')
calculate_result(x)

In [34]:
# Display the token list currently held in x.
x

['4', '4', '3']

In [6]:
def calculate_result(expression):
    """Evaluate a recognized ``[operand, operator, operand]`` token sequence.

    Args:
        expression: sequence whose first three items are the left operand, the
            operator name (``'plus'``, ``'minus'`` or ``'multiply'``) and the
            right operand. Operands are converted with ``int()``. Items after
            the third are ignored.

    Returns:
        The integer result, or ``None`` when the tokens do not form a valid
        expression: fewer than three tokens, an unknown operator name, or an
        operand that cannot be converted to an integer.
    """
    operations = {
        'plus': lambda left, right: left + right,
        'minus': lambda left, right: left - right,
        'multiply': lambda left, right: left * right,
    }

    # The recognizer can return fewer than three labels; treat that like any
    # other unparseable expression instead of raising IndexError.
    if expression is None or len(expression) < 3:
        return None

    operation = operations.get(expression[1])
    if operation is None:
        return None

    try:
        left, right = int(expression[0]), int(expression[2])
    except (TypeError, ValueError):
        # An operand slot holds a non-numeric label, e.g. an operator name.
        return None
    return operation(left, right)


In [38]:
def load_file():
    """Ask the user for a WAV file and process it on a background thread.

    Opens a file dialog restricted to ``*.wav``. When the user picks a file,
    the progress bar animation is started and ``calculate_result_thread`` is
    run with the chosen path on a new thread. When the dialog is cancelled
    (an empty value is returned) nothing happens.
    """
    file_path = filedialog.askopenfilename(filetypes=[("WAV files", "*.wav")])
    if not file_path:
        # Dialog dismissed without a selection: do not start a worker.
        return

    # Start the animation before launching the worker, so the worker's final
    # progress_bar.stop() can never run ahead of this start().
    progress_bar.start(10)

    thread = threading.Thread(target=calculate_result_thread, args=(file_path,))
    thread.start()


In [39]:
import tkinter as tk
from tkinter import filedialog
import threading
from tqdm import tqdm
import time
from tkinter import ttk

# Top-level window: title, white background and initial size.
window = tk.Tk()
window.title("Voice Calculator")
window.configure(bg="white")
window.geometry("400x300")  # Adjust the width and height as desired

# Colours shared by every text label in the window.
_label_colors = {"bg": "white", "fg": "black"}

# Heading shown at the top of the window.
title_label = tk.Label(window, text="Voice Calculator", font=("Arial", 20), **_label_colors)
title_label.pack(pady=20)

# Button that triggers load_file to pick and process a WAV file.
load_button = tk.Button(
    window,
    text="Load WAV",
    command=load_file,
    font=("Arial", 14),
    bg="gray",
    fg="white",
)
load_button.pack(pady=20)

# Label that shows the recognized expression.
expression_label = tk.Label(window, text="Expression:", font=("Arial", 16), **_label_colors)
expression_label.pack()

# Label that shows the computed result.
result_label = tk.Label(window, text="Result:", font=("Arial", 16), **_label_colors)
result_label.pack()

# Progress bar animated while a file is being processed.
progress_bar = ttk.Progressbar(window, mode='determinate', length=200)
progress_bar.pack(pady=10)

# Note: load_file, the callback that loads and processes the WAV file, is defined in an earlier cell.

# Worker run on a background thread (started by load_file) that recognizes the
# expression in a file and shows the outcome in the window.
def calculate_result_thread(file_path):
    """Recognize and evaluate the expression in ``file_path`` and update the GUI.

    Calls ``predict`` to obtain the token list and ``calculate_result`` to
    evaluate it, then writes both into ``expression_label`` and
    ``result_label``. If either step raises, the error message is shown in the
    result label instead. The progress bar animation is stopped in every case.

    Args:
        file_path: path of the WAV file chosen by the user.
    """
    try:
        expression = predict(file_path)
        result = calculate_result(expression)
    except Exception as exc:
        # Report the failure in the window; otherwise the thread would die
        # silently and leave the progress bar running.
        expression_label.configure(text="Expression:")
        result_label.configure(text="Result: error (" + str(exc) + ")")
    else:
        # Join whatever tokens were recognized, so fewer than three tokens
        # cannot raise IndexError.
        tokens = "".join(str(token) for token in expression)
        expression_label.configure(text="Expression: " + tokens)
        result_label.configure(text="Result: " + str(result))
    finally:
        # Stop the progress bar animation.
        progress_bar.stop()

# Run the main event loop; this call blocks while the GUI is running.
window.mainloop()
