In [1]:
import torch
import speech_recognition as sr
from time import time

In [2]:
# # 📦 Step 1: Import dependencies
# from pydub import AudioSegment
# from pydub.utils import which
# from pathlib import Path

# # 📍 Step 2: Check if FFmpeg is available
# ffmpeg_path = which("ffmpeg")
# if ffmpeg_path is None:
#     print("❌ FFmpeg not found. Please install it and add it to PATH.")
# else:
#     print(f"✅ FFmpeg found at: {ffmpeg_path}")

# # 🛠️ Step 3: Conversion function
# def convert_mp3_to_wav(folder_path):
#     folder = Path(folder_path)
#     mp3_files = list(folder.glob("*.mp3")) + list(folder.glob("*.MP3"))

#     if not mp3_files:
#         print(f"❌ No .mp3 files found in: {folder}")
#         return

#     for mp3_file in mp3_files:
#         print(f"🎧 Converting: {mp3_file.name}")
#         try:
#             wav_file = mp3_file.with_suffix(".wav")
#             audio = AudioSegment.from_mp3(mp3_file)
#             audio = audio.set_channels(1).set_frame_rate(16000)
#             audio.export(wav_file, format="wav")
#             print(f"✅ Converted: {mp3_file.name} → {wav_file.name}")
#         except Exception as e:
#             print(f"⚠️ Error converting {mp3_file.name}: {e}")

# # 🚀 Step 4: Run it on your specific folder
# convert_mp3_to_wav(r"C:\Users\admin\Desktop\BEA-Capstone\data\speech_wpm")

In [3]:
# Transcribe & Compute WPM
import speech_recognition as sr
import pandas as pd
from pathlib import Path

def process_audio_folder(folder_path):
    recognizer = sr.Recognizer()
    results = []
    folder = Path(folder_path)

    wav_files = list(folder.glob("*.wav"))
    print(f"🔍 Found {len(wav_files)} .wav files in: {folder.resolve()}")

    for wav_file in wav_files:
        print(f"🎧 Processing: {wav_file.name}")
        try:
            with sr.AudioFile(str(wav_file)) as source:
                audio = recognizer.record(source)

                # Calculate duration manually (fallback for missing .DURATION)
                duration_sec = len(audio.frame_data) / (audio.sample_rate * audio.sample_width)
                print(f"⏱️ Duration: {duration_sec:.2f} seconds")

            # Taglish transcription using Google Speech API
            text = recognizer.recognize_google(audio, language="en-PH")
            word_count = len(text.split())
            wpm = word_count / (duration_sec / 60)

            results.append({
                "filename": wav_file.name,
                "transcription": text,
                "duration_sec": duration_sec,
                "word_count": word_count,
                "wpm": wpm,
                "label": None  # optional for later ADHD labeling
            })

            print(f"✅ WPM: {wpm:.2f} — {text}")

        except sr.UnknownValueError:
            print(f"❌ No speech recognized in {wav_file.name}")
        except sr.RequestError as e:
            print(f"❌ Google API Error: {e}")
        except Exception as e:
            print(f"⚠️ Error with {wav_file.name}: {e}")

    # Save to CSV
    df = pd.DataFrame(results)
    print(f"\n🧾 Transcribed {len(df)} files.")
    df.to_csv("rapid_talking_data.csv", index=False)
    return df

df = process_audio_folder(r"C:\Users\admin\Desktop\BEA-Capstone\data\speech_wpm")

🔍 Found 163 .wav files in: C:\Users\admin\Desktop\BEA-Capstone\data\speech_wpm
🎧 Processing: r1.wav
⏱️ Duration: 4.09 seconds
✅ WPM: 146.82 — ma tapos na ako oh tignan mo ang ganda diba
🎧 Processing: r10.wav
⏱️ Duration: 3.13 seconds
✅ WPM: 133.98 — kuroko drawing ko ito akong gumawa lahat
🎧 Processing: r100.wav
⏱️ Duration: 3.95 seconds
✅ WPM: 91.22 — bakit siya may sticker ako wala
🎧 Processing: r101.wav
⏱️ Duration: 3.56 seconds
✅ WPM: 84.21 — nakakatakot yung tunog ano yun
🎧 Processing: r102.wav
⏱️ Duration: 3.52 seconds
✅ WPM: 102.27 — naubos ko na pwede dagdag ulit
🎧 Processing: r103.wav
⏱️ Duration: 4.33 seconds
✅ WPM: 138.55 — huwag mo gagalaw niyan ah inayos ko na yan eh
🎧 Processing: r104.wav
⏱️ Duration: 3.82 seconds
✅ WPM: 78.56 — kanina tong ballpen pwede pahiram
🎧 Processing: r105.wav
⏱️ Duration: 3.97 seconds
✅ WPM: 75.60 — ang tagal mag-load ayoko na
🎧 Processing: r106.wav
⏱️ Duration: 3.69 seconds
✅ WPM: 113.80 — gusto ko upo sa lap mo ma
🎧 Processing: r107.wav
⏱️ Dura

In [4]:
df = pd.read_csv("rapid_talking_data.csv")
df["label"] = [1 if wpm > 100 else 0 for wpm in df["wpm"]]
df["label"] = df["label"].astype(int)

print(f"DataFrame shape: {df.shape}")
print(df.head())
print(df.isna().sum()) 

DataFrame shape: (159, 6)
   filename                                transcription  duration_sec  \
0    r1.wav  ma tapos na ako oh tignan mo ang ganda diba      4.086750   
1   r10.wav     kuroko drawing ko ito akong gumawa lahat      3.134688   
2  r100.wav              bakit siya may sticker ako wala      3.946687   
3  r101.wav               nakakatakot yung tunog ano yun      3.562688   
4  r102.wav               naubos ko na pwede dagdag ulit      3.520000   

   word_count         wpm  label  
0          10  146.815930      1  
1           7  133.984648      1  
2           6   91.215735      0  
3           5   84.206094      0  
4           6  102.272727      1  
filename         0
transcription    0
duration_sec     0
word_count       0
wpm              0
label            0
dtype: int64


In [5]:
#Creating Dataset and Model

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

class WPMSpeechDataset(Dataset):
    def __init__(self, df):
        self.data = torch.tensor(df["wpm"].values, dtype=torch.float32).view(-1, 1, 1)
        self.labels = torch.tensor(df["label"].values, dtype=torch.float32).view(-1, 1)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

dataset = WPMSpeechDataset(df)
loader = DataLoader(dataset, batch_size=8, shuffle=True)

class WPMLSTM(nn.Module):
    def __init__(self):
        super(WPMLSTM, self).__init__()
        self.lstm = nn.LSTM(1, 16, batch_first=True)
        self.fc = nn.Linear(16, 1)

    def forward(self, x):
        out, _ = self.lstm(x)
        return torch.sigmoid(self.fc(out[:, -1, :]))

model = WPMLSTM()
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [6]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Subset

# Split dataset into training and validation
train_idx, val_idx = train_test_split(range(len(dataset)), test_size=0.2, shuffle=True, random_state=42)
train_loader = DataLoader(Subset(dataset, train_idx), batch_size=8, shuffle=True)
val_loader = DataLoader(Subset(dataset, val_idx), batch_size=8)

# Updated Training Loop
for epoch in range(20):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for X, y in train_loader:
        optimizer.zero_grad()
        pred = model(X)
        loss = criterion(pred, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Accuracy calculation
        predicted = (pred > 0.5).float()
        correct += (predicted == y).sum().item()
        total += y.size(0)

    acc = correct / total

    # Validation loop
    model.eval()
    val_correct = 0
    val_total = 0
    val_loss = 0

    with torch.no_grad():
        for X_val, y_val in val_loader:
            val_pred = model(X_val)
            val_loss += criterion(val_pred, y_val).item()
            val_predicted = (val_pred > 0.5).float()
            val_correct += (val_predicted == y_val).sum().item()
            val_total += y_val.size(0)

    val_acc = val_correct / val_total
    val_loss /= len(val_loader)

    print(f"Epoch {epoch+1}: "
          f"Train Loss = {total_loss:.4f}, Train Acc = {acc:.4f}, "
          f"Val Loss = {val_loss:.4f}, Val Acc = {val_acc:.4f}")


Epoch 1: Train Loss = 11.1311, Train Acc = 0.4803, Val Loss = 0.6700, Val Acc = 0.9688
Epoch 2: Train Loss = 10.5102, Train Acc = 0.7402, Val Loss = 0.6447, Val Acc = 0.6562
Epoch 3: Train Loss = 10.3916, Train Acc = 0.6929, Val Loss = 0.6487, Val Acc = 0.8438
Epoch 4: Train Loss = 10.0558, Train Acc = 0.7480, Val Loss = 0.6174, Val Acc = 0.7812
Epoch 5: Train Loss = 10.0743, Train Acc = 0.6693, Val Loss = 0.6656, Val Acc = 0.4062
Epoch 6: Train Loss = 9.6919, Train Acc = 0.7244, Val Loss = 0.5872, Val Acc = 0.8750
Epoch 7: Train Loss = 9.9607, Train Acc = 0.6378, Val Loss = 0.6098, Val Acc = 0.6562
Epoch 8: Train Loss = 9.4545, Train Acc = 0.6772, Val Loss = 0.6717, Val Acc = 0.4062
Epoch 9: Train Loss = 9.2919, Train Acc = 0.7087, Val Loss = 0.6440, Val Acc = 0.4688
Epoch 10: Train Loss = 8.6363, Train Acc = 0.7795, Val Loss = 0.5333, Val Acc = 0.8750
Epoch 11: Train Loss = 8.3604, Train Acc = 0.7795, Val Loss = 0.5194, Val Acc = 0.7812
Epoch 12: Train Loss = 7.9200, Train Acc = 0.87

In [7]:
# Save Model
torch.save(model.state_dict(), "taglish_wpm_model.pth")

In [12]:
import speech_recognition as sr
import time

def classify_by_wpm(wpm):
    if wpm < 150:
        return "Least likely to have ADHD"
    elif 150 <= wpm <= 200:
        return "Moderately likely to have ADHD"
    else:
        return "Most likely to have ADHD"

r = sr.Recognizer()
session_text = ""
session_start = time.time()
max_session_duration = 5 * 60  # 5 minutes

with sr.Microphone() as source:
    print("🎙️ Start speaking (5-minute session)...")
    r.adjust_for_ambient_noise(source)

    try:
        while True:
            if time.time() - session_start > max_session_duration:
                break

            print("🎧 Listening for up to 15 seconds...")
            try:
                audio = r.listen(source, timeout=5, phrase_time_limit=15)
                text = r.recognize_google(audio, language="tl-PH")
                session_text += text + " "
                print("✅ Chunk Transcription:", text)
            except sr.WaitTimeoutError:
                print("⏳ Silence detected. Skipping chunk.")
            except sr.UnknownValueError:
                print("⚠️ Couldn't understand that chunk.")
            except sr.RequestError as e:
                print(f"❌ API error: {e}")
                break

    except KeyboardInterrupt:
        print("\n🛑 Manually stopped.")

# 📊 After session ends
end = time.time()
duration = end - session_start
duration_min = duration / 60
word_count = len(session_text.split())
wpm = word_count / duration_min if duration_min > 0 else 0
adhd_result = classify_by_wpm(wpm)

# ✅ Summary Output
print("\n📋 Final Summary:")
print("📄 Full Transcription:\n")
print(session_text.strip())

print("\n📊 Session Stats:")
print(f"⏱️ Duration: {duration:.2f} seconds")
print(f"🔢 Word Count: {word_count}")
print(f"🧮 Words Per Minute (WPM): {wpm:.2f}")
print(f"🧠 ADHD Likelihood: {adhd_result}")

🎙️ Start speaking (5-minute session)...
🎧 Listening for up to 15 seconds...
✅ Chunk Transcription: ang konehot ang pagong Diabetes and the Turtle What's up panatimes Sa isang tahimik na kagubatan ay may dalawang
🎧 Listening for up to 15 seconds...
✅ Chunk Transcription: mabagal ngunit pa tiyaga sa isang araw nag-yabang sa koneho wala pang Mga talo sa akin sa karera
🎧 Listening for up to 15 seconds...
✅ Chunk Transcription: traffic ngumiti lang si pagong
🎧 Listening for up to 15 seconds...
✅ Chunk Transcription: magkaroon na tayo buk
🎧 Listening for up to 15 seconds...
✅ Chunk Transcription: ikoneho ang hamon kinabukasan nagsimula ang
🎧 Listening for up to 15 seconds...
✅ Chunk Transcription: tobilis na mabilis habang sa pagong ay dahan-dahan lang ang bagal niya Sabi ni koneho Matutulog muna ako
🎧 Listening for up to 15 seconds...
✅ Chunk Transcription: katulog habang si Kuneho tuloy-tuloy lang
🎧 Listening for up to 15 seconds...
✅ Chunk Transcription: habang tulog si koneho tuloy-tuloy