In [2]:
# Connect to gdrive
# Mounts Google Drive at /content/drive; the Drive paths used by the following
# cells all start with this mount point. `google.colab` is Colab-specific.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Import Library
import os
import sys
import numpy as np
import pandas as pd
import math

import matplotlib.pyplot as plt
import seaborn as sns

import librosa
import soundfile as sf
from scipy.signal import lfilter
from copy import deepcopy
from tqdm import tqdm

sys.path.append('/content/drive/My Drive/[3] Model ML TBCare/')
import speechproc

# **Metadata Exploration**

## **Load Metadata**

In [4]:
# Read the metadata CSV into `df` and display it as the cell output.
df = pd.read_csv("/content/drive/My Drive/[3] Model ML TBCare/Dataset Google + CIDRZ Health AI Evaluation Zambia/Metadata and Codebook/GHAI_Final_Data_2023.csv")
df

Unnamed: 0,sex,consent_obtained,barcode,coughdur,cough_productive,haemoptysis,chestpain,shortbreath,fever,ngtsweats,...,tb_class_index,tb_predictions,tb_predictions1,age,meters_height,bmi,smear_all,type_tb,ground_truth_tb,facility_code
0,m,1,01-399-0258,1-2wks,yes,no,yes,yes,no,yes,...,1,0.654549,0.345451,31,1.62,24.0,neg,No TBdx,neg,Kan
1,m,1,01-399-1081,<1wk,yes,no,yes,yes,yes,yes,...,1,0.438112,0.561888,58,1.64,14.0,neg,No TBdx,neg,Kan
2,m,1,02-399-0856,<1wk,yes,yes,yes,yes,yes,yes,...,1,0.414009,0.585991,24,1.83,16.0,neg,No TBdx,neg,Cha
3,f,1,03-399-0290,1-2wks,yes,no,yes,no,no,yes,...,1,0.607053,0.392947,25,1.70,21.0,neg,No TBdx,neg,Chai
4,m,1,01-399-0744,<1wk,yes,yes,yes,yes,yes,yes,...,1,0.401624,0.598376,30,1.70,20.0,neg,No TBdx,neg,Kan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1823,f,1,02-399-0239,no cough,,,no,no,no,no,...,1,0.846709,0.153291,24,1.58,26.0,neg,No TBdx,neg,Cha
1824,f,1,02-399-0214,<1wk,yes,no,yes,yes,yes,no,...,1,0.842927,0.157073,21,1.56,22.0,neg,No TBdx,neg,Cha
1825,m,1,01-399-0973,<1wk,yes,no,yes,no,yes,yes,...,1,0.783166,0.216834,47,1.70,21.0,neg,No TBdx,neg,Kan
1826,m,1,02-399-0711,>4weeks,yes,yes,yes,yes,yes,no,...,1,0.764668,0.235332,23,1.65,20.0,neg,No TBdx,neg,Cha


In [5]:
# Count how many rows carry each `ground_truth_tb` value (class balance check).
count_values = df["ground_truth_tb"].value_counts()
print(count_values)

ground_truth_tb
neg    1623
pos     205
Name: count, dtype: int64


# **Pre-Processing**

## **Segmentasi**

In [None]:
# Chainda South
# === Path configuration ===
BASE_PATH = "/content/drive/My Drive/[3] Model ML TBCare/Dataset Google + CIDRZ Health AI Evaluation Zambia"
METADATA_PATH = os.path.join(BASE_PATH, "Metadata and Codebook/GHAI_Final_Data_2023.csv")

# Input folder (relative to BASE_PATH) -> name of the output sub-folder that
# receives the segments cut from that folder.
INPUT_LABELS = {
    "Audio-Recorder-Chainda-South": "Audio Rec",
    "Chainda South Phone A": "Phone A",
    "Chainda South Phone B": "Phone B",
    "Chainda South Phone C": "Phone C"
}
OUTPUT_ROOT = "1_Chainda South Segmented"

# Create the output tree up front: <output root>/<label>/<pos|neg>.
full_output_path = os.path.join(BASE_PATH, OUTPUT_ROOT)
if not os.path.exists(full_output_path):
    os.makedirs(full_output_path)

for label in INPUT_LABELS.values():
    for tb_class in ("pos", "neg"):
        os.makedirs(os.path.join(full_output_path, label, tb_class), exist_ok=True)

# Load the metadata and build the barcode -> ground_truth_tb lookup.
# A missing CSV is reported and leaves the lookup empty instead of raising.
try:
    metadata_df = pd.read_csv(METADATA_PATH)
except FileNotFoundError:
    print(f"❌ Error: File metadata tidak ditemukan di {METADATA_PATH}")
    id_to_tb_label = {}
else:
    id_to_tb_label = dict(zip(metadata_df['barcode'], metadata_df['ground_truth_tb']))
    print("✅ Metadata berhasil dimuat.")

# === Fungsi VAD ===
def getVad(data, fs):
    """Run the voice-activity pipeline built from ``speechproc`` helpers.

    The steps are: ``speechproc.sflux`` on the raw signal, a thresholded mask
    of its first output, ``speechproc.pitchblockdetect``, a first-order filter
    of the signal, zeroing of the sample ranges reported by
    ``speechproc.snre_highenergy``, and finally ``speechproc.snre_vad``.

    Args:
        data: Audio samples; passed to ``speechproc.sflux`` and ``lfilter``.
        fs: Sampling rate forwarded to ``speechproc.sflux``.

    Returns:
        The value returned by ``speechproc.snre_vad``. Presumably one 0/1
        decision per analysis frame (the caller compares entries to 0 and 1);
        confirm against ``speechproc``.
    """
    # Presumably window length and frame shift in seconds, plus FFT size
    # (inferred from the original names winlen/ovrlen/nftt) -- TODO confirm.
    frame_len, frame_shift, n_fft = 0.025, 0.01, 512
    flux_threshold = 0.5
    vad_threshold = 0.4
    block_opts = 1
    energy_floor = np.exp(-50)

    flux, flen, fsh10, nfr10 = speechproc.sflux(data, fs, frame_len, frame_shift, n_fft)

    # 1 where the sflux output is at or below the threshold, 0 elsewhere.
    low_flux = np.zeros(nfr10)
    low_flux[np.less_equal(flux, flux_threshold)] = 1
    blocks = speechproc.pitchblockdetect(low_flux, deepcopy(flux), nfr10, block_opts)

    # First-order IIR filter; the numerator has its zero at DC.
    filtered = lfilter(
        np.array([0.9770, -0.9770]), np.array([1.0000, -0.9540]), data, axis=0
    )

    noise_spans, _, noise_count = speechproc.snre_highenergy(
        filtered, nfr10, flen, fsh10, energy_floor, low_flux, blocks
    )
    # Silence every reported sample range (end index inclusive) before the
    # final decision pass.
    for row in range(noise_count):
        first = round(noise_spans[row, 0])
        last = round(noise_spans[row, 1])
        filtered[first: last + 1] = 0

    return speechproc.snre_vad(
        filtered, nfr10, flen, fsh10, energy_floor, low_flux, blocks, vad_threshold
    )

# === Segmentasi Audio ===
def segment_audio(file_path):
    """Cut one recording into the stretches that ``getVad`` flags as active.

    The file is loaded as mono at 22050 Hz and passed to ``getVad``. Every run
    of consecutive frames whose flag equals 1 becomes one segment. The samples
    are shared out evenly across the frames (``len(X) // n_frames`` samples per
    frame), so frame ``k`` covers samples ``[k * len_sector, (k + 1) * len_sector)``.

    Unlike the previous sample-by-sample loop, a run that is still active on
    the last frame is kept instead of being dropped, and the last frame itself
    is taken into account.

    Args:
        file_path: Path of the audio file to load.

    Returns:
        tuple: ``(segments, sample_rate)``. ``segments`` is a 1-D object array
        with one 1-D sample array per active run, in order of occurrence
        (length 0 when nothing is flagged). ``sample_rate`` is the rate
        returned by ``librosa.load``.
    """
    X, sample_rate = librosa.load(file_path, sr=22050, mono=True)

    # ravel() accepts both a flat flag vector and an (n, 1) column.
    voiced = np.asarray(getVad(X, sample_rate)).ravel() == 1
    if voiced.size == 0:
        return np.empty(0, dtype=object), sample_rate

    # Loop-invariant: number of samples attributed to each frame.
    len_sector = len(X) // voiced.size

    # Pad with False on both sides so runs touching either end of the signal
    # still produce a start edge and a stop edge.
    padded = np.concatenate(([False], voiced, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    run_starts, run_stops = edges[::2], edges[1::2]  # frame ranges [start, stop)

    # Fill a pre-sized object array so the result stays 1-D even when all
    # segments happen to have the same length.
    segments = np.empty(len(run_starts), dtype=object)
    for k, (first, stop) in enumerate(zip(run_starts, run_stops)):
        segments[k] = X[first * len_sector: stop * len_sector]

    return segments, sample_rate

# === Main Processing ===
def main():
    """Segment every labelled .wav file of the configured input folders.

    For each ``INPUT_LABELS`` entry the folder under ``BASE_PATH`` is listed.
    Each ``.wav`` file whose name without extension is a key of
    ``id_to_tb_label`` is split with ``segment_audio``, and every non-empty
    segment is written as float32 to
    ``<full_output_path>/<label>/<tb label>/<name>_seg<i>.wav``.

    Missing folders and files without a label are reported and skipped. Any
    exception raised while processing a file is printed, then the loop moves
    on to the next file.
    """
    for source_dir, device in INPUT_LABELS.items():
        source_path = os.path.join(BASE_PATH, source_dir)

        if not os.path.exists(source_path):
            print(f"⚠️ Folder tidak ditemukan: {source_path}")
            continue

        print(f"🔍 Memproses folder: {source_path}")
        for fname in tqdm(os.listdir(source_path)):
            if not fname.lower().endswith(".wav"):
                continue

            # The file name without extension is the metadata barcode.
            barcode = os.path.splitext(fname)[0]
            tb_label = id_to_tb_label.get(barcode, 'Unknown')
            if tb_label == 'Unknown':
                print(f"❓ Label TBC untuk {barcode} tidak ditemukan. Melewati file.")
                continue

            wav_path = os.path.join(source_path, fname)
            try:
                pieces, rate = segment_audio(wav_path)
                for index, piece in enumerate(pieces):
                    if len(piece) != 0:
                        samples = np.asarray(piece, dtype=np.float32)
                        target = os.path.join(
                            full_output_path, device, tb_label, f"{barcode}_seg{index}.wav"
                        )
                        sf.write(target, samples, rate)
            except Exception as exc:
                # Best effort: report the failure and continue with the next file.
                print(f"❌ Error di {fname}: {exc}")

# Entry point: run the segmentation pass defined by main().
if __name__ == "__main__":
    main()

In [None]:
# Chawama
# === Path configuration ===
BASE_PATH = "/content/drive/My Drive/[3] Model ML TBCare/Dataset Google + CIDRZ Health AI Evaluation Zambia"
METADATA_PATH = os.path.join(BASE_PATH, "Metadata and Codebook/GHAI_Final_Data_2023.csv")

# Input folder (relative to BASE_PATH) -> name of the output sub-folder that
# receives the segments cut from that folder.
INPUT_LABELS = {
    "Audio-Recorder-Chawama": "Audio Rec",
    "Chawama Phone A": "Phone A",
    "Chawama Phone B": "Phone B",
    "Chawama Phone C": "Phone C"
}
OUTPUT_ROOT = "1_Chawama Segmented"

# Create the output tree up front: <output root>/<label>/<pos|neg>.
full_output_path = os.path.join(BASE_PATH, OUTPUT_ROOT)
if not os.path.exists(full_output_path):
    os.makedirs(full_output_path)

for label in INPUT_LABELS.values():
    for tb_class in ("pos", "neg"):
        os.makedirs(os.path.join(full_output_path, label, tb_class), exist_ok=True)

# Load the metadata and build the barcode -> ground_truth_tb lookup.
# A missing CSV is reported and leaves the lookup empty instead of raising.
try:
    metadata_df = pd.read_csv(METADATA_PATH)
except FileNotFoundError:
    print(f"❌ Error: File metadata tidak ditemukan di {METADATA_PATH}")
    id_to_tb_label = {}
else:
    id_to_tb_label = dict(zip(metadata_df['barcode'], metadata_df['ground_truth_tb']))
    print("✅ Metadata berhasil dimuat.")

# === Fungsi VAD ===
def getVad(data, fs):
    """Run the voice-activity pipeline built from ``speechproc`` helpers.

    The steps are: ``speechproc.sflux`` on the raw signal, a thresholded mask
    of its first output, ``speechproc.pitchblockdetect``, a first-order filter
    of the signal, zeroing of the sample ranges reported by
    ``speechproc.snre_highenergy``, and finally ``speechproc.snre_vad``.

    Args:
        data: Audio samples; passed to ``speechproc.sflux`` and ``lfilter``.
        fs: Sampling rate forwarded to ``speechproc.sflux``.

    Returns:
        The value returned by ``speechproc.snre_vad``. Presumably one 0/1
        decision per analysis frame (the caller compares entries to 0 and 1);
        confirm against ``speechproc``.
    """
    # Presumably window length and frame shift in seconds, plus FFT size
    # (inferred from the original names winlen/ovrlen/nftt) -- TODO confirm.
    frame_len, frame_shift, n_fft = 0.025, 0.01, 512
    flux_threshold = 0.5
    vad_threshold = 0.4
    block_opts = 1
    energy_floor = np.exp(-50)

    flux, flen, fsh10, nfr10 = speechproc.sflux(data, fs, frame_len, frame_shift, n_fft)

    # 1 where the sflux output is at or below the threshold, 0 elsewhere.
    low_flux = np.zeros(nfr10)
    low_flux[np.less_equal(flux, flux_threshold)] = 1
    blocks = speechproc.pitchblockdetect(low_flux, deepcopy(flux), nfr10, block_opts)

    # First-order IIR filter; the numerator has its zero at DC.
    filtered = lfilter(
        np.array([0.9770, -0.9770]), np.array([1.0000, -0.9540]), data, axis=0
    )

    noise_spans, _, noise_count = speechproc.snre_highenergy(
        filtered, nfr10, flen, fsh10, energy_floor, low_flux, blocks
    )
    # Silence every reported sample range (end index inclusive) before the
    # final decision pass.
    for row in range(noise_count):
        first = round(noise_spans[row, 0])
        last = round(noise_spans[row, 1])
        filtered[first: last + 1] = 0

    return speechproc.snre_vad(
        filtered, nfr10, flen, fsh10, energy_floor, low_flux, blocks, vad_threshold
    )

# === Segmentasi Audio ===
def segment_audio(file_path):
    """Cut one recording into the stretches that ``getVad`` flags as active.

    The file is loaded as mono at 22050 Hz and passed to ``getVad``. Every run
    of consecutive frames whose flag equals 1 becomes one segment. The samples
    are shared out evenly across the frames (``len(X) // n_frames`` samples per
    frame), so frame ``k`` covers samples ``[k * len_sector, (k + 1) * len_sector)``.

    Unlike the previous sample-by-sample loop, a run that is still active on
    the last frame is kept instead of being dropped, and the last frame itself
    is taken into account.

    Args:
        file_path: Path of the audio file to load.

    Returns:
        tuple: ``(segments, sample_rate)``. ``segments`` is a 1-D object array
        with one 1-D sample array per active run, in order of occurrence
        (length 0 when nothing is flagged). ``sample_rate`` is the rate
        returned by ``librosa.load``.
    """
    X, sample_rate = librosa.load(file_path, sr=22050, mono=True)

    # ravel() accepts both a flat flag vector and an (n, 1) column.
    voiced = np.asarray(getVad(X, sample_rate)).ravel() == 1
    if voiced.size == 0:
        return np.empty(0, dtype=object), sample_rate

    # Loop-invariant: number of samples attributed to each frame.
    len_sector = len(X) // voiced.size

    # Pad with False on both sides so runs touching either end of the signal
    # still produce a start edge and a stop edge.
    padded = np.concatenate(([False], voiced, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    run_starts, run_stops = edges[::2], edges[1::2]  # frame ranges [start, stop)

    # Fill a pre-sized object array so the result stays 1-D even when all
    # segments happen to have the same length.
    segments = np.empty(len(run_starts), dtype=object)
    for k, (first, stop) in enumerate(zip(run_starts, run_stops)):
        segments[k] = X[first * len_sector: stop * len_sector]

    return segments, sample_rate

# === Main Processing ===
def main():
    """Segment every labelled .wav file of the configured input folders.

    For each ``INPUT_LABELS`` entry the folder under ``BASE_PATH`` is listed.
    Each ``.wav`` file whose name without extension is a key of
    ``id_to_tb_label`` is split with ``segment_audio``, and every non-empty
    segment is written as float32 to
    ``<full_output_path>/<label>/<tb label>/<name>_seg<i>.wav``.

    Missing folders and files without a label are reported and skipped. Any
    exception raised while processing a file is printed, then the loop moves
    on to the next file.
    """
    for source_dir, device in INPUT_LABELS.items():
        source_path = os.path.join(BASE_PATH, source_dir)

        if not os.path.exists(source_path):
            print(f"⚠️ Folder tidak ditemukan: {source_path}")
            continue

        print(f"🔍 Memproses folder: {source_path}")
        for fname in tqdm(os.listdir(source_path)):
            if not fname.lower().endswith(".wav"):
                continue

            # The file name without extension is the metadata barcode.
            barcode = os.path.splitext(fname)[0]
            tb_label = id_to_tb_label.get(barcode, 'Unknown')
            if tb_label == 'Unknown':
                print(f"❓ Label TBC untuk {barcode} tidak ditemukan. Melewati file.")
                continue

            wav_path = os.path.join(source_path, fname)
            try:
                pieces, rate = segment_audio(wav_path)
                for index, piece in enumerate(pieces):
                    if len(piece) != 0:
                        samples = np.asarray(piece, dtype=np.float32)
                        target = os.path.join(
                            full_output_path, device, tb_label, f"{barcode}_seg{index}.wav"
                        )
                        sf.write(target, samples, rate)
            except Exception as exc:
                # Best effort: report the failure and continue with the next file.
                print(f"❌ Error di {fname}: {exc}")

# Entry point: run the segmentation pass defined by main().
if __name__ == "__main__":
    main()

In [None]:
# Kanyama
# === Path configuration ===
BASE_PATH = "/content/drive/My Drive/[3] Model ML TBCare/Dataset Google + CIDRZ Health AI Evaluation Zambia"
METADATA_PATH = os.path.join(BASE_PATH, "Metadata and Codebook/GHAI_Final_Data_2023.csv")

# Input folder (relative to BASE_PATH) -> name of the output sub-folder that
# receives the segments cut from that folder.
INPUT_LABELS = {
    "Audio-Recorder-Kanyama": "Audio Rec",
    "Kanyama Phone A": "Phone A",
    "Kanyama Phone B": "Phone B",
    "Kanyama Phone C": "Phone C"
}
OUTPUT_ROOT = "1_Kanyama Segmented"

# Create the output tree up front: <output root>/<label>/<pos|neg>.
full_output_path = os.path.join(BASE_PATH, OUTPUT_ROOT)
if not os.path.exists(full_output_path):
    os.makedirs(full_output_path)

for label in INPUT_LABELS.values():
    for tb_class in ("pos", "neg"):
        os.makedirs(os.path.join(full_output_path, label, tb_class), exist_ok=True)

# Load the metadata and build the barcode -> ground_truth_tb lookup.
# A missing CSV is reported and leaves the lookup empty instead of raising.
try:
    metadata_df = pd.read_csv(METADATA_PATH)
except FileNotFoundError:
    print(f"❌ Error: File metadata tidak ditemukan di {METADATA_PATH}")
    id_to_tb_label = {}
else:
    id_to_tb_label = dict(zip(metadata_df['barcode'], metadata_df['ground_truth_tb']))
    print("✅ Metadata berhasil dimuat.")

# === Fungsi VAD ===
def getVad(data, fs):
    """Run the voice-activity pipeline built from ``speechproc`` helpers.

    The steps are: ``speechproc.sflux`` on the raw signal, a thresholded mask
    of its first output, ``speechproc.pitchblockdetect``, a first-order filter
    of the signal, zeroing of the sample ranges reported by
    ``speechproc.snre_highenergy``, and finally ``speechproc.snre_vad``.

    Args:
        data: Audio samples; passed to ``speechproc.sflux`` and ``lfilter``.
        fs: Sampling rate forwarded to ``speechproc.sflux``.

    Returns:
        The value returned by ``speechproc.snre_vad``. Presumably one 0/1
        decision per analysis frame (the caller compares entries to 0 and 1);
        confirm against ``speechproc``.
    """
    # Presumably window length and frame shift in seconds, plus FFT size
    # (inferred from the original names winlen/ovrlen/nftt) -- TODO confirm.
    frame_len, frame_shift, n_fft = 0.025, 0.01, 512
    flux_threshold = 0.5
    vad_threshold = 0.4
    block_opts = 1
    energy_floor = np.exp(-50)

    flux, flen, fsh10, nfr10 = speechproc.sflux(data, fs, frame_len, frame_shift, n_fft)

    # 1 where the sflux output is at or below the threshold, 0 elsewhere.
    low_flux = np.zeros(nfr10)
    low_flux[np.less_equal(flux, flux_threshold)] = 1
    blocks = speechproc.pitchblockdetect(low_flux, deepcopy(flux), nfr10, block_opts)

    # First-order IIR filter; the numerator has its zero at DC.
    filtered = lfilter(
        np.array([0.9770, -0.9770]), np.array([1.0000, -0.9540]), data, axis=0
    )

    noise_spans, _, noise_count = speechproc.snre_highenergy(
        filtered, nfr10, flen, fsh10, energy_floor, low_flux, blocks
    )
    # Silence every reported sample range (end index inclusive) before the
    # final decision pass.
    for row in range(noise_count):
        first = round(noise_spans[row, 0])
        last = round(noise_spans[row, 1])
        filtered[first: last + 1] = 0

    return speechproc.snre_vad(
        filtered, nfr10, flen, fsh10, energy_floor, low_flux, blocks, vad_threshold
    )

# === Segmentasi Audio ===
def segment_audio(file_path):
    """Cut one recording into the stretches that ``getVad`` flags as active.

    The file is loaded as mono at 22050 Hz and passed to ``getVad``. Every run
    of consecutive frames whose flag equals 1 becomes one segment. The samples
    are shared out evenly across the frames (``len(X) // n_frames`` samples per
    frame), so frame ``k`` covers samples ``[k * len_sector, (k + 1) * len_sector)``.

    Unlike the previous sample-by-sample loop, a run that is still active on
    the last frame is kept instead of being dropped, and the last frame itself
    is taken into account.

    Args:
        file_path: Path of the audio file to load.

    Returns:
        tuple: ``(segments, sample_rate)``. ``segments`` is a 1-D object array
        with one 1-D sample array per active run, in order of occurrence
        (length 0 when nothing is flagged). ``sample_rate`` is the rate
        returned by ``librosa.load``.
    """
    X, sample_rate = librosa.load(file_path, sr=22050, mono=True)

    # ravel() accepts both a flat flag vector and an (n, 1) column.
    voiced = np.asarray(getVad(X, sample_rate)).ravel() == 1
    if voiced.size == 0:
        return np.empty(0, dtype=object), sample_rate

    # Loop-invariant: number of samples attributed to each frame.
    len_sector = len(X) // voiced.size

    # Pad with False on both sides so runs touching either end of the signal
    # still produce a start edge and a stop edge.
    padded = np.concatenate(([False], voiced, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    run_starts, run_stops = edges[::2], edges[1::2]  # frame ranges [start, stop)

    # Fill a pre-sized object array so the result stays 1-D even when all
    # segments happen to have the same length.
    segments = np.empty(len(run_starts), dtype=object)
    for k, (first, stop) in enumerate(zip(run_starts, run_stops)):
        segments[k] = X[first * len_sector: stop * len_sector]

    return segments, sample_rate

# === Main Processing ===
def main():
    """Segment every labelled .wav file of the configured input folders.

    For each ``INPUT_LABELS`` entry the folder under ``BASE_PATH`` is listed.
    Each ``.wav`` file whose name without extension is a key of
    ``id_to_tb_label`` is split with ``segment_audio``, and every non-empty
    segment is written as float32 to
    ``<full_output_path>/<label>/<tb label>/<name>_seg<i>.wav``.

    Missing folders and files without a label are reported and skipped. Any
    exception raised while processing a file is printed, then the loop moves
    on to the next file.
    """
    for source_dir, device in INPUT_LABELS.items():
        source_path = os.path.join(BASE_PATH, source_dir)

        if not os.path.exists(source_path):
            print(f"⚠️ Folder tidak ditemukan: {source_path}")
            continue

        print(f"🔍 Memproses folder: {source_path}")
        for fname in tqdm(os.listdir(source_path)):
            if not fname.lower().endswith(".wav"):
                continue

            # The file name without extension is the metadata barcode.
            barcode = os.path.splitext(fname)[0]
            tb_label = id_to_tb_label.get(barcode, 'Unknown')
            if tb_label == 'Unknown':
                print(f"❓ Label TBC untuk {barcode} tidak ditemukan. Melewati file.")
                continue

            wav_path = os.path.join(source_path, fname)
            try:
                pieces, rate = segment_audio(wav_path)
                for index, piece in enumerate(pieces):
                    if len(piece) != 0:
                        samples = np.asarray(piece, dtype=np.float32)
                        target = os.path.join(
                            full_output_path, device, tb_label, f"{barcode}_seg{index}.wav"
                        )
                        sf.write(target, samples, rate)
            except Exception as exc:
                # Best effort: report the failure and continue with the next file.
                print(f"❌ Error di {fname}: {exc}")

# Entry point: run the segmentation pass defined by main().
if __name__ == "__main__":
    main()