In [None]:
!pip install streamlit librosa tensorflow torchaudio transformers scipy matplotlib scikit-learn librosa resampy
!pip install pyngrok
!pip install pydub
!apt-get update
!apt-get install -y ffmpeg
!pip install pyngrok
!pip install audio-recorder-streamlit
!pip install reportlab
!pip install fpdf2

Collecting streamlit
  Downloading streamlit-1.44.1-py3-none-any.whl.metadata (8.9 kB)
Collecting resampy
  Downloading resampy-0.4.3-py3-none-any.whl.metadata (3.0 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m885.3 kB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_cupti_cu12-12.4.

In [None]:
# Mount Google Drive at /content/drive; the app cell below reads its saved
# models and images from paths under /content/drive/MyDrive/TrueeTone.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
%%writefile trueetone_app.py
import streamlit as st
import numpy as np
import librosa
import librosa.display
import tensorflow as tf
import torch
import torchaudio
import io
import os
from PIL import Image
from scipy.stats import skew, kurtosis, median_abs_deviation
import torch.nn.functional as F
import matplotlib.pyplot as plt
from io import BytesIO
from pydub import AudioSegment
from audio_recorder_streamlit import audio_recorder
from fpdf import FPDF
from datetime import datetime

# Function to load image
def load_image(image_path):
    """Read an image file into memory and return it as a PIL image.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The object returned by ``Image.open`` on the file's bytes, or ``None``
        when the file is empty or any error occurs. Both failure cases are
        reported to the page with ``st.error``.
    """
    try:
        with open(image_path, "rb") as handle:
            raw_bytes = handle.read()
        if raw_bytes:
            # Decode from an in-memory copy so nothing depends on the closed handle.
            return Image.open(io.BytesIO(raw_bytes))
        st.error(f"Image file {image_path} is empty.")
        return None
    except Exception as exc:
        st.error(f"Error loading image {image_path}: {exc}")
        return None

# Load Models (Ensure these paths are correct for your Google Drive)
@st.cache_resource()
def load_models():
    """Load the two Keras classifiers and the Wav2Vec2 pipeline.

    Returns:
        A 4-tuple ``(dnn_model, cnn_model, wav2vec_model, bundle)``. If anything
        fails while loading, the error is shown with ``st.error`` and all four
        entries are ``None``.
    """
    try:
        # Paths point into the mounted Google Drive; adjust them if the folder moves.
        dense_net = tf.keras.models.load_model(
            "/content/drive/MyDrive/TrueeTone/Training/Saved Models/pre_trained_dense_model.h5"
        )
        conv_net = tf.keras.models.load_model(
            "/content/drive/MyDrive/TrueeTone/Training/Saved Models/pre_trained_cnn_model.h5"
        )
        w2v_bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
        w2v_model = w2v_bundle.get_model()
    except Exception as exc:
        st.error(f"Error loading models: {exc}")
        return None, None, None, None
    return dense_net, conv_net, w2v_model, w2v_bundle

# Module-level handles used by the prediction functions below. Each of the four
# names is None when load_models() reported a loading error.
dnn_model, cnn_model, wav2vec_model, bundle = load_models()

def convert_to_wav(audio_file):
    """Decode an audio source with pydub and re-encode it as WAV in memory.

    Args:
        audio_file: Anything ``AudioSegment.from_file`` accepts (callers in this
            file pass a ``BytesIO`` of the raw audio bytes).

    Returns:
        A ``BytesIO`` rewound to the start of the WAV data, or ``None`` after
        reporting the failure with ``st.error``.
    """
    wav_buffer = BytesIO()
    try:
        AudioSegment.from_file(audio_file).export(wav_buffer, format="wav")
        wav_buffer.seek(0)
    except Exception as exc:
        st.error(f"Error converting audio to WAV: {exc}")
        return None
    return wav_buffer

# Prediction Functions
def predict_dnn(audio_file_content, file_name):
    """Classify audio with the dense model using time-averaged MFCC features.

    Args:
        audio_file_content: Raw bytes of the audio (any format pydub can decode).
        file_name: Name used to build the temporary WAV path ``/tmp/<file_name>.wav``.

    Returns:
        ``"AI"`` or ``"Human"``, or ``None`` if conversion or prediction fails
        (failures are reported with ``st.error``).
    """
    labels = ["AI", "Human"]
    try:
        wav_buffer = convert_to_wav(io.BytesIO(audio_file_content))
        if wav_buffer is None:
            return None

        # Persist the converted WAV so librosa can load it from disk.
        wav_path = f"/tmp/{file_name}.wav"
        with open(wav_path, "wb") as wav_out:
            wav_out.write(wav_buffer.read())

        signal, rate = librosa.load(wav_path, res_type="kaiser_fast")
        mfcc_matrix = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=40)
        # Average each of the 40 coefficients over time, then add a batch axis.
        model_input = np.mean(mfcc_matrix.T, axis=0).reshape(1, -1)

        scores = dnn_model.predict(model_input)
        return labels[np.argmax(scores[0])]
    except Exception as exc:
        st.error(f"Error in DNN prediction: {exc}")
        return None

def predict_cnn(audio_file_content, file_name):
    """Classify audio with the CNN model using time-averaged MFCC features.

    Args:
        audio_file_content: Raw bytes of the audio (any format pydub can decode).
        file_name: Name used to build the temporary WAV path ``/tmp/<file_name>.wav``.

    Returns:
        ``"AI"`` or ``"Human"``, or ``None`` if conversion or prediction fails
        (failures are reported with ``st.error``).
    """
    labels = ["AI", "Human"]
    try:
        wav_buffer = convert_to_wav(io.BytesIO(audio_file_content))
        if wav_buffer is None:
            return None

        # Persist the converted WAV so librosa can load it from disk.
        wav_path = f"/tmp/{file_name}.wav"
        with open(wav_path, "wb") as wav_out:
            wav_out.write(wav_buffer.read())

        signal, rate = librosa.load(wav_path, res_type="kaiser_fast")
        mfcc_matrix = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=40)
        mfcc_mean = np.mean(mfcc_matrix.T, axis=0)

        # The model is fed a single sample shaped (1, 40, 1, 1).
        model_input = mfcc_mean.reshape(1, 40, 1, 1)

        scores = cnn_model.predict(model_input)
        return labels[np.argmax(scores[0])]
    except Exception as exc:
        st.error(f"Error in CNN prediction: {exc}")
        return None

def extract_features(audio_file_content, file_name, bundle, model):
    """Build a standardised feature array from the model's ``extract_features`` output.

    Args:
        audio_file_content: Raw bytes of the audio (any format pydub can decode).
        file_name: Name used to build the temporary WAV path ``/tmp/<file_name>.wav``.
        bundle: Pipeline bundle; only its ``sample_rate`` is read here.
        model: Model exposing ``extract_features(waveform)``.

    Returns:
        A NumPy array of pooled features, normalised to zero mean and unit
        standard deviation over the whole array, or ``None`` on failure
        (failures are reported with ``st.error``).
    """
    try:
        wav_buffer = convert_to_wav(io.BytesIO(audio_file_content))
        if wav_buffer is None:
            return None

        # Persist the converted WAV so torchaudio can load it from disk.
        wav_path = f"/tmp/{file_name}.wav"
        with open(wav_path, "wb") as wav_out:
            wav_out.write(wav_buffer.read())

        waveform, source_rate = torchaudio.load(wav_path)
        target_rate = bundle.sample_rate
        if source_rate != target_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=source_rate, new_freq=target_rate)
            waveform = resampler(waveform)

        with torch.inference_mode():
            layer_outputs, _ = model.extract_features(waveform)

        # For every 3-D output: swap the last two axes, keep only the first item
        # along axis 0, and average-pool the last axis down to length 1.
        # Presumably each output is (batch, time, feature); verify against torchaudio.
        pooled = [
            F.adaptive_avg_pool1d(layer.permute(0, 2, 1)[0].unsqueeze(0), 1).squeeze(0)
            for layer in layer_outputs
            if layer.dim() == 3
        ]

        stacked = torch.cat(pooled, dim=0).numpy()
        # Global z-score; the small epsilon guards against a zero standard deviation.
        return (stacked - np.mean(stacked)) / (np.std(stacked) + 1e-10)
    except Exception as exc:
        st.error(f"Error extracting features: {exc}")
        return None

def additional_features(features):
    """Compute the median absolute deviation and an entropy-style score.

    Args:
        features: Numeric array, or ``None``.

    Returns:
        ``(mad, entropy)``, or ``(None, None)`` when ``features`` is ``None``.
        ``mad`` is ``median_abs_deviation(features)``. ``entropy`` is
        ``-sum(x * log(x))`` evaluated after clipping every value to at least
        ``1e-10`` so the logarithm is always defined.
    """
    if features is None:
        return None, None
    floored = np.clip(features, 1e-10, None)
    entropy = -np.sum(floored * np.log(floored))
    return median_abs_deviation(features), entropy

def classify_audio(features, entropy_threshold=200):
    """Label a feature array as "Human" or "AI" from its entropy score.

    The decision uses only the entropy returned by ``additional_features``.
    The mean, variance, skewness and kurtosis that were previously computed here
    were never used, and indexing the skew/kurtosis results with ``[0]`` raised
    on 1-D input, so those computations have been removed.

    Args:
        features: Feature array, or ``None``.
        entropy_threshold: Scores strictly above this value are labelled
            ``"Human"``; everything else is ``"AI"``. Defaults to 200.

    Returns:
        ``(label, entropy)``, or ``(None, None)`` when no features or no entropy
        are available.
    """
    if features is None:
        return None, None
    _, entropy = additional_features(features)
    if entropy is None:
        return None, None
    label = "Human" if entropy > entropy_threshold else "AI"
    return label, entropy

def predict_wav2vec(audio_file_content, file_name, bundle, model):
    """Run feature extraction and entropy classification for one audio clip.

    Args:
        audio_file_content: Raw bytes of the audio.
        file_name: Name forwarded to ``extract_features`` for its temp file.
        bundle: Pipeline bundle forwarded to ``extract_features``.
        model: Model forwarded to ``extract_features``.

    Returns:
        ``(prediction, entropy)`` from ``classify_audio``, or ``(None, None)``
        when features could not be extracted or an error occurred (errors are
        reported with ``st.error``).
    """
    try:
        extracted = extract_features(audio_file_content, file_name, bundle, model)
        if extracted is None:
            return None, None
        label, score = classify_audio(extracted)
        return label, score
    except Exception as exc:
        st.error(f"Error in Wav2Vec prediction: {exc}")
        return None, None

# Spectrogram Function
def plot_mel_spectrogram(audio_file_content, file_name):
    """Render the clip's mel spectrogram (in dB) on the Streamlit page.

    Args:
        audio_file_content: Raw bytes of the audio (any format pydub can decode).
        file_name: Name used to build the temporary WAV path ``/tmp/<file_name>.wav``.

    Returns:
        ``None``. Errors are reported with ``st.error``. The matplotlib figure
        is handed to ``st.pyplot`` and is not closed by this function.
    """
    try:
        wav_buffer = convert_to_wav(io.BytesIO(audio_file_content))
        if wav_buffer is None:
            return

        # Persist the converted WAV so librosa can load it from disk.
        wav_path = f"/tmp/{file_name}.wav"
        with open(wav_path, "wb") as wav_out:
            wav_out.write(wav_buffer.read())

        samples, rate = librosa.load(wav_path)
        mel_db = librosa.power_to_db(
            librosa.feature.melspectrogram(y=samples, sr=rate),
            ref=np.max,
        )

        figure, axes = plt.subplots()
        mesh = librosa.display.specshow(mel_db, x_axis='time', y_axis='mel', sr=rate, fmax=8000, ax=axes)
        figure.colorbar(mesh, ax=axes, format='%+2.0f dB')
        axes.set(title='Mel-frequency spectrogram')
        st.pyplot(figure)

    except Exception as exc:
        st.error(f"Error plotting spectrogram: {exc}")

class PDF(FPDF):
    """FPDF subclass whose header paints a page background and a border frame."""

    def header(self):
        """Fill a 210 x 297 area with a cream colour and outline a frame inset by 5.

        NOTE(review): the numbers match an A4 page in millimetres, which assumes
        the FPDF default page format and unit; confirm if either is changed.
        """
        page_w, page_h, inset = 210, 297, 5
        self.set_fill_color(255, 253, 208)
        self.rect(0, 0, page_w, page_h, style='F')
        self.rect(inset, inset, page_w - 2 * inset, page_h - 2 * inset)

# Function to generate PDF report
def generate_report(file_name, duration, file_type, dnn_prediction, cnn_prediction, wav2vec_prediction, entropy, final_description):
    """Build the PDF analysis report and return the path it was written to.

    Args:
        file_name: Name of the analysed audio file; shown in the report and used
            (base name only) to build the output path.
        duration: Clip length in seconds.
        file_type: File extension; printed upper-cased.
        dnn_prediction: Label from the dense model (may be ``None``).
        cnn_prediction: Label from the CNN model (may be ``None``).
        wav2vec_prediction: Label from the Wav2Vec analysis (may be ``None``).
        entropy: Entropy score of the Wav2Vec analysis, or ``None`` when that
            analysis failed (printed as ``N/A``).
        final_description: Summary paragraph for the "Final Description" section.

    Returns:
        Path of the generated PDF, ``/tmp/<base name>_report.pdf``.
    """

    def _latin1(value):
        """Replace characters the built-in PDF fonts cannot encode (non Latin-1) with '?'."""
        return str(value).encode("latin-1", "replace").decode("latin-1")

    pdf = PDF()
    pdf.add_page()

    # Add Logo (skipped when the image is missing so the report is still produced)
    logo_path = "/content/drive/MyDrive/TrueeTone/Training/Images/logo.png"
    logo_width = 20
    logo_x = (210 - logo_width) / 2
    if os.path.exists(logo_path):
        pdf.image(logo_path, x=logo_x, y=10, w=logo_width)
    pdf.ln(25)

    # Set Title
    pdf.set_font("Helvetica", "BI", 18)
    pdf.cell(200, 10, txt="TrueeTone: Audio Authenticity Detection", ln=True, align='C')
    pdf.set_font("Arial", 'I', 14)
    pdf.cell(200, 10, txt="Audio Analysis Report", ln=True, align='C')
    pdf.ln(5)

    pdf.set_font("Arial", 'B', 8)
    # Take one timestamp so date and time cannot straddle midnight.
    generated_at = datetime.now()
    current_date = generated_at.strftime("%Y-%m-%d")  # Format: YYYY-MM-DD
    current_time = generated_at.strftime("%H:%M:%S")  # Format: HH:MM:SS
    pdf.cell(200, 10, txt=f"Report Generated On: Date: {current_date}, Time: {current_time}", ln=True, align='C')
    pdf.ln(5)

    # Audio File Details
    pdf.line(10, pdf.get_y(), 200, pdf.get_y())
    pdf.set_font("Arial", 'B', 14)
    pdf.cell(200, 10, txt="Details on Audio File", ln=True, align='C')
    pdf.line(10, pdf.get_y(), 200, pdf.get_y())
    pdf.set_font("Arial", size=12)
    pdf.cell(200, 10, txt=_latin1(f"File Name: {file_name}"), ln=True)
    pdf.cell(200, 10, txt=f"Duration: {duration:.2f} seconds", ln=True)
    pdf.cell(200, 10, txt=_latin1(f"File Type: {str(file_type).upper()}"), ln=True)
    pdf.ln(5)
    pdf.line(10, pdf.get_y(), 200, pdf.get_y())

    # Predictions
    pdf.set_font("Arial", 'B', 14)
    pdf.cell(200, 10, txt="Predictions", ln=True, align='C')
    pdf.line(10, pdf.get_y(), 200, pdf.get_y())
    pdf.set_font("Arial", size=12)
    # Entropy is None when the Wav2Vec analysis failed; formatting it with .2f would raise.
    entropy_text = f"{entropy:.2f}" if entropy is not None else "N/A"
    pdf.cell(200, 10, txt=_latin1(f"DNN Prediction: {dnn_prediction}"), ln=True)
    # Each row reports its own model's result (the CNN row used to print the Wav2Vec label).
    pdf.cell(200, 10, txt=_latin1(f"CNN Prediction: {cnn_prediction}"), ln=True)
    pdf.cell(200, 10, txt=_latin1(f"Wav2Vec Prediction: {wav2vec_prediction} (Entropy: {entropy_text})"), ln=True)
    pdf.ln(5)
    pdf.line(10, pdf.get_y(), 200, pdf.get_y())

    # Plots Section
    pdf.set_font("Arial", 'B', 14)
    pdf.cell(200, 10, txt="Audio Data Analysis", ln=True, align='C')
    pdf.line(10, pdf.get_y(), 200, pdf.get_y())

    # Only plots that exist on disk are embedded.
    plot_paths = [
        ("/tmp/audio_waveform.png", "Waveform"),
        ("/tmp/mel_spectrogram.png", "Mel Spectrogram"),
        ("/tmp/mfcc.png", "MFCC Features")
    ]
    image_width = 100
    x_centered = (210 - image_width) / 2
    pdf.ln(5)
    for path, title in plot_paths:
        if os.path.exists(path):
            pdf.image(path, x=x_centered, w=image_width)
            pdf.set_font("Arial", 'I', 12)
            pdf.cell(200, 10, txt=title, ln=True, align='C')
            pdf.ln(5)
    pdf.line(10, pdf.get_y(), 200, pdf.get_y())

    # Final Description
    pdf.set_font("Arial", 'B', 14)
    pdf.cell(200, 10, txt="Final Description", ln=True, align='C')
    pdf.line(10, pdf.get_y(), 200, pdf.get_y())
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, txt=_latin1(final_description))
    pdf.ln(10)

    # Importance of Audio Authenticity Detection (blue heading and body text)
    pdf.line(10, pdf.get_y(), 200, pdf.get_y())
    pdf.set_text_color(0, 0, 169)  # Heading colour (blue)
    pdf.set_font("Arial", 'B', 14)
    pdf.cell(200, 10, txt="Importance of Audio Authenticity Detection", ln=True, align='C')
    pdf.line(10, pdf.get_y(), 200, pdf.get_y())  # Separator line
    pdf.set_text_color(0, 0, 139)  # Body colour (dark blue)
    pdf.set_font("Arial", "", 12)
    importance_text = """With the rise of AI-generated content, ensuring the authenticity of audio recordings has become critical. Misinformation, deepfake technology, and synthetic speech pose serious threats to security, journalism, and personal identity. Detecting AI-generated audio helps prevent fraud, protect intellectual property, and maintain trust in digital communications. TrueeTone's advanced models empower users to verify the authenticity of voice recordings, making digital interactions safer and more transparent."""
    pdf.multi_cell(0, 10, txt=importance_text)
    pdf.ln(10)

    # Save the PDF. Only the base name of the (user-supplied) file name is used,
    # so the report always lands directly inside /tmp.
    safe_name = os.path.basename(str(file_name)) or "audio"
    pdf_output = f"/tmp/{safe_name}_report.pdf"
    pdf.output(pdf_output)
    return pdf_output


# Home Page
def home_page():
    """Render the static Home page.

    Emits, in order: the title, the Introduction text, a banner image, the
    Overview text, the logo, the Core Technology text, the two model-accuracy
    images, and the How It Works and Outcomes sections. All images are read from
    fixed paths under /content/drive/MyDrive/TrueeTone/Training/Images.
    """

    st.title("🔊 TrueeTone: Audio Authenticity Detection")

    st.markdown("""
    ## Introduction
    In the age of artificial intelligence, distinguishing between human and AI-generated voices has become a significant challenge.
    With the rise of deepfake technology and synthetic speech, there is an increasing need for reliable detection mechanisms.
    **TrueeTone: Audio Authenticity Detection System** is an advanced AI-ML-powered solution designed to differentiate between real human voices and AI-generated audio with high accuracy.
    """)

    # Add Audio Authenticity Photo
    st.image("/content/drive/MyDrive/TrueeTone/Training/Images/bg.jpeg")

    st.markdown("""
    ## Overview
    TrueeTone is built using an ensemble classification approach, leveraging multiple machine learning models to ensure robust and precise results.
    By analyzing unique audio features, the system provides a comprehensive authenticity evaluation for any given audio clip.
    The application is designed to be user-friendly, allowing individuals, researchers, and organizations to verify the legitimacy of voice recordings effortlessly.
    """)

    st.image("/content/drive/MyDrive/TrueeTone/Training/Images/logo.png")

    st.markdown("""
    ## Core Technology
    The system employs the following models to detect AI-generated voices:
    1. **Dense Neural Network (DNN) with MFCC Features** – Extracts Mel-Frequency Cepstral Coefficients (MFCC) features and uses a dense neural network to classify the audio as real or fake.
    2. **Convolutional Neural Network (CNN) with MFCC Features** – Enhances feature extraction capabilities using CNN layers for improved accuracy.
    3. **Pretrained Wav2Vec-960h Model** – A state-of-the-art model that analyzes entropy values to make precise predictions about audio authenticity.
    """)

    # Add Model Accuracy Graphs
    st.image("/content/drive/MyDrive/TrueeTone/Training/Images/ModelAccCNN.png", caption="CNN Model Accuracy")
    st.image("/content/drive/MyDrive/TrueeTone/Training/Images/ModelAccDNN.png", caption="Dense Model Accuracy")

    st.markdown("""
    ## How It Works
    Users can interact with TrueeTone through a Streamlit-based web application, which provides:
    - **Audio Upload & Recording**: Users can either upload an audio file or record their voice directly in the app.
    - **Mel Spectrogram Visualization**: A visual representation of the audio signal to understand its frequency distribution.
    - **Multi-Model Predictions**: Displays results from all three models, highlighting the most confident prediction (typically Wav2Vec-based analysis).
    """)

    st.markdown("""
    ## Outcomes
    TrueeTone serves as a crucial tool in combating the spread of AI-generated misinformation in audio content.
    By combining deep learning with advanced signal processing, the system offers a reliable and efficient way to authenticate voice recordings.
    As synthetic voice technology evolves, TrueeTone will continue to adapt, ensuring transparency and trust in digital communications.
    """)

# Prediction Page
def prediction_page():
    """Render the Prediction page: take audio by upload or recording and analyse it.

    The chosen audio is played back with ``st.audio`` and its raw bytes are
    passed to ``process_audio`` together with a file name (the uploaded file's
    name, or ``"recorded_audio.wav"`` for recordings).
    """
    st.title("🎙️ Audio Authenticity Prediction")

    # Audio Input Options
    option = st.radio("Choose an option:", ("Upload Audio File", "Record Audio"))

    if option == "Upload Audio File":
        audio_file = st.file_uploader("Upload Audio File", type=["wav", "mp3", "ogg"])
        if audio_file is not None:
            # getvalue() returns the whole upload regardless of the current read
            # position, so playback and analysis always see the same bytes.
            uploaded_bytes = audio_file.getvalue()
            # Use the upload's own MIME type; mp3/ogg files are not "audio/wav".
            st.audio(uploaded_bytes, format=audio_file.type or "audio/wav")
            process_audio(uploaded_bytes, audio_file.name)

    elif option == "Record Audio":
        st.write("Click the button below to start recording:")
        # Center the microphone: only the wider middle column is used.
        _, center_col, _ = st.columns([1, 2, 1])
        with center_col:
            audio_bytes = audio_recorder()
        if audio_bytes:
            st.audio(audio_bytes, format="audio/wav")
            process_audio(audio_bytes, "recorded_audio.wav")

def process_audio(audio_file_content, file_name):
    """Analyse one audio clip, show the results, and offer a PDF report.

    Steps: validate that the audio converts to WAV, show its mel spectrogram,
    run the DNN, CNN and Wav2Vec predictors, show the verdict (taken from the
    Wav2Vec result), and, when "Generate Report" is pressed, render the report
    plots, build the PDF and expose it through a download button.

    Args:
        audio_file_content: Raw bytes of the uploaded or recorded audio.
        file_name: Name associated with the audio; forwarded to the predictors
            and used for the report and download file names.
    """
    # Convert to WAV up front: bail out early if the audio cannot be decoded.
    wav_audio = convert_to_wav(io.BytesIO(audio_file_content))
    if wav_audio is None:
        return

    # Display Mel Spectrogram
    st.subheader("Mel Spectrogram")
    # The script is re-run on every interaction; close any pyplot figure the
    # plotting helper leaves open so figures do not pile up between runs.
    figures_before = set(plt.get_fignums())
    plot_mel_spectrogram(audio_file_content, file_name)
    for figure_number in set(plt.get_fignums()) - figures_before:
        plt.close(figure_number)

    # Predictions
    st.subheader("Predictions")

    dnn_prediction = predict_dnn(audio_file_content, file_name)

    cnn_prediction = predict_cnn(audio_file_content, file_name)

    wav2vec_prediction, entropy = predict_wav2vec(audio_file_content, file_name, bundle, wav2vec_model)

    if dnn_prediction:
        st.write(f"DNN Prediction: {dnn_prediction}")

    if cnn_prediction:
        # Show the CNN's own result (this line used to print the Wav2Vec label).
        st.write(f"CNN Prediction: {cnn_prediction}")

    if wav2vec_prediction:
        st.write(f"Wav2Vec Prediction: {wav2vec_prediction} (Entropy: {entropy:.2f})")

    # Best Prediction (Based on Wav2Vec)
    st.subheader("Best Prediction")
    if wav2vec_prediction == "AI":
        st.warning("This audio is likely AI-generated.")
        final_description = "The audio is likely AI-generated based on the entropy value and model predictions."
    elif wav2vec_prediction == "Human":
        st.success("This audio is likely Human-generated.")
        final_description = "The audio is likely Human-generated based on the entropy value and model predictions."
    else:
        st.write("Unable to determine audio authenticity.")
        final_description = "Unable to determine audio authenticity based on the provided data."

    # Generate Report
    if st.button("Generate Report"):
        try:
            # Decode the in-memory WAV once; this avoids depending on a temp
            # file that other functions happen to have written.
            wav_audio.seek(0)
            y, sr = librosa.load(wav_audio)

            _save_report_plots(y, sr)

            pdf_path = generate_report(
                file_name=file_name,
                duration=librosa.get_duration(y=y, sr=sr),
                file_type=file_name.split(".")[-1],
                dnn_prediction=dnn_prediction,
                cnn_prediction=cnn_prediction,
                wav2vec_prediction=wav2vec_prediction,
                entropy=entropy,
                final_description=final_description
            )

            with open(pdf_path, "rb") as pdf_file:
                pdf_bytes = pdf_file.read()
        except Exception as e:
            # Same reporting style as the rest of the app: show the error, do not crash the page.
            st.error(f"Error generating report: {e}")
            return

        # Provide download link for the report
        st.download_button(
            label="Download Report",
            data=pdf_bytes,
            file_name=f"{file_name}_report.pdf",
            mime="application/pdf"
        )


def _save_report_plots(y, sr):
    """Write the waveform, mel-spectrogram and MFCC plots to the PNG paths the report embeds.

    Every figure is created explicitly and closed afterwards, so nothing depends
    on which pyplot figure happens to be current.
    """
    # Waveform
    fig, ax = plt.subplots()
    try:
        ax.plot(y)
        ax.set(title="Audio Waveform", xlabel="Time", ylabel="Amplitude")
        fig.savefig("/tmp/audio_waveform.png")
    finally:
        plt.close(fig)

    # Mel spectrogram (dB relative to the peak)
    mel_db = librosa.power_to_db(librosa.feature.melspectrogram(y=y, sr=sr), ref=np.max)
    fig, ax = plt.subplots()
    try:
        img = librosa.display.specshow(mel_db, x_axis='time', y_axis='mel', sr=sr, fmax=8000, ax=ax)
        fig.colorbar(img, ax=ax, format='%+2.0f dB')
        ax.set(title='Mel-frequency spectrogram')
        fig.savefig("/tmp/mel_spectrogram.png")
    finally:
        plt.close(fig)

    # MFCCs
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
    fig, ax = plt.subplots(figsize=(10, 4))
    try:
        img = librosa.display.specshow(mfccs, x_axis='time', ax=ax)
        fig.colorbar(img, ax=ax)
        ax.set(title='MFCC')
        fig.tight_layout()
        fig.savefig("/tmp/mfcc.png")
    finally:
        plt.close(fig)

# Main App
def main():
    """Draw the sidebar page selector and render the page the user picked."""
    st.sidebar.title("Dashboard")
    page_renderers = {"Home": home_page, "Prediction": prediction_page}
    selected_page = st.sidebar.selectbox("Select Page", list(page_renderers))

    render = page_renderers.get(selected_page)
    if render is not None:
        render()

# Entry point: build the UI when this file is executed as a script.
if __name__ == "__main__":
    main()

Writing trueetone_app.py


In [None]:
# Start the Streamlit app in the background. All of its output is redirected to
# /dev/null, so start-up errors are not visible in the notebook.
# The tunnel cell below forwards to local port 8501.
!streamlit run trueetone_app.py &>/dev/null&

In [None]:
import os
from getpass import getpass

from pyngrok import ngrok  # installed by the setup cell at the top of the notebook

# SECURITY: never hard-code the ngrok authtoken in the notebook. It is read from
# the NGROK_AUTHTOKEN environment variable, with an interactive prompt as the
# fallback. Any token that was previously committed here should be revoked.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_authtoken)

# Terminate open tunnels if any
ngrok.kill()

# Start a new ngrok tunnel to the local port the Streamlit app is served on
public_url = ngrok.connect(addr='8501', proto='http')
print("Public URL:", public_url)

Public URL: NgrokTunnel: "https://f12c-34-72-154-183.ngrok-free.app" -> "http://localhost:8501"
