In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Audio Difference Detection: Replacement Demo\n",
    "\n",
    "This notebook demonstrates how to detect a replaced word between two audio clips by comparing their mel-spectrograms frame by frame and aligning the frames with the longest common subsequence (LCS) algorithm."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "import io\n",
    "import os\n",
    "import tempfile\n",
    "\n",
    "import IPython.display as ipd\n",
    "import librosa\n",
    "import librosa.display\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import soundfile as sf\n",
    "from gtts import gTTS\n",
    "from matplotlib.patches import ConnectionPatch"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Create Audio Samples\n",
    "\n",
    "First, let's create two audio samples that are identical except for one replaced word (\"the big cat\" vs. \"the small cat\")."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "def text_to_audio(text, sr=16000):\n",
    "    \"\"\"Synthesize `text` with Google TTS and return the decoded waveform.\n",
    "\n",
    "    gTTS produces MP3 data. It is written to a temporary file instead of an\n",
    "    in-memory buffer because librosa can only fall back to its audioread\n",
    "    decoder when it is given a filesystem path; a file-like object is decoded\n",
    "    by soundfile alone, which fails when the installed libsndfile has no MP3\n",
    "    support.\n",
    "\n",
    "    Args:\n",
    "        text: Text to speak (English voice).\n",
    "        sr: Sample rate in Hz that the decoded audio is resampled to.\n",
    "\n",
    "    Returns:\n",
    "        The waveform samples returned by `librosa.load` at `sr` Hz.\n",
    "\n",
    "    Note:\n",
    "        gTTS calls the Google TTS web service, so network access is required.\n",
    "    \"\"\"\n",
    "    tts = gTTS(text=text, lang='en')\n",
    "    # The directory (and the MP3 inside it) is removed as soon as decoding ends.\n",
    "    with tempfile.TemporaryDirectory() as tmp_dir:\n",
    "        mp3_path = os.path.join(tmp_dir, 'tts.mp3')\n",
    "        tts.save(mp3_path)\n",
    "        y, _ = librosa.load(mp3_path, sr=sr)\n",
    "    return y\n",
    "\n",
    "# Shared sample rate (Hz) and the 200 ms gap inserted between words\n",
    "sr = 16000\n",
    "silence = np.zeros(int(0.2 * sr))\n",
    "\n",
    "# Synthesize every word once; only the middle word differs between the clips\n",
    "y1, y2_original, y2_modified, y3 = (\n",
    "    text_to_audio(word, sr=sr) for word in (\"the\", \"big\", \"small\", \"cat\")\n",
    ")\n",
    "\n",
    "# Assemble both sentences with silence separating the words\n",
    "original, modified = (\n",
    "    np.concatenate([y1, silence, middle_word, silence, y3])\n",
    "    for middle_word in (y2_original, y2_modified)\n",
    ")\n",
    "\n",
    "# Persist both clips as WAV files so they can be played back\n",
    "sf.write('original.wav', original, sr)\n",
    "sf.write('modified.wav', modified, sr)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Listen to the Audio Samples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Show a caption followed by an audio player for each saved clip\n",
    "for clip_caption, clip_path in (\n",
    "    (\"Original Audio:\", 'original.wav'),\n",
    "    (\"\\nModified Audio:\", 'modified.wav'),\n",
    "):\n",
    "    print(clip_caption)\n",
    "    ipd.display(ipd.Audio(clip_path))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Extract Features\n",
    "\n",
    "Convert audio to mel-spectrograms for comparison."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "def compute_features(audio, sample_rate=None, n_mels=128, hop_length=512, win_length=2048):\n",
    "    \"\"\"Convert audio to log-scaled (dB) mel-spectrogram features.\n",
    "\n",
    "    Args:\n",
    "        audio: Waveform samples to analyse.\n",
    "        sample_rate: Sample rate of `audio` in Hz. When omitted, the\n",
    "            notebook-level `sr` is used, so existing `compute_features(audio)`\n",
    "            calls behave exactly as before.\n",
    "        n_mels: Number of mel bands.\n",
    "        hop_length: Number of samples between successive frames.\n",
    "        win_length: Analysis window length in samples.\n",
    "\n",
    "    Returns:\n",
    "        The mel-spectrogram converted to dB by `librosa.power_to_db`; later\n",
    "        cells treat each column as one frame.\n",
    "\n",
    "    Note:\n",
    "        `ref=np.max` expresses each clip relative to its own peak, so the same\n",
    "        passage can get different dB values in two clips whose peaks differ.\n",
    "    \"\"\"\n",
    "    if sample_rate is None:\n",
    "        # Fall back to the global defined alongside the audio clips.\n",
    "        sample_rate = sr\n",
    "    mel_spec = librosa.feature.melspectrogram(\n",
    "        y=audio,\n",
    "        sr=sample_rate,\n",
    "        n_mels=n_mels,\n",
    "        hop_length=hop_length,\n",
    "        win_length=win_length,\n",
    "    )\n",
    "    return librosa.power_to_db(mel_spec, ref=np.max)\n",
    "\n",
    "# Run both clips through the same feature pipeline\n",
    "features_original, features_modified = (\n",
    "    compute_features(clip) for clip in (original, modified)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Visualize Mel-Spectrograms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# One panel per clip, drawn on explicit Axes rather than pyplot's implicit state.\n",
    "# librosa.display is a submodule that must be imported explicitly on older librosa.\n",
    "fig, axes = plt.subplots(1, 2, figsize=(15, 5))\n",
    "\n",
    "for ax, panel_features, panel_title in (\n",
    "    (axes[0], features_original, 'Original Audio'),\n",
    "    (axes[1], features_modified, 'Modified Audio'),\n",
    "):\n",
    "    img = librosa.display.specshow(panel_features, sr=sr, x_axis='time', y_axis='mel', ax=ax)\n",
    "    ax.set_title(panel_title)\n",
    "    # Attach each colorbar to its own panel so the dB scale sits next to its image\n",
    "    fig.colorbar(img, ax=ax, format='%+2.0f dB')\n",
    "\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Calculate Distance Matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "def compute_distance_matrix(features1, features2):\n",
    "    \"\"\"Calculate frame-to-frame distances\"\"\"\n",
    "    distance_matrix = np.zeros((features1.shape[1], features2.shape[1]))\n",
    "    for i in range(features1.shape[1]):\n",
    "        for j in range(features2.shape[1]):\n",
    "            distance_matrix[i,j] = np.linalg.norm(features1[:,i] - features2[:,j])\n",
    "    return distance_matrix\n",
    "\n",
    "# Compute distance matrix\n",
    "distance_matrix = compute_distance_matrix(features_original, features_modified)\n",
    "\n",
    "# Visualize it on an explicit Axes: rows are original frames, columns are modified frames\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "heatmap = ax.imshow(distance_matrix, aspect='auto', origin='lower')\n",
    "fig.colorbar(heatmap, ax=ax, label='Distance')\n",
    "ax.set_title('Frame-to-Frame Distance Matrix')\n",
    "ax.set_xlabel('Modified Audio Frames')\n",
    "ax.set_ylabel('Original Audio Frames')\n",
    "# End with show() so the cell output is the figure only, not a Text(...) repr\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Find Matching Segments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "def find_matches(distance_matrix, threshold=None):\n",
    "    \"\"\"Find matching segments using LCS algorithm\"\"\"\n",
    "    if threshold is None:\n",
    "        threshold = np.mean(distance_matrix) + np.std(distance_matrix)\n",
    "    \n",
    "    m, n = distance_matrix.shape\n",
    "    dp = np.zeros((m+1, n+1))\n",
    "    \n",
    "    for i in range(1, m+1):\n",
    "        for j in range(1, n+1):\n",
    "            if distance_matrix[i-1,j-1] < threshold:\n",
    "                dp[i,j] = dp[i-1,j-1] + 1\n",
    "            else:\n",
    "                dp[i,j] = max(dp[i-1,j], dp[i,j-1])\n",
    "    \n",
    "    return dp, threshold\n",
    "\n",
    "# Find matches\n",
    "dp_matrix, threshold = find_matches(distance_matrix)\n",
    "\n",
    "# Visualize the LCS table on an explicit Axes\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "heatmap = ax.imshow(dp_matrix, aspect='auto', origin='lower')\n",
    "fig.colorbar(heatmap, ax=ax, label='Match Length')\n",
    "ax.set_title('Matching Segments Matrix')\n",
    "ax.set_xlabel('Modified Audio Frames')\n",
    "ax.set_ylabel('Original Audio Frames')\n",
    "# End with show() so the cell output is the figure only, not a Text(...) repr\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Visualize Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "def _lcs_frame_pairs(dp):\n",
    "    \"\"\"Backtrack an LCS table into its matched (original_frame, modified_frame) pairs.\"\"\"\n",
    "    pairs = []\n",
    "    i, j = dp.shape[0] - 1, dp.shape[1] - 1\n",
    "    while i > 0 and j > 0:\n",
    "        if dp[i, j] == dp[i - 1, j]:\n",
    "            i -= 1\n",
    "        elif dp[i, j] == dp[i, j - 1]:\n",
    "            j -= 1\n",
    "        else:\n",
    "            # The cell exceeds both neighbours, so it was filled by a match.\n",
    "            pairs.append((i - 1, j - 1))\n",
    "            i -= 1\n",
    "            j -= 1\n",
    "    return pairs[::-1]\n",
    "\n",
    "\n",
    "def plot_waveforms_with_matches(original, modified, matches, hop_length=512, max_links=40):\n",
    "    \"\"\"Plot both waveforms and connect the frames matched by the LCS alignment.\n",
    "\n",
    "    Args:\n",
    "        original: Samples of the reference clip.\n",
    "        modified: Samples of the edited clip.\n",
    "        matches: LCS table as returned by `find_matches` (one more row and\n",
    "            column than there are frames), or `None` to draw the waveforms only.\n",
    "        hop_length: Samples per feature frame, used to map frame indices back\n",
    "            to sample positions; must equal the hop used for the features.\n",
    "        max_links: Approximate upper bound on the number of connection lines.\n",
    "\n",
    "    Returns:\n",
    "        None. The figure is created on the current pyplot backend.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If `matches` is given but is not 2-D.\n",
    "    \"\"\"\n",
    "    fig, (ax_original, ax_modified) = plt.subplots(2, 1, figsize=(15, 6))\n",
    "\n",
    "    # Plot original waveform\n",
    "    ax_original.plot(original)\n",
    "    ax_original.set_title('Original Audio')\n",
    "    ax_original.set_ylabel('Amplitude')\n",
    "\n",
    "    # Plot modified waveform\n",
    "    ax_modified.plot(modified)\n",
    "    ax_modified.set_title('Modified Audio')\n",
    "    ax_modified.set_ylabel('Amplitude')\n",
    "    ax_modified.set_xlabel('Samples')\n",
    "\n",
    "    fig.tight_layout()\n",
    "\n",
    "    if matches is None:\n",
    "        return\n",
    "    dp = np.asarray(matches)\n",
    "    if dp.ndim != 2:\n",
    "        raise ValueError('matches must be a 2-D LCS table')\n",
    "    pairs = _lcs_frame_pairs(dp)\n",
    "    if not pairs:\n",
    "        return\n",
    "\n",
    "    # Thin the alignment so that roughly `max_links` lines are drawn at most.\n",
    "    stride = max(1, -(-len(pairs) // max(1, max_links)))\n",
    "    # Links run from the bottom edge of the top panel to the top edge of the bottom one.\n",
    "    link_start_y = ax_original.get_ylim()[0]\n",
    "    link_end_y = ax_modified.get_ylim()[1]\n",
    "    for frame_original, frame_modified in pairs[::stride]:\n",
    "        link = ConnectionPatch(\n",
    "            xyA=(frame_original * hop_length, link_start_y),\n",
    "            xyB=(frame_modified * hop_length, link_end_y),\n",
    "            coordsA='data',\n",
    "            coordsB='data',\n",
    "            axesA=ax_original,\n",
    "            axesB=ax_modified,\n",
    "            color='gray',\n",
    "            alpha=0.5,\n",
    "            linewidth=0.8,\n",
    "        )\n",
    "        fig.add_artist(link)\n",
    "\n",
    "# Draw both waveforms, passing the LCS table computed in the previous step\n",
    "plot_waveforms_with_matches(original=original, modified=modified, matches=dp_matrix)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}