In [None]:
# Refresh the apt package index and install the system ffmpeg binary, which a
# later cell invokes to assemble the rendered frames into a video.
!apt-get update
!apt-get install -y ffmpeg
# Install/upgrade the Python packages this notebook relies on.
!pip install -U kaleido reportlab pydub tqdm plotly scipy ffmpeg-python numpy

0% [Working]            Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Ign:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy Release [5,713 B]
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy Release.gpg [793 B]
Get:9 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,032 kB]
Hit:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:13 https://ppa.launchpadcontent.net/ubuntugis/p

In [None]:
# Mount Google Drive at /content/drive (only works inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

# Audio note analysis: spectrum frames and notation PDF

In [None]:
import os
import numpy as np
import tqdm
import plotly.graph_objects as go
import subprocess
from pydub import AudioSegment
from scipy.fftpack import fft
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

# Configuration
FPS = 30  # frames per second; multiplied by the clip duration to get FRAME_COUNT
FFT_WINDOW_SECONDS = 0.25  # length of the audio window fed to each FFT, in seconds
FREQ_MIN = 10  # lower bound of the x-axis range used by plot_fft
FREQ_MAX = 1000  # upper bound of the x-axis range used by plot_fft
TOP_NOTES = 3  # maximum number of distinct note names reported per frame
RESOLUTION = (1920, 1080)  # NOTE(review): not referenced by the active code in this cell — confirm it is still needed
SCALE = 2  # NOTE(review): likewise not referenced by the active code in this cell — confirm
AUDIO_FILE = "flute1.wav"  # Path to your audio file

# Notation mappings: integer index -> swara label.
# NOTE(review): a lookup keyed by `n % 12` can only ever reach keys 0-11, so
# keys 12-19 look unreachable that way — confirm they are needed.
# NOTE(review): the labels do not follow chromatic (semitone) order, e.g. both
# 0 and 7 are "Sa" and "Teevra Ma" appears at both 10 and 16 — verify before
# using this table for pitch-class lookups.
INDIAN_NOTE_NAMES = {
    0: "Sa", 1: "Re", 2: "Ga", 3: "Ma", 4: "Pa", 5: "Dha", 6: "Ni",
    7: "Sa", 8: "komal Re", 9: "komal Ga", 10: "Teevra Ma", 11: "komal Dha", 12: "komal Ni",
    13: "Teevra Sa", 14: "Teevra Re", 15: "Teevra Ga", 16: "Teevra Ma", 17: "Teevra Pa", 18: "Teevra Dha", 19: "Teevra Ni"
}

# Load the audio file and decode it into a flat float sample array.
audio_segment = AudioSegment.from_file(AUDIO_FILE)
fs = audio_segment.frame_rate  # sample rate reported by pydub
audio = np.array(audio_segment.get_array_of_samples(), dtype=float)

# pydub returns multi-channel audio as one flat array with the channels
# interleaved sample by sample.  De-interleave and average down to mono for
# ANY channel count (not only stereo); otherwise the interleaved samples would
# be analysed as if they were a single, faster-running mono signal.
if audio_segment.channels > 1:
    audio = audio.reshape(-1, audio_segment.channels).mean(axis=1)

FRAME_STEP = (fs / FPS)  # samples per video frame (may be fractional)
FFT_WINDOW_SIZE = int(fs * FFT_WINDOW_SECONDS)  # samples per FFT window
AUDIO_LENGTH = len(audio) / fs  # clip duration in seconds

def plot_fft(p, xf, fs, notes, dimensions=(960,540)):
    """Build a Plotly figure of one frame's spectrum with its note labels.

    Args:
        p: Y values to plot (one magnitude per entry of ``xf``).
        xf: X values to plot (the frequency of each bin).
        fs: Accepted for call compatibility; not used inside this function.
        notes: Iterable of ``[frequency, name, magnitude]`` entries; each one
            is drawn as a text annotation.
        dimensions: ``(width, height)`` passed to the layout.

    Returns:
        The assembled ``go.Figure``.
    """
    fig_width, fig_height = dimensions[0], dimensions[1]
    fig = go.Figure(
        layout=go.Layout(
            title="Frequency Spectrum",
            autosize=False,
            width=fig_width,
            height=fig_height,
            xaxis_title="Frequency (note)",
            yaxis_title="Magnitude",
            font={'size': 24},
        ),
        # Fixed axis ranges so every frame shares the same scale.
        layout_xaxis_range=[FREQ_MIN, FREQ_MAX],
        layout_yaxis_range=[0, 1],
    )
    fig.add_trace(go.Scatter(x=xf, y=p))

    for entry in notes:
        freq, label, magnitude = entry[0], entry[1], entry[2]
        fig.add_annotation(
            x=freq + 10,  # shift the label slightly to the right of its peak
            y=magnitude,
            text=label,
            font={'size': 48},
            showarrow=False,
        )
    return fig

def extract_sample(audio, frame_number):
    """Return the window of samples that ends at the given frame.

    The window ends at ``frame_number * FRAME_OFFSET`` and starts
    ``FFT_WINDOW_SIZE`` samples earlier.  When the window would start before
    the beginning of ``audio`` the missing leading part is filled with zeros.

    Args:
        audio: Sample array to slice.
        frame_number: Index of the frame whose window is wanted.

    Returns:
        The slice of ``audio`` for the window, zero-padded at the front when
        it reaches before sample 0.
    """
    end = frame_number * FRAME_OFFSET
    begin = int(end - FFT_WINDOW_SIZE)

    # Common case: the whole window lies inside the signal.
    if end != 0 and begin >= 0:
        return audio[begin:end]

    # The window starts before sample 0: synthesise the missing leading zeros.
    leading_zeros = np.zeros((np.abs(begin)), dtype=float)
    if end == 0:
        return leading_zeros
    return np.concatenate([leading_zeros, audio[0:end]])

def find_top_notes(fft, num):
    """Pick the strongest spectral bins that map to distinct note names.

    Bins are visited from largest to smallest magnitude (ties keep their
    original bin order).  Each bin's frequency is looked up in the module-level
    ``xf`` array and converted to a note name; a bin is kept only if its name
    has not been seen yet.

    Args:
        fft: Magnitude spectrum for one frame (only the real part is used).
        num: Maximum number of notes to return.

    Returns:
        A list of ``[frequency, name, magnitude]`` entries, strongest first.
        Empty when the spectrum is empty or its peak is below 0.001.
    """
    magnitudes = np.asarray(fft).real
    # Treat an empty or near-silent frame as having no notes.
    if magnitudes.size == 0 or np.max(magnitudes) < 0.001:
        return []

    # Stable descending order: negating keeps equal magnitudes in bin order.
    strongest_first = np.argsort(-magnitudes, kind="stable")

    found = []
    found_note = set()
    for bin_index in strongest_first:
        if len(found) >= num:
            break
        f = xf[bin_index]
        # The DC bin has frequency 0; log2(0) is -inf and cannot be rounded to
        # a note number, so bins without a positive frequency are skipped.
        if f <= 0:
            continue
        name = indian_note_name(int(round(freq_to_number(f))))
        if name in found_note:
            continue
        found_note.add(name)
        found.append([f, name, magnitudes[bin_index]])
    return found

def freq_to_number(f):
    """Convert a frequency to a (fractional) note number where 440.0 maps to 69."""
    semitones_from_reference = 12 * np.log2(f / 440.0)
    return 69 + semitones_from_reference
def number_to_freq(n):
    """Convert a note number to a frequency where 69 maps to 440."""
    octaves_from_reference = (n - 69) / 12.0
    return 440 * 2.0 ** octaves_from_reference

# Swara label for each of the 12 semitones of an octave.  Pitch class 0 is
# labelled "Sa" (the same anchor the previous lookup used); the remaining
# entries follow in semitone order.
_SWARA_BY_PITCH_CLASS = (
    "Sa", "komal Re", "Re", "komal Ga", "Ga", "Ma",
    "Teevra Ma", "Pa", "komal Dha", "Dha", "komal Ni", "Ni",
)

# Function to map Western notes to Indian notes with conventions
def indian_note_name(n):
    """Return the swara label plus octave number for an integer note number.

    The note number is split into a pitch class (``n % 12``) and an octave
    (``n // 12 - 1``).  Each of the 12 pitch classes gets its own label, so
    distinct pitches within an octave never share a name.

    Args:
        n: Integer note number (e.g. the rounded result of ``freq_to_number``).

    Returns:
        A string such as ``"Sa4"`` or ``"komal Re4"``: the label with a single
        space between qualifier and swara, immediately followed by the octave.
    """
    pitch_class = int(n % 12)
    # Floor division keeps the octave correct for negative note numbers too
    # (truncating toward zero would put n = -1 in the wrong octave).
    octave = int(n // 12) - 1
    return f"{_SWARA_BY_PITCH_CLASS[pitch_class]}{octave}"

# Hann window, computed by hand over FFT_WINDOW_SIZE points with the endpoint
# excluded (the final `False` is linspace's `endpoint` argument).
window = 0.5 * (1 - np.cos(np.linspace(0, 2 * np.pi, FFT_WINDOW_SIZE, False)))
# Frequency of each rfft bin for a window of FFT_WINDOW_SIZE samples at rate fs.
xf = np.fft.rfftfreq(FFT_WINDOW_SIZE, 1 / fs)
# Number of frames to analyse: clip duration times FPS, truncated to an int.
FRAME_COUNT = int(AUDIO_LENGTH * FPS)
# Whole number of samples to advance per frame.
# NOTE(review): raises ZeroDivisionError when FRAME_COUNT is 0 (clip shorter
# than one frame).
FRAME_OFFSET = int(len(audio) / FRAME_COUNT)

# Pass 1: scan the spectrum of every frame and remember the largest magnitude
# seen, so later passes can divide by it to normalise.
mx = 0
for frame_number in range(FRAME_COUNT):
    sample = extract_sample(audio, frame_number)
    fft = np.abs(np.fft.rfft(sample * window)).real
    mx = max(fft.max(), mx)

print(f"Max amplitude: {mx}")

# Pass 2: Produce the animation and PDF of notations
# Create the output PDF canvas using reportlab's `letter` page size.
pdf_file = "notations.pdf"
c = canvas.Canvas(pdf_file, pagesize=letter)
width, height = letter  # page dimensions; `height` anchors the text positions below

# Add "SwarSetu" logo type text
c.setFont("Helvetica-Bold", 24)
c.drawString(50, height - 50, "SwarSetu")

# Add title
c.setFont("Helvetica-Bold", 16)
c.drawString(50, height - 80, "Musical Notations from Audio Analysis")

# Add notations to PDF
# Body font and the y coordinate at which the first notation line is drawn.
c.setFont("Helvetica", 12)
y_position = height - 120

# Gather every distinct note name detected in any frame.
notations_set = set()

for frame_number in tqdm.tqdm(range(FRAME_COUNT)):
    sample = extract_sample(audio, frame_number)
    # Normalised magnitude spectrum for this frame.
    fft = np.abs(np.fft.rfft(sample * window)) / mx
    s = find_top_notes(fft, TOP_NOTES)

    # Entry [1] of each result is the note name.
    notations_set.update(note[1] for note in s)

# Emit one notation per line, in sorted order, moving down 20 units per line.
for note in sorted(notations_set):
    c.drawString(50, y_position, note)
    y_position -= 20

    if y_position >= 50:
        continue

    # Out of room on this page: start a new one, restore the body font and
    # resume from the top margin.
    c.showPage()
    c.setFont("Helvetica", 12)
    y_position = height - 50

# Finalise and write the PDF file.
c.save()
# Render one spectrum image per frame.
for frame_number in tqdm.tqdm(range(FRAME_COUNT)):
    sample = extract_sample(audio, frame_number)
    # Normalised magnitude spectrum and the notes detected in it.
    fft = np.abs(np.fft.rfft(sample * window)) / mx
    s = find_top_notes(fft, TOP_NOTES)

    # Plot this frame's spectrum, annotated with the detected note names.
    fig = plot_fft(fft, xf, fs, s)

    # Zero-padded, sequential file names: frame0000.png, frame0001.png, ...
    img_filename = f"frame{frame_number:04d}.png"
    fig.write_image(img_filename, engine="kaleido")
# # Run FFmpeg to create the video
# ffmpeg_command = [
#    'ffmpeg',
#     '-y',  # Overwrite output file if it exists
#     '-r', str(FPS),  # Frame rate
#     '-f', 'image2',  # Input format (image sequence)
#     '-s', '1920x1080',  # Output resolution
#     '-i', 'frame%d.png',  # Input file pattern for images
#     '-i', AUDIO_FILE,  # Input audio file
#     '-c:v', 'libx264',  # Video codec
#     '-pix_fmt', 'yuv420p',  # Pixel format
#     'movie.mp4'  # Output video file
# ]

# # Execute the FFmpeg command
# subprocess.run(ffmpeg_command, check=True)

# The ffmpeg invocation in this cell is commented out, so at this point only
# the frame images and the PDF exist; report that accurately.
print("Frame rendering complete and PDF of notations generated.")


Max amplitude: 12820516.52442033


100%|██████████| 901/901 [00:03<00:00, 242.56it/s]
100%|██████████| 901/901 [02:38<00:00,  5.69it/s]

Video creation complete and PDF of notations generated.





In [None]:
# Combine the frame%04d.png image sequence (read at 30 fps) with flute1.wav
# into movie.mp4 using libx264 / yuv420p; -y overwrites an existing output.
!ffmpeg -y -r 30 -f image2 -s 1920x1080 -i frame%04d.png -i flute1.wav -c:v libx264 -pix_fmt yuv420p movie.mp4

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [None]:
# Run `ffmpeg -version`; a returncode of 0 shows the binary is installed and runnable.
subprocess.run(['ffmpeg', '-version'])

CompletedProcess(args=['ffmpeg', '-version'], returncode=0)

In [None]:
# import os
# import numpy as np
# import tqdm
# import plotly.graph_objects as go
# import subprocess
# from pydub import AudioSegment
# from scipy.fftpack import fft
# from reportlab.lib.pagesizes import letter
# from reportlab.pdfgen import canvas

# # Configuration
# FPS = 30
# FFT_WINDOW_SECONDS = 0.25
# FREQ_MIN = 10
# FREQ_MAX = 1000
# TOP_NOTES = 3
# RESOLUTION = (1920, 1080)
# SCALE = 2
# AUDIO_FILE = "flute1.wav"  # Path to your audio file

# # Notation mappings
# INDIAN_NOTE_NAMES = {
#     0: "Sa", 1: "Re", 2: "Ga", 3: "Ma", 4: "Pa", 5: "Dha", 6: "Ni",
#     7: "Sa", 8: "komal Re", 9: "komal Ga", 10: "Teevra Ma", 11: "komal Dha", 12: "komal Ni",
#     13: "Teevra Sa", 14: "Teevra Re", 15: "Teevra Ga", 16: "Teevra Ma", 17: "Teevra Pa", 18: "Teevra Dha", 19: "Teevra Ni"
# }
# INDIAN_NOTE_CONVENTIONS = """
# Notation Documentation:
# - Lower case letters represent Komal swara:
#   - 'n' represents Komal Ni
#   - 'r' represents Komal Re
#   - 'g' represents Komal Ga
#   - 'd' represents Komal Dha
#   - 'm' represents Komal Ma (only in case of `Ma` variation)
# - Upper case letters represent Shudh swara:
#   - 'P' represents Shudh Pa
#   - 'S' represents Shudh Sa
#   - 'R' represents Shudh Re
#   - 'G' represents Shudh Ga
#   - 'D' represents Shudh Dha
#   - 'M' represents Shudh Ma
# - '#' signifies Teevra sur:
#   - 'M#' represents Teevra Ma
# """

# # Load the audio file
# audio_segment = AudioSegment.from_file(AUDIO_FILE)
# fs = audio_segment.frame_rate
# audio = np.array(audio_segment.get_array_of_samples(), dtype=float)

# # Handle stereo audio by averaging channels if necessary
# if audio_segment.channels == 2:
#     audio = audio.reshape(-1, 2).mean(axis=1)

# FRAME_STEP = (fs / FPS)
# FFT_WINDOW_SIZE = int(fs * FFT_WINDOW_SECONDS)
# AUDIO_LENGTH = len(audio) / fs

# def plot_fft(p, xf, fs, notes, dimensions=(960,540)):
#     layout = go.Layout(
#         title="Frequency Spectrum",
#         autosize=False,
#         width=dimensions[0],
#         height=dimensions[1],
#         xaxis_title="Frequency (note)",
#         yaxis_title="Magnitude",
#         font={'size': 24}
#     )
#     fig = go.Figure(layout=layout,
#                     layout_xaxis_range=[FREQ_MIN, FREQ_MAX],
#                     layout_yaxis_range=[0, 1])
#     fig.add_trace(go.Scatter(x=xf, y=p))
#     for note in notes:
#         fig.add_annotation(x=note[0] + 10, y=note[2],
#                            text=note[1],
#                            font={'size': 48},
#                            showarrow=False)
#     return fig

# def extract_sample(audio, frame_number):
#     end = frame_number * FRAME_OFFSET
#     begin = int(end - FFT_WINDOW_SIZE)
#     if end == 0:
#         return np.zeros((np.abs(begin)), dtype=float)
#     elif begin < 0:
#         return np.concatenate([np.zeros((np.abs(begin)), dtype=float), audio[0:end]])
#     else:
#         return audio[begin:end]

# def find_top_notes(fft, num):
#     if np.max(fft.real) < 0.001:
#         return []
#     lst = [x for x in enumerate(fft.real)]
#     lst = sorted(lst, key=lambda x: x[1], reverse=True)
#     idx = 0
#     found = []
#     found_note = set()
#     while (idx < len(lst)) and (len(found) < num):
#         f = xf[lst[idx][0]]
#         y = lst[idx][1]
#         n = freq_to_number(f)
#         n0 = int(round(n))
#         name = indian_note_name(n0)
#         if name not in found_note:
#             found_note.add(name)
#             s = [f, name, y]
#             found.append(s)
#         idx += 1
#     return found

# def freq_to_number(f): return 69 + 12 * np.log2(f / 440.0)
# def number_to_freq(n): return 440 * 2.0 ** ((n - 69) / 12.0)

# # Function to map Western notes to Indian notes with conventions
# def indian_note_name(n):
#     index = n % 12
#     octave = int(n / 12) - 1
#     indian_note = INDIAN_NOTE_NAMES.get(index, "Unknown")
#     if "komal" in indian_note:
#         indian_note = indian_note.replace("komal", "komal ")
#     if "Teevra" in indian_note:
#         indian_note = indian_note.replace("Teevra", "Teevra ")
#     if index == 3:  # Teevra Ma
#         indian_note = "Teevra Ma"
#     return f"{indian_note}{octave}"

# # Hanning window function
# window = 0.5 * (1 - np.cos(np.linspace(0, 2 * np.pi, FFT_WINDOW_SIZE, False)))
# xf = np.fft.rfftfreq(FFT_WINDOW_SIZE, 1 / fs)
# FRAME_COUNT = int(AUDIO_LENGTH * FPS)
# FRAME_OFFSET = int(len(audio) / FRAME_COUNT)

# # Pass 1: Find out the maximum amplitude so we can scale.
# mx = 0
# for frame_number in range(FRAME_COUNT):
#     sample = extract_sample(audio, frame_number)
#     fft = np.fft.rfft(sample * window)
#     fft = np.abs(fft).real
#     mx = max(np.max(fft), mx)

# print(f"Max amplitude: {mx}")

# # Pass 2: Produce the animation and PDF of notations
# pdf_file = "notations.pdf"
# c = canvas.Canvas(pdf_file, pagesize=letter)
# width, height = letter

# # Add notation documentation at the start of the PDF
# c.drawString(50, height - 50, INDIAN_NOTE_CONVENTIONS)
# c.translate(0, -100)  # Move down to start the frame details

# for frame_number in tqdm.tqdm(range(FRAME_COUNT)):
#     sample = extract_sample(audio, frame_number)
#     fft = np.fft.rfft(sample * window)
#     fft = np.abs(fft) / mx
#     s = find_top_notes(fft, TOP_NOTES)
#     fig = plot_fft(fft.real, xf, fs, s, RESOLUTION)
#     fig.write_image(f"frame{frame_number}.png", scale=SCALE)

#     # Add notation info to PDF
#     c.drawString(50, height - 150 - (frame_number * 20), f"Frame {frame_number}: {', '.join(note[1] for note in s)}")

# # Save the PDF
# c.save()

# # Run FFmpeg to create the video
# ffmpeg_command = [
#     'ffmpeg',
#     '-y',  # Overwrite output file if it exists
#     '-r', str(FPS),  # Frame rate
#     '-f', 'image2',  # Input format (image sequence)
#     '-s', '1920x1080',  # Output resolution
#     '-i', 'frame%d.png',  # Input file pattern for images
#     '-i', AUDIO_FILE,  # Input audio file
#     '-c:v', 'libx264',  # Video codec
#     '-pix_fmt', 'yuv420p',  # Pixel format
#     'movie.mp4'  # Output video file
# ]

# # Execute the FFmpeg command
# subprocess.run(ffmpeg_command, check=True)

# print("Video creation complete and PDF of notations generated.")
