# AI Video Editor


## Generating the MP3 from MP4 Video

In [None]:
!ffmpeg -analyzeduration 100M -probesize 100M -i input.mp4 -vn -q:a 0 -map a audio.mp3

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

## Transcription

In [None]:
!pip install groq



In [None]:

import os
from groq import Groq

# Read the API key from the environment instead of embedding it in the notebook.
# os.environ[...] raises KeyError right away if GROQ_API_KEY is not set.
client = Groq(api_key=os.environ["GROQ_API_KEY"])
filename = "audio.mp3"

# Send the whole MP3 to the transcription endpoint. The "verbose_json" response
# format is requested because later cells read `transcription.segments`.
with open(filename, "rb") as file:
    transcription = client.audio.transcriptions.create(
        file=(filename, file.read()),
        model="whisper-large-v3-turbo",
        response_format="verbose_json",
    )

print(transcription.segments)


[{'id': 0, 'seek': 0, 'start': 0, 'end': 3.12, 'text': " He didn't die, he lived, but he was a really healthy guy.", 'tokens': [50365, 634, 994, 380, 978, 11, 415, 5152, 11, 457, 415, 390, 257, 534, 4627, 2146, 13, 50521], 'temperature': 0, 'avg_logprob': -0.18401876, 'compression_ratio': 1.7934783, 'no_speech_prob': 3.009394e-12}, {'id': 1, 'seek': 0, 'start': 3.14, 'end': 3.58, 'text': ' He was an athlete.', 'tokens': [50522, 634, 390, 364, 18002, 13, 50544], 'temperature': 0, 'avg_logprob': -0.18401876, 'compression_ratio': 1.7934783, 'no_speech_prob': 3.009394e-12}, {'id': 2, 'seek': 0, 'start': 3.58, 'end': 4.64, 'text': ' But he was psyched out the same after his life.', 'tokens': [50544, 583, 415, 390, 4681, 292, 484, 264, 912, 934, 702, 993, 13, 50597], 'temperature': 0, 'avg_logprob': -0.18401876, 'compression_ratio': 1.7934783, 'no_speech_prob': 3.009394e-12}, {'id': 3, 'seek': 0, 'start': 4.64, 'end': 8.86, 'text': ' Yeah, he had knee problems, and he took Vioxx, and all of 

In [None]:
# Keep the two parts of the transcription result under their own names:
# the timestamped segments and the plain transcript text.
transcription_segments = transcription.segments
transcribed_text = transcription.text

In [None]:
import os

from groq import Groq

# Read the API key from the environment instead of embedding it in the notebook.
# os.environ[...] raises KeyError right away if GROQ_API_KEY is not set.
client = Groq(api_key=os.environ["GROQ_API_KEY"])

# Ask the chat model to list filler words/phrases found in the transcript.
# response_format={"type": "json_object"} requests a JSON reply, and the system
# prompt pins the single expected key: 'filler_words'.
completion = client.chat.completions.create(
    model="llama-3.3-70b-versatile",
    messages=[
        {
            "role": "system",
            "content": "You are an AI assistant analyzing a transcript. Your task is to identify and return only the filler words or unnecessary phrases that can be removed without altering the core meaning of the text. Respond exclusively in a JSON format with a single key: 'filler_words'."
        },
        {
            "role": "user",
            "content": transcribed_text
        }
    ],
    temperature=0.7,
    max_completion_tokens=512,
    top_p=1,
    stream=False,
    response_format={"type": "json_object"},
)

print(completion.choices[0].message)


ChatCompletionMessage(content='{\n   "filler_words": [\n      "Yeah",\n      "I mean",\n      "Like",\n      "So",\n      "You know",\n      "Exactly",\n      "Right",\n      "It\'s like",\n      "I think",\n      "You\'re like",\n      "I\'m like",\n      "That\'s crazy",\n      "It\'s crazy",\n      "Uh",\n      "Oh yeah"\n   ]\n}', role='assistant', function_call=None, reasoning=None, tool_calls=None)


In [None]:
import json

# Decode the JSON text of the first completion choice.
response_content = completion.choices[0].message.content
filler_words_dict = json.loads(response_content)

# Fall back to an empty list when the reply has no "filler_words" key.
filler_words = filler_words_dict.get("filler_words", [])

print("Filler Words Identified:\n", filler_words)


Filler Words Identified:
 ['Yeah', 'I mean', 'Like', 'So', 'You know', 'Exactly', 'Right', "It's like", 'I think', "You're like", "I'm like", "That's crazy", "It's crazy", 'Uh', 'Oh yeah']


## Silence Detection

In [None]:
!pip install pydub



In [None]:
from pydub import AudioSegment
from pydub.silence import detect_silence

# Load the extracted audio track.
audio = AudioSegment.from_file("audio.mp3")

# Find the silent stretches using the thresholds below
# (pydub documents min_silence_len in milliseconds and silence_thresh in dBFS).
silence_ranges = detect_silence(audio, min_silence_len=600, silence_thresh=-40)

# The detected ranges are divided by 1000 to express them in seconds,
# the unit used by the transcript segment timestamps.
silence_ranges_seconds = []
for start_ms, end_ms in silence_ranges:
    silence_ranges_seconds.append((start_ms / 1000, end_ms / 1000))

print(f"Silence Ranges (seconds): {silence_ranges_seconds}")


✅ Silence Ranges (seconds): [(27.232, 27.936), (34.773, 35.558), (41.197, 41.761), (53.676, 54.298), (60.846, 61.448), (71.516, 72.243), (75.089, 75.734), (87.541, 88.227), (99.366, 100.238), (108.969, 109.473), (111.342, 112.167), (114.264, 115.006), (129.174, 129.676), (142.62, 143.253), (156.453, 157.284), (172.058, 172.582), (182.459, 183.042), (183.341, 183.873), (186.732, 187.518), (235.719, 236.473), (242.555, 243.144), (247.644, 248.617), (253.182, 253.877), (256.208, 256.782), (258.385, 259.102), (266.722, 267.356), (277.409, 277.992), (291.494, 291.998), (298.094, 298.717)]


## Trim Ranges (Silence + Filler words)

In [None]:
import re

# One case-insensitive, whole-word pattern per filler phrase, compiled once
# instead of once per segment.
filler_patterns = [
    re.compile(r'\b' + re.escape(filler) + r'\b', re.IGNORECASE)
    for filler in filler_words
]

trim_points = []

for segment in transcription_segments:
    text = segment["text"]
    start_time = segment["start"]
    duration = segment["end"] - start_time

    for pattern in filler_patterns:
        for match in pattern.finditer(text):
            # The segment has no per-word timestamps here, so the filler's time
            # span is estimated by mapping its character offsets linearly onto
            # the segment's [start, end] interval.
            filler_start_time = start_time + ((match.start() / len(text)) * duration)
            filler_end_time = start_time + ((match.end() / len(text)) * duration)
            trim_points.append((filler_start_time, filler_end_time))

# Merge filler estimates with the detected silences, ordered by start time.
all_trim_ranges = trim_points + silence_ranges_seconds
all_trim_ranges.sort()

print(f"Trim Points (Filler Words): {trim_points}")
print(f"All Trim Ranges (Filler + Silence): {all_trim_ranges}")


✅ Trim Points (Filler Words): [(4.6848936170212765, 4.864468085106383), (10.357254901960784, 10.487450980392156), (10.692533333333333, 10.920266666666667), (22.58551724137931, 22.904137931034484), (24.012063492063493, 24.22031746031746), (30.0612987012987, 30.796883116883116), (38.78752688172043, 38.97763440860215), (57.249756097560976, 57.47414634146342), (53.996097560975606, 54.10829268292683), (56.969268292682926, 57.47414634146342), (66.98849557522124, 67.26743362831859), (70.04563683544305, 70.3081841772152), (74.96839949367089, 75.23094683544305), (75.82167835443039, 76.21549936708861), (72.14601556962026, 72.40856291139241), (77.65950974683545, 78.18460443037975), (74.77148898734178, 75.23094683544305), (81.34561000000001, 81.56561), (82.06061, 82.28061), (84.30907153846154, 84.54291769230768), (91.61676384615384, 91.85061), (84.65984076923077, 85.12753307692307), (96.10394333333333, 96.47727666666667), (97.29499356164384, 97.6347195890411), (98.14430863013698, 98.48403465753424

## Trimming Audio using Librosa

In [None]:
import librosa
import numpy as np
import soundfile as sf


# Decode the audio; sr=None keeps the file's native sampling rate.
y, sr = librosa.load("audio.mp3", sr=None)

# Convert each (start, end) trim range from seconds to sample indices,
# processing the ranges in start-time order.
samples_to_remove = [
    (int(start_time * sr), int(end_time * sr))
    for start_time, end_time in sorted(all_trim_ranges)
]

audio_trimmed = []
last_end = 0

for start_sample, end_sample in samples_to_remove:
    # Keep the audio between the previous cut and this one, if any.
    if start_sample > last_end:
        audio_trimmed.append(y[last_end:start_sample])
    # Trim ranges can overlap or nest (a filler estimate inside a silence range).
    # Never move the cut position backwards, otherwise audio that was already
    # removed would be re-inserted.
    last_end = max(last_end, end_sample)

# Everything after the final cut is kept.
audio_trimmed.append(y[last_end:])

audio_trimmed = np.concatenate(audio_trimmed)
sf.write("cleaned_audio.mp3", audio_trimmed, sr)

print("Audio Processing Complete! Cleaned file saved as 'cleaned_audio.mp3'")


✅ Audio Processing Complete! Cleaned file saved as 'cleaned_audio.mp3'


## Trimming the Whole Video

In [None]:
import subprocess

def trim_video(input_path, output_path, keep_ranges):
    """Re-encode a video keeping only the given time ranges.

    Builds ffmpeg `select`/`aselect` filters out of one `between(t,start,end)`
    term per range (joined with `+`), and resets the timestamps with
    `setpts`/`asetpts` so the kept pieces play back to back.

    Args:
        input_path: Path of the source video.
        output_path: Path of the video to write. An existing file is
            overwritten (`-y`) so the call can be repeated.
        keep_ranges: Iterable of (start, end) pairs, in seconds, to keep.

    Returns:
        The `subprocess.CompletedProcess` of the ffmpeg invocation.

    Raises:
        ValueError: If `keep_ranges` is empty (the filter expression would be empty).
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    keep_ranges = list(keep_ranges)
    if not keep_ranges:
        raise ValueError("keep_ranges is empty; there is nothing to keep in the output video")

    # The same selection expression drives both the video and the audio filter.
    selection = "+".join(f"between(t,{start},{end})" for start, end in keep_ranges)

    cmd = [
        "ffmpeg",
        "-y",
        "-i", input_path,
        "-vf", f"select='{selection}',setpts=N/FRAME_RATE/TB",
        "-af", f"aselect='{selection}',asetpts=N/SR/TB",
        "-c:v", "libx264",
        "-preset", "slow",
        "-crf", "18",
        "-c:a", "aac",
        "-b:a", "192k",
        output_path,
    ]

    # check=True surfaces ffmpeg failures instead of silently ignoring them.
    return subprocess.run(cmd, check=True)


# Invert the trim ranges: everything between consecutive cuts is kept.
keep_ranges = []
current_start = 0

all_trim_ranges.sort()

for start, end in all_trim_ranges:
    # Only emit a non-empty gap between the previous cut and this one.
    if current_start < start:
        keep_ranges.append((current_start, start))
    # Trim ranges can overlap or nest (a filler estimate inside a silence range).
    # Never move the cursor backwards, otherwise part of an already-trimmed
    # range would end up in the kept output.
    current_start = max(current_start, end)

# Keep the tail after the last cut, up to the end of the audio.
if current_start < audio.duration_seconds:
    keep_ranges.append((current_start, audio.duration_seconds))

print(f"Keep Ranges: {keep_ranges}")
trim_video("input.mp4", "output.mp4", keep_ranges)


✅ Keep Ranges: [(0, 4.6848936170212765), (4.864468085106383, 10.357254901960784), (10.487450980392156, 10.692533333333333), (10.920266666666667, 22.58551724137931), (22.904137931034484, 24.012063492063493), (24.22031746031746, 27.232), (27.936, 30.0612987012987), (30.796883116883116, 34.773), (35.558, 38.78752688172043), (38.97763440860215, 41.197), (41.761, 53.676), (54.10829268292683, 56.969268292682926), (57.47414634146342, 60.846), (61.448, 66.98849557522124), (67.26743362831859, 70.04563683544305), (70.3081841772152, 71.516), (72.40856291139241, 74.77148898734178), (75.734, 75.82167835443039), (76.21549936708861, 77.65950974683545), (78.18460443037975, 81.34561000000001), (81.56561, 82.06061), (82.28061, 84.30907153846154), (84.54291769230768, 84.65984076923077), (85.12753307692307, 87.541), (88.227, 91.61676384615384), (91.85061, 96.10394333333333), (96.47727666666667, 96.95526753424657), (97.6347195890411, 97.88951410958904), (98.05937712328767, 98.14430863013698), (98.484034657