In [1]:
import subprocess
from dotenv import load_dotenv
load_dotenv()

def run_shell_command(command):
    """
    Execute a command line through the shell, echo what it printed, and return it.

    Args:
    - command (str): The shell command to execute.

    Returns:
    - str: The command's stdout with surrounding whitespace removed, or
      None when the command exits with a non-zero status (the stripped
      stderr is printed in that case).
    """
    try:
        completed = subprocess.run(
            command,
            shell=True,
            check=True,
            capture_output=True,  # same as piping both stdout and stderr
            text=True,
        )
    except subprocess.CalledProcessError as err:
        # Non-zero exit status: report stderr and signal failure with None.
        print(f"Command Failed. Error Message: {err.stderr.strip()}")
        return None

    output = completed.stdout.strip()
    print(output)
    return output

In [2]:
from faster_whisper import WhisperModel

# Name of the Whisper checkpoint to load; handed to WhisperModel as-is.
model_size = "medium"
# Loaded once here and reused by the transcription cell (model.transcribe).
model = WhisperModel(model_size, compute_type="float32")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# When False, the content-generation and TTS cells call the OpenAI API.
# Otherwise the content cell uses its canned response and the TTS request is skipped.
dummy = False

In [4]:
#@markdown # **Content Generation** 🚀

import os, json, re, random
from openai import OpenAI

# OpenAI client; the API key is read from the process environment.
client = OpenAI(
    # This is the default and can be omitted
    api_key = os.getenv('OPENAI_API_KEY'),
)

# Candidate subjects; one is picked at random for every run.
topics = [
    "space",
    "planets",
    "universe",
    "science",
    "history",
    "technology",
    "books",
    "quotes",
    "reading",
    "stoicism",
    "travel",
    "horror",
    "war",
    "meditation",
    "book summaries (philosophy)",
    "DIY tips"
]

random_topic = random.choice(topics)

if not dummy:
    # The prompt text below is kept verbatim; the model is asked for a JSON object
    # with the keys script / seoTitle / seoDescription / seoHashtags.
    chat_completion = client.chat.completions.create(
        messages = [
            {
                "role": "system",
                "content": "You are an expert short form video content generator, which is very cusious to listen to, user get's deeply involved in your generated content.",
            },
            {
                "role": "user",
                "content": "Generate a very curious/ mind boggling fact on topic: " + random_topic + ", containing approx 550 characters long story for short form 1 min long video. Include a very curious hook in the opening statements of the audioScript, so that usera are hooked to what's next. Also use appropriate punctuations wherever necessary which can be picked up by TTS engine. The script you provide will be converted to speech using TTS, so give your write it accordingly, with appropriate punctuations. Also return seo title, seo description and seo hashtags for youtube uploads. Return your answer strictly in this json format: { 'script': '', seoTitle: '', seoDescription: '', seoHashtags: '' }"
            }
        ],
        model = "gpt-3.5-turbo-1106",
        response_format = { "type": "json_object" },
    )

    content = json.loads(chat_completion.choices[0].message.content)
else:
    # Canned response so the rest of the notebook can run without an API call.
    content = {
        "script": "Did you know that the universe is expanding at a mind-boggling rate? In fact, new observations suggest that the expansion of the universe may be accelerating. This goes against the traditional understanding of gravity and raises profound questions about the nature of space, time, and the cosmos. Scientists believe that a mysterious force called dark energy may be responsible for this cosmic acceleration, but its true nature remains one of the greatest mysteries in astrophysics. Imagine a force that is pushing all the galaxies in the universe away from each other at an ever-increasing speed, creating a space that's expanding faster than our ability to comprehend. It's a concept that challenges our fundamental understanding of the universe and opens up a realm of possibilities that are both exhilarating and perplexing. As we continue to unravel the secrets of space, the mysteries of cosmic expansion remind us that there is so much more to the universe than meets the eye.",

        "seoTitle": "Mind-Boggling Space Fact: The Universe's Mysterious Expansion",

        "seoDescription": "Discover the mind-boggling truth about the universe's accelerating expansion and the enigmatic force of dark energy. Explore the mysteries of space and cosmic evolution!",

        "seoHashtags": "#SpaceFacts #CosmicExpansion #DarkEnergyMystery #Astrophysics"
    }

# Fail with a readable message instead of a TypeError inside re.sub when the
# response carries no usable script.
script = content.get("script")
if not script:
    raise ValueError("Generated content has no 'script' field: " + json.dumps(content))

# Remove hashtag tokens so they are not read aloud. This runs for both branches,
# so audioScript is always defined once the cell has run.
audioScript = re.sub(r'#[a-zA-Z0-9_]+', '', script)

seoTitle = content.get("seoTitle")
seoHashtags = content.get("seoHashtags")
seoDescription = content.get("seoDescription")
# Placeholder; the tag-generation cell assigns the real list.
videoTags = []

# First word of the chosen topic, used later as an extra stock-footage search term.
fallBackTag = random_topic.split(" ")[0]

print(json.dumps(content, indent=4))


{
    "script": "Did you know that the Great Wall of China is not actually visible from the moon? Yes, you heard it right! Despite the popular myth, astronauts have confirmed that the wall is not easily distinguishable from the lunar surface. The curvature of the Earth and the vastness of space make it nearly impossible to see any man-made structure with the naked eye. So, next time someone tells you that the Great Wall is the only man-made structure visible from space, you can share this mind-boggling fact with them. Pretty fascinating, isn't it?",
    "seoTitle": "Mind-Blowing Fact about the Great Wall of China | Not Visible from the Moon",
    "seoDescription": "Discover the truth about the Great Wall of China and its visibility from the moon. Explore more mind-boggling travel facts with us!",
    "seoHashtags": "#GreatWall #TravelFacts #MindBlowingDiscoveries"
}


In [5]:
#@markdown # **Audio File (TTS)** 🚀

import os
from datetime import datetime
from openai import OpenAI
from IPython.display import Audio

# OpenAI client; the API key is read from the process environment.
client = OpenAI(
    # This is the default and can be omitted
    api_key = os.getenv('OPENAI_API_KEY'),
)

# Location of the narration track; later cells read the same file.
audioPath = "./assets/audios/ai_audio.mp3"
audiofilename = audioPath

if not dummy:
    # Make sure the target folder exists before the speech file is written,
    # so a fresh checkout does not fail on a missing directory.
    os.makedirs(os.path.dirname(audioPath), exist_ok=True)

    response = client.audio.speech.create(
        model="tts-1",
        voice="alloy",
        input=audioScript,
    )
    response.stream_to_file(audioPath)

# Last expression of the cell: renders an inline audio player for the file.
Audio(audioPath)

Word-level transcription below uses faster-whisper: https://github.com/guillaumekln/faster-whisper

In [6]:
import json

# Transcribe the narration with per-word timestamps.
segments, info = model.transcribe(audioPath, word_timestamps=True)
segments = list(segments)  # The transcription will actually run here.

# One pass over every word: echo its timing and keep the raw record.
wordlevel_info = []
for segment in segments:
    for word in segment.words:
        print("[%.2fs -> %.2fs] %s" % (word.start, word.end, word.word))
        wordlevel_info.append({'word':word.word,'start':word.start,'end':word.end})

# Same records with surrounding whitespace removed from each word.
modified_wordlevel_info = [
    {'start': item['start'], 'end': item['end'], 'word': item['word'].strip()}
    for item in wordlevel_info
]

# with open('./assets/files/data.json', 'w') as f:
#     json.dump(modified_wordlevel_info, f,indent=4)



Estimating duration from bitrate, this may be inaccurate


[0.00s -> 0.20s]  Did
[0.20s -> 0.38s]  you
[0.38s -> 0.58s]  know
[0.58s -> 0.78s]  that
[0.78s -> 0.98s]  the
[0.98s -> 1.16s]  Great
[1.16s -> 1.34s]  Wall
[1.34s -> 1.50s]  of
[1.50s -> 1.84s]  China
[1.84s -> 2.14s]  is
[2.14s -> 2.44s]  not
[2.44s -> 2.96s]  actually
[2.96s -> 3.40s]  visible
[3.40s -> 3.66s]  from
[3.66s -> 3.82s]  the
[3.82s -> 4.12s]  moon?
[4.76s -> 5.16s]  Yes,
[5.70s -> 5.76s]  you
[5.76s -> 5.96s]  heard
[5.96s -> 6.18s]  it
[6.18s -> 6.62s]  right.
[7.22s -> 7.48s]  Despite
[7.48s -> 7.74s]  the
[7.74s -> 8.12s]  popular
[8.12s -> 8.64s]  myth,
[9.24s -> 9.56s]  astronauts
[9.56s -> 9.86s]  have
[9.86s -> 10.32s]  confirmed
[10.32s -> 10.50s]  that
[10.50s -> 10.70s]  the
[10.70s -> 10.84s]  wall
[10.84s -> 11.08s]  is
[11.08s -> 11.30s]  not
[11.30s -> 11.80s]  easily
[11.80s -> 12.60s]  distinguishable
[12.60s -> 12.82s]  from
[12.82s -> 12.98s]  the
[12.98s -> 13.22s]  lunar
[13.22s -> 13.86s]  surface.
[14.34s -> 14.52s]  The
[14.52s -> 14.92s]  curva

In [7]:
def _build_subtitle_line(words):
    """Bundle a non-empty list of word records into one subtitle-line record."""
    return {
        "word": " ".join(item["word"] for item in words),
        "start": words[0]["start"],
        "end": words[-1]["end"],
        "textcontents": words,
    }


def split_text_into_lines(data, max_chars=15, max_duration=2.5, max_gap=1.5):
    """
    Group word-level timings into subtitle lines.

    Args:
    - data (list[dict]): Words in spoken order, each with the keys
      "word", "start" and "end" (times in seconds).
    - max_chars (int): A line is closed once its space-joined text is longer
      than this many characters.
    - max_duration (float): A line is closed once the summed duration of its
      words exceeds this many seconds.
    - max_gap (float): A pause longer than this many seconds between two
      consecutive words starts a new line.

    Returns:
    - list[dict]: One record per line with "word" (joined text), "start",
      "end" and "textcontents" (the word records of that line).

    The word that pushes a line over max_chars / max_duration stays on that
    line, so lines may slightly exceed those limits. A pause, however, closes
    the line *before* the word that follows it, so a line never spans the
    silence.
    """
    subtitles = []
    line = []
    line_duration = 0
    previous_end = None

    for word_data in data:
        start = word_data["start"]
        end = word_data["end"]

        # Split at the pause itself: flush what was spoken before the gap so
        # the word after the silence opens a fresh line.
        if line and previous_end is not None and start - previous_end > max_gap:
            subtitles.append(_build_subtitle_line(line))
            line = []
            line_duration = 0

        line.append(word_data)
        line_duration += end - start
        previous_end = end

        line_text = " ".join(item["word"] for item in line)
        if line_duration > max_duration or len(line_text) > max_chars:
            subtitles.append(_build_subtitle_line(line))
            line = []
            line_duration = 0

    # Whatever is left after the last word forms the final line.
    if line:
        subtitles.append(_build_subtitle_line(line))

    return subtitles
    
# Group the stripped word timings into subtitle lines; reused by the
# tag-generation prompt and by the caption rendering further down.
linelevel_subtitles = split_text_into_lines(modified_wordlevel_info)


In [8]:
import requests, random
import urllib.request, time, json

from datetime import datetime
from openai import OpenAI

# print(json.dumps(linelevel_subtitles, indent=4))

# One prompt line per subtitle line, carrying its timing and text.
transcript_rows = [
    f"start: {entry['start']}s , end: {entry['end']}s , line: {entry['word']}\n"
    for entry in linelevel_subtitles
]

# Prompt = format description + timed transcript + the tagging request.
transcript = (
    "I have a transcript of a video below in this format: start: <start_time>, end: <end_time>, line: <line_text>\n\n"
    + "".join(transcript_rows)
    + "\n\n\nI wwant to generate strictly some tags for this video transcript, each 3s long for he entire duration of the video that will be further used to generate stock footages for this video. Make sure to keep the time duration between tags same Return video tags in this json format: { 'tags': [{ start: <start_time>, end: <end_time>, tags: '' }] } \n\n"
)

client = OpenAI(
    # This is the default and can be omitted
    api_key = os.getenv('OPENAI_API_KEY'),
)

chat_completion = client.chat.completions.create(
    model="gpt-3.5-turbo-1106",
    response_format={"type": "json_object"},
    messages=[{"role": "user", "content": transcript}],
)

# The reply is parsed as JSON; only its 'tags' list is kept.
tag_response = json.loads(chat_completion.choices[0].message.content)
videoTags = tag_response['tags']

In [9]:

url = "https://api.pexels.com/videos/search"

# The Pexels key is read from the environment (e.g. the .env file loaded at the
# top of the notebook) so that no credential is stored in the notebook itself.
pexelsApiKey = os.getenv('PEXELS_API_KEY')
if not pexelsApiKey:
    raise RuntimeError("PEXELS_API_KEY is not set; add it to your environment or .env file.")

headers = {
    "Authorization": pexelsApiKey
}

# Downloads land here; create the folder up front so urlretrieve cannot fail on it.
os.makedirs("./assets/videos", exist_ok=True)

# Collect video links for each tags per sentence
for index, entry in enumerate(videoTags):
  print(entry)
  # Search with the generated tag first, then with the topic-level fallback tag.
  tags = [entry['tags'], fallBackTag]
  # tags = entry['tags'].replace("_", " ").split(",")

  print(tags)
  videos = []
  for tag in tags:
    searchResponse = requests.get(
        url,
        headers=headers,
        params={"query": tag, "per_page": 80},
        timeout=30,
    )
    # Surface HTTP errors (bad key, rate limit) here rather than as a
    # confusing failure while iterating the payload.
    searchResponse.raise_for_status()
    tagVideos = searchResponse.json().get('videos') or []

    # Only keep portrait files, and only those matching the size of the first
    # portrait file found for this tag.
    firstVideoWidth = 0
    firstVideoHeight = 0
    for tagVideo in tagVideos:
      for videoFile in tagVideo.get("video_files") or []:
        width = videoFile.get("width")
        height = videoFile.get("height")
        if not width or not height:
          # Missing or zero dimensions: the aspect ratio cannot be computed.
          continue
        if width / height >= 1:
          continue
        if firstVideoWidth == 0 or firstVideoHeight == 0:
          firstVideoWidth = width
          firstVideoHeight = height
          videos.append(videoFile)
        elif firstVideoWidth == width and firstVideoHeight == height:
          videos.append(videoFile)

  print("video len: ", len(videos))
  if not videos:
    raise ValueError(f"No portrait stock video found for tags {tags}")

  entry['video'] = random.choice(videos)
  # The index keeps names unique even when two downloads finish within the same second.
  videoPath = f"./assets/videos/stock_video_{int(time.time())}_{index}.mp4"
  urllib.request.urlretrieve(entry['video']['link'], videoPath)
  entry['video']['path'] = videoPath

# print(json.dumps(videoTags, indent=4))

{'start': '0.0s', 'end': '3.0s', 'tags': 'Great Wall of China'}
['Great Wall of China', 'travel']
video len:  20
{'start': '3.0s', 'end': '6.0s', 'tags': 'Moon visibility'}
['Moon visibility', 'travel']
video len:  10
{'start': '6.0s', 'end': '9.0s', 'tags': 'Astronaut confirmation'}
['Astronaut confirmation', 'travel']
video len:  13
{'start': '9.0s', 'end': '12.0s', 'tags': 'Lunar surface'}
['Lunar surface', 'travel']
video len:  17
{'start': '12.0s', 'end': '15.0s', 'tags': 'Earth curvature'}
['Earth curvature', 'travel']
video len:  4
{'start': '15.0s', 'end': '18.0s', 'tags': 'Space vastness'}
['Space vastness', 'travel']
video len:  12
{'start': '18.0s', 'end': '21.0s', 'tags': 'Naked eye visibility'}
['Naked eye visibility', 'travel']
video len:  13
{'start': '21.0s', 'end': '24.0s', 'tags': 'Man-made structures'}
['Man-made structures', 'travel']
video len:  10
{'start': '24.0s', 'end': '27.0s', 'tags': 'Visible from space'}
['Visible from space', 'travel']
video len:  6
{'star

# Merge Video and audio files

In [10]:
import time

def _tag_seconds(value):
    """Convert a tag timestamp such as '3.0s', '3.0' or 3 into float seconds."""
    return float(str(value).strip().rstrip("s"))


# Ask ffprobe for the narration length; run_shell_command returns None on failure.
probedDuration = run_shell_command('ffprobe -i assets/audios/ai_audio.mp3 -show_entries format=duration -v quiet -of csv="p=0"')
if not probedDuration:
    raise RuntimeError("ffprobe did not report a duration for assets/audios/ai_audio.mp3")

# Whole seconds of narration plus 2.
audioDuration = int(float(probedDuration)) + 2

# NOTE(review): this overrides the notebook-wide dummy flag, so the merge below always runs.
dummy = False
for i, video in enumerate(videoTags):
    videoPath = video['video']['path']
    # The last clip is stretched to the end of the audio; every other clip
    # covers its own tag window.
    clipEnd = audioDuration if i == len(videoTags) - 1 else _tag_seconds(video['end'])
    videoDuration = float(clipEnd) - _tag_seconds(video['start'])
    # The index and suffix keep every trimmed file distinct, even when several
    # trims finish within the same second, and distinct from the downloaded input.
    newVideoPath = f"assets/videos/stock_video_{int(time.time())}_{i}_trimmed.mp4"
    print(f"Trimming Video {i+1} to {videoDuration}s...")
    run_shell_command(f"ffmpeg -i {videoPath} -ss 00 -to {videoDuration} -c:a copy -y {newVideoPath}")
    video['video']['newPath'] = newVideoPath
    print("Done...\n")

if not dummy:

    # Inputs: every trimmed clip, then the narration, then the background track.
    mergeVideoCommand = "ffmpeg "

    for video in videoTags:
        videoPath = video['video']['newPath']
        mergeVideoCommand += f"-i {videoPath} "

    mergeVideoCommand += '-i ./assets/audios/ai_audio.mp3 -i ./assets/audios/bg_audio.mp3 -filter_complex "'

    # Scale every clip to 1080x1920 ...
    for i, video in enumerate(videoTags):
        mergeVideoCommand += f"[{i}:v]scale=1080:1920[v{i}];"

    # ... and feed the scaled streams, in order, into the concat filter.
    for i, video in enumerate(videoTags):
        mergeVideoCommand += f"[v{i}]"

    # Audio duration has to be 59s, since it is a youtube short
    audioDuration = min(audioDuration, 59)
    mergeVideoCommand += f'concat=n={len(videoTags)}:v=1:a=0[outv];[{len(videoTags)}:a]aformat=sample_fmts=fltp:sample_rates=44100:channel_layouts=stereo[vaudio];[{len(videoTags)+1}:a]aformat=sample_fmts=fltp:sample_rates=44100:channel_layouts=stereo[vbackground];[vbackground]volume=0.1[vb];[vaudio][vb]amix=inputs=2:duration=longest[a]" -map "[outv]" -map "[a]" -vsync vfr -ss 00 -to {audioDuration} -crf 24 -y assets/videos/trimmed_video.mp4'

    print("Merge Command: ", mergeVideoCommand)

    run_shell_command(mergeVideoCommand)

output_video_path = "./assets/videos/trimmed_video.mp4"
print(f"Combined video saved to: {output_video_path}")

33.144000
Trimming Video 1 to 3.0s...

Done...

Trimming Video 2 to 3.0s...

Done...

Trimming Video 3 to 3.0s...

Done...

Trimming Video 4 to 3.0s...

Done...

Trimming Video 5 to 3.0s...

Done...

Trimming Video 6 to 3.0s...

Done...

Trimming Video 7 to 3.0s...

Done...

Trimming Video 8 to 3.0s...


# DESCRIPT LIKE SUBTITLES

In [31]:
from moviepy.editor import TextClip, CompositeVideoClip, ColorClip
import numpy as np

import textwrap
from PIL import ImageFont

def soft_wrap_text(
    text: str, 
    fontsize: int, 
    letter_spacing: int, 
    font_family: str, 
    max_width: int,
):
    """
    Wrap `text` so that its estimated rendered width stays within `max_width`.

    Args:
    - text (str): Text to measure and, if needed, wrap.
    - fontsize (int): Font size passed to ImageFont.truetype.
    - letter_spacing (int): Extra width assumed between adjacent characters.
    - font_family (str): Path to the .ttf/.otf font file used for measuring
      (the original note asks for an absolute path).
    - max_width (int): Width limit, in the same units as the font measurement.

    Returns:
    - str: `text` unchanged when it is empty or narrower than `max_width`,
      otherwise `text` re-flowed by textwrap.fill with newline breaks.
    """
    # Nothing to measure or wrap; this also avoids dividing by len(text) == 0 below.
    if not text:
        return text

    # Note that font_family has to be an absolute path to your .ttf/.otf
    image_font = ImageFont.truetype(font_family, fontsize)

    # Estimate only (the original author flags the letter-spacing term as unverified).
    text_width = image_font.getlength(text) + (len(text)-1) * letter_spacing
    if text_width < max_width:
        return text

    letter_width = text_width / len(text)

    # textwrap needs an integer width: a float reaches its slicing code when a
    # long word has to be broken and raises TypeError. Keep at least 1 column.
    max_chars = max(1, int(max_width / letter_width))
    return textwrap.fill(text, width=max_chars)

def create_caption(textJSON, framesize, font = "Bevan Regular", color="white", bgcolor="yellow", stroke_color="black",stroke_width=4):
    """
    Build the text clips that render one subtitle line with per-word highlighting.

    Args:
    - textJSON (dict): A subtitle line with "start", "end" and "textcontents"
      (a list of word records carrying "word", "start" and "end").
    - framesize: Indexable pair (width, height) of the target video frame.
    - font, color, stroke_color, stroke_width: Passed to every TextClip.
    - bgcolor: Background colour of the highlighted word clips.

    Returns:
    - list: For each word, a word clip and a single-space clip, both shown for
      the whole line duration, followed by one highlighted clip per word that
      is shown only during that word's own time span, at the same position.
    """
    font_file = "/usr/share/fonts/truetype/Bevan/Bevan-Regular.ttf"

    line_start = textJSON['start']
    full_duration = textJSON['end'] - line_start

    frame_width = framesize[0]
    frame_height = framesize[1]

    # Layout origin: 1/12 of the width from the left, half-way down the frame.
    x_buffer = frame_width*1/12
    y_buffer = frame_height*1/2

    fontsize = int(frame_height * 0.035) #3.5 percent of video height
    wrap_width = frame_width * .8  # *0.8 for some padding

    word_clips = []
    placements = []
    x_pos = 0
    y_pos = 0

    for wordJSON in textJSON['textcontents']:
        label = soft_wrap_text(
            wordJSON['word'],
            font_family=font_file,
            fontsize=fontsize,
            letter_spacing=12,
            max_width=wrap_width,
        )

        word_clip = TextClip(
            " " + label + " ",
            font=font,
            fontsize=fontsize,
            color=color,
            stroke_color=stroke_color,
            stroke_width=stroke_width,
            align="center",
        ).set_start(line_start).set_duration(full_duration)
        spacer_clip = TextClip(
            " ", font=font, fontsize=fontsize, color=color
        ).set_start(line_start).set_duration(full_duration)

        word_width, word_height = word_clip.size
        space_width, space_height = spacer_clip.size

        # Word does not fit in the remaining width: continue on the next row.
        if x_pos + word_width + space_width > frame_width-2*x_buffer:
            x_pos = 0
            y_pos = y_pos + word_height + 40

        left = x_pos + x_buffer
        top = y_pos + y_buffer

        # Remember where the word went so its highlight can be overlaid later.
        placements.append({
            "x_pos": left,
            "y_pos": top,
            "width": word_width,
            "height": word_height,
            "word": wordJSON['word'],
            "start": wordJSON['start'],
            "end": wordJSON['end'],
            "duration": wordJSON['end'] - wordJSON['start'],
        })

        word_clips.append(word_clip.set_position((left, top)))
        word_clips.append(spacer_clip.set_position((x_pos + word_width + x_buffer, top)))

        x_pos = x_pos + word_width + space_width

    for placed in placements:
        label = soft_wrap_text(
            placed['word'],
            font_family=font_file,
            fontsize=fontsize,
            letter_spacing=12,
            max_width=wrap_width,
        )
        highlight_clip = TextClip(
            " " + label + " ",
            font=font,
            fontsize=fontsize,
            color=color,
            bg_color=bgcolor,
            stroke_color=stroke_color,
            stroke_width=stroke_width,
            align="center",
        ).set_start(placed['start']).set_duration(placed['duration'])
        word_clips.append(highlight_clip.set_position((placed['x_pos'], placed['y_pos'])))

    return word_clips

In [32]:
from moviepy.editor import TextClip, CompositeVideoClip, concatenate_videoclips,VideoFileClip, ColorClip

# Load the input video
# Open the merged clip and read the properties the captions depend on.
input_video = VideoFileClip(output_video_path)
frame_size = input_video.size
input_video_duration = input_video.duration

# Caption clips of every subtitle line, flattened into a single list.
all_linelevel_splits = [
    clip
    for line in linelevel_subtitles
    for clip in create_caption(line, frame_size)
]

# Alternative: render the captions over a plain colour background instead of the video.
# background_clip = ColorClip(size=frame_size, color=(255, 154, 172)).set_duration(input_video_duration)
# final_video = CompositeVideoClip([background_clip] + all_linelevel_splits)

# Overlay the captions on the input video and carry over its audio track.
# (When switching to the background variant, adjust frame_size, font size and color accordingly.)
final_video = (
    CompositeVideoClip([input_video] + all_linelevel_splits)
    .set_position("center")
    .set_audio(input_video.audio)
)

# Write the captioned result, audio included.
final_video.write_videofile(
    "output.mp4",
    fps=30,
    codec="libx264",
    audio_codec="aac",
    ffmpeg_params=['-pix_fmt', 'yuv420p'],
)


Moviepy - Building video output.mp4.
MoviePy - Writing audio in outputTEMP_MPY_wvf_snd.mp4


                                                                      

MoviePy - Done.
Moviepy - Writing video output.mp4



                                                                

Moviepy - Done !
Moviepy - video ready output.mp4


In [22]:
# run_shell_command('ffmpeg -i ./assets/videos/stock_video_1.mp4 -ss 00 -to 10 -c:a copy -y assets/videos/stock_video_1.mp4')




''

## Youtube Upload

In [None]:
# run_shell_command("ffmpeg -i " + reelPath + " -frames:v 1 -ss 3 -f image2  -y assets/images/yt_thumbnail.png")
# import json
# with open('./assets/files/yt_upload_args.json', 'w') as f:
#     json.dump({
#         "title": seoTitle.split("|")[0].strip(),
#         "description": seoDescription,
#         "tags": seoHashtags,
#         "videoFilePath": reelPath,
#         "thumbFilePath": "assets/images/yt_thumbnail.png"
#     }, f, indent=4)
# run_shell_command("node utils/youtube-upload.js run")