# Clean the text essays

In [4]:
# Load the essay index exported to ~/Downloads/pg.csv
import pandas as pd

df = pd.read_csv('~/Downloads/pg.csv')

# Number of distinct essay URLs in the index (displayed as the cell result)
len({url for url in df['essay_url']})

215

In [29]:
import os
import re
from IPython.display import Markdown, display

# Folder holding the raw essay markdown files, and the folder the cleaned copies go to
ESSAYS_DIR = './graham-essays/essays'
CLEANED_DIR = './cleaned-essays'

# Trial-run cap: only the first MAX_ENTRIES directory entries are considered.
# Set to None to clean every essay.
MAX_ENTRIES = 11


def clean_essay(text):
    """Return one essay's markdown `text` rewritten as plain reading text.

    Steps, in order:
      1. the first `# <number> <name>` heading becomes `Essay #<number>: <name>`;
      2. "rev." becomes "revised";
      3. empty-label links `[](...)` are removed;
      4. runs of " > " quoted lines are collapsed onto one line wrapped in
         quadruple double-quotes;
      5. footnote references `[n](#anchor)` are replaced inline by
         `(SIDENOTE: <footnote text>)`;
      6. the section from `**Notes**` up to `**Thanks**` is dropped;
      7. translation links and `* * *` separator lines are removed.

    Essays that lack a heading or a Notes section are passed through those
    steps unchanged rather than raising or being corrupted.
    """
    # Collect footnotes as (anchor name, number, content) triples up front,
    # before the Notes section that defines them is removed
    notes = re.findall(r'<a name=(.*?)>\[(.*?)\]</a> (.*?)\n', text)

    # Reformat title. re.search returns None when there is no heading, and a
    # one-word heading has no number/name split, so both cases are skipped.
    title_match = re.search(r'# (.*?)\n', text)
    if title_match:
        title = title_match.group(1)
        number, separator, name = title.partition(' ')
        if separator:
            text = text.replace(f'# {title}', f'Essay #{number}: {name}')

    # Replace "rev." with "revised"; the word boundary keeps words that merely
    # end in "rev." (e.g. "prev.") from being rewritten
    text = re.sub(r'\brev\.', 'revised', text)

    # Remove empty-label links (image banners, typically at the beginning)
    text = re.sub(r'\[\]\(.*?\)\s*', '', text)

    # Find all quoted paragraphs with many ">" and reformat them with quadruple quotes
    text = re.sub(
        r'( > .*\n)+',
        lambda match: '""""' + match.group(0).replace(' > ', ' ').replace('\n', ' ') + '""""\n',
        text,
    )

    # Replace note references in the text with the parenthetical notes
    for note_name, note_number, note_content in notes:
        text = text.replace(
            f'[{note_number}](#{note_name})',
            f'(SIDENOTE: {note_content.strip()})',
        )

    # Remove the notes section. str.find returns -1 when the marker is absent,
    # and -1 is a valid slice bound, so it must be checked explicitly.
    notes_start = text.find('**Notes**')
    if notes_start != -1:
        notes_end = text.find('**Thanks**', notes_start)
        if notes_end == -1:
            # No Thanks section follows: the notes run to the end of the essay
            notes_end = len(text)
        text = text[:notes_start] + text[notes_end:]

    # Remove translation links
    text = re.sub(r'\[.*?Translation.*?\]\(.*?\)\s*\n', '', text, flags=re.IGNORECASE)

    # Remove lines containing "* * *"
    text = re.sub(r'\* \* \*\n', '', text)

    return text


# Sorted so the capped subset is the same on every run (os.listdir order is arbitrary)
files = sorted(os.listdir(ESSAYS_DIR))

# open(..., 'w') does not create missing folders
os.makedirs(CLEANED_DIR, exist_ok=True)

for idx, file_name in enumerate(files):
    # Apply the cap before the extension filter so that a non-.md entry
    # sitting at the cut-off position cannot bypass it
    if MAX_ENTRIES is not None and idx >= MAX_ENTRIES:
        break

    # Skip non .md files
    if not file_name.endswith('.md'):
        continue

    # Load and clean the text
    with open(os.path.join(ESSAYS_DIR, file_name), 'r', encoding='utf-8') as essay_file:
        text = clean_essay(essay_file.read())

    # Save the updated text as <name>_cleaned.md
    cleaned_file_name = file_name[:-len('.md')] + '_cleaned.md'
    with open(os.path.join(CLEANED_DIR, cleaned_file_name), 'w', encoding='utf-8') as cleaned_file:
        cleaned_file.write(text)

    # To pretty-print in the notebook: display(Markdown(text))


# Get more PG audio from The Social Radars

In [21]:
import json

# Load the diarization JSON; the context manager closes the file handle as
# soon as parsing finishes instead of leaving it open for the kernel's lifetime
with open('social-radars-diarize.json', encoding='utf-8') as diarization_file:
    data = json.load(diarization_file)

# Extract the (start, stop) timestamps for speaker B (PG, upon inspection)
pg_segments = [
    (segment['start'], segment['stop'])
    for segment in data['segments']
    if segment['speaker'] == 'B'
]



In [18]:
from pydub import AudioSegment

def _to_milliseconds(timestamp):
    """Convert a colon-separated timestamp string to milliseconds.

    The last field is weighted by 1, the one before it by 60, the one before
    that by 60**2, and so on; the total is then multiplied by 1000.
    """
    fields = reversed(timestamp.split(":"))
    return sum(float(value) * 60 ** power for power, value in enumerate(fields)) * 1000


# Source recording that the speaker segments are cut from
audio = AudioSegment.from_mp3("./social-radars-pg.mp3")

# Accumulator for the stitched-together audio, starting out empty
combined = AudioSegment.empty()

print(pg_segments)
for start, stop in pg_segments:
    # Slice the recording by millisecond offsets and append the piece
    combined += audio[_to_milliseconds(start):_to_milliseconds(stop)]

# Write the stitched audio out as an mp3
combined.export("pg_segments.mp3", format="mp3")

# Generate the YouTube video (waveform animation with title overlay)

In [22]:
import subprocess

# Input narration and the waveform video that seewav renders from it
audio_file = './audio/founder_visa_1.mp3'
output_file = './video/waveforms/founder_visa_1.mp4'

rgb_color = '0.9960784314,0.4823529412,0.1490196078' # YC Orange!!

# check=True raises CalledProcessError on a non-zero exit status, so a failed
# render stops here instead of letting later cells pick up a stale or missing
# output file. The CompletedProcess is still the cell's displayed result.
subprocess.run(["seewav", "--color", rgb_color, audio_file, output_file], check=True)

Generating the frames...


100%|█████████████████████████████████| 8169/8169 [00:34<00:00, 237.80 frames/s]


Encoding the animation video... 


CompletedProcess(args=['seewav', '--color', '0.9960784314,0.4823529412,0.1490196078', './audio/founder_visa_1.mp3', './video/waveforms/founder_visa_1.mp4'], returncode=0)

In [17]:
# List the font names TextClip reports as available (see the output below).
# NOTE(review): TextClip is not imported in this cell; the only import of it
# visible in this file is the `from moviepy.editor import *` in another cell,
# so this fails on a fresh kernel unless that cell runs first.
TextClip.list('font')

['AvantGarde-Book',
 'AvantGarde-BookOblique',
 'AvantGarde-Demi',
 'AvantGarde-DemiOblique',
 'Bookman-Demi',
 'Bookman-DemiItalic',
 'Bookman-Light',
 'Bookman-LightItalic',
 'Courier-BoldOblique',
 'fixed',
 'Helvetica-BoldOblique',
 'Helvetica-Narrow',
 'Helvetica-Narrow-Bold',
 'Helvetica-Narrow-BoldOblique',
 'Helvetica-Narrow-Oblique',
 'NewCenturySchlbk-Bold',
 'NewCenturySchlbk-BoldItalic',
 'NewCenturySchlbk-Italic',
 'NewCenturySchlbk-Roman',
 'Palatino-BoldItalic',
 'Palatino-Roman',
 'Times-BoldItalic',
 '.Al-Bayan-PUA-Bold',
 '.Al-Bayan-PUA-Plain',
 '.Al-Nile-PUA',
 '.Al-Nile-PUA-Bold',
 '.Al-Tarikh-PUA',
 '.Apple-Color-Emoji-UI',
 '.Apple-SD-Gothic-NeoI-Bold',
 '.Apple-SD-Gothic-NeoI-ExtraBold',
 '.Apple-SD-Gothic-NeoI-Heavy',
 '.Apple-SD-Gothic-NeoI-Light',
 '.Apple-SD-Gothic-NeoI-Medium',
 '.Apple-SD-Gothic-NeoI-Regular',
 '.Apple-SD-Gothic-NeoI-SemiBold',
 '.Apple-SD-Gothic-NeoI-Thin',
 '.Apple-SD-Gothic-NeoI-UltraLight',
 '.Aqua-Kana',
 '.Aqua-Kana-Bold',
 '.Arial-He

In [79]:
from moviepy.editor import *
from moviepy.video.tools.drawing import color_gradient

# Waveform animation that the composite is built around
# (append .subclip(0, 1) to the call below for a short test render)
waveform_clip = VideoFileClip("./video/waveforms/founder_visa_1.mp4")

# Every layer lasts exactly as long as the waveform video
_duration = waveform_clip.duration

# Added to the y coordinate of every positioned layer
VERTICAL_OFFSET = 100


def _caption(message, size, font_name, top_left):
    """Build a white text layer spanning the whole video, placed at `top_left`."""
    layer = TextClip(message, fontsize=size, color='white', font=font_name)
    return layer.set_duration(_duration).set_position(top_left)


# 1920x1080 black canvas behind all other layers
background = ColorClip((1920, 1080), col=[0,0,0]).set_duration(_duration)

# Title and date on the left-hand side, attribution line to the right
title_text = _caption("The Founder Visa", 70, '.SF-NS-Rounded-Bold', (200, 700+VERTICAL_OFFSET))
year_text = _caption("April 2009", 50, '.SF-NS-Rounded-Regular-G1', (200, 780+VERTICAL_OFFSET))
subtitle_text = _caption(
    "read by an AI clone of Paul Graham",
    50,
    '.SF-NS-Rounded-Regular-G1',
    (1000, 780+VERTICAL_OFFSET),
)

# Waveform is centred horizontally at a fixed vertical position
waveform_clip = waveform_clip.set_position(('center', 250+VERTICAL_OFFSET))

# Stack the layers: background first, then waveform, then the text layers
final_clip = CompositeVideoClip(
    [background, waveform_clip, title_text, subtitle_text, year_text]
).set_duration(_duration)

# Encode the composite to disk
final_clip.write_videofile("composite.mp4", codec='libx264')


Moviepy - Building video composite.mp4.
MoviePy - Writing audio in compositeTEMP_MPY_wvf_snd.mp3


                                                                      

MoviePy - Done.
Moviepy - Writing video composite.mp4



                                                                

Moviepy - Done !
Moviepy - video ready composite.mp4




In [30]:
# Print the installed ImageMagick version via its `convert` command.
# NOTE(review): presumably checked because TextClip text rendering depends on
# ImageMagick — confirm.
!convert --version

Version: ImageMagick 7.1.1-21 Q16-HDRI aarch64 21667 https://imagemagick.org
Copyright: (C) 1999 ImageMagick Studio LLC
License: https://imagemagick.org/script/license.php
Features: Cipher DPC HDRI Modules OpenMP(5.0) 
Delegates (built-in): bzlib fontconfig freetype gslib heic jng jp2 jpeg jxl lcms lqr ltdl lzma openexr png ps raw tiff webp xml zlib
Compiler: gcc (4.2)
