In [67]:
import openai
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound
from pytube import Playlist
from collections import defaultdict
import re
from transformers import GPT2Tokenizer
from tqdm import tqdm

In [103]:
openai.api_key = ""

# 1. Get Video URLs

In [27]:
# Definition
def get_video_urls(playlist_url):
    playlist = Playlist(playlist_url)
    playlist._video_regex = re.compile(r"\"url\":\"(/watch\?v=[\w-]*)")

    return [video.watch_url for video in playlist.videos]

In [26]:
# Usage
playlist_url = "https://www.youtube.com/playlist?list=PLc38fcMFcV_s7Lf6xbeRfWYRt7-Vmi_X9"
playlist = get_video_urls(playlist_url)

In [16]:
# Verification
print("Nr. of Videos: " + str(len(playlist)))
print("First Video " + playlist[0])

Nr. of Videos: 130
First Video https://youtube.com/watch?v=9gf2MT-IOsg


# 2. Get Video Transcripts

In [75]:
def get_transcripts(video_urls):
    transcripts = []
    failed_videos = []
    for video_url in tqdm(video_urls, desc='Processing videos'):
        video_id = video_url.split("watch?v=")[1]
        transcript_text = ""

        # List of English language codes in the order they should be tried
        languages = ["en-GB", "en-US", "en"]
        for language in languages:
            try:
                transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
                for text in transcript:
                    transcript_text += ' ' + text['text']
                break
            except NoTranscriptFound:
                continue
        if transcript_text == "":
            # If no transcript could be found in any language, append a failure message to the transcripts list
            transcripts.append(f"No English transcript found for video: {video_url}")
            failed_videos.append(video_url)
        else:
            transcripts.append(transcript_text)
    
    print("Failed videos: ", failed_videos)
    return transcripts

In [37]:
# Usage
transcripts = get_transcripts(playlist)

Failed videos:  []


In [40]:
# Verification
transcripts[0]

' The Sims is a digital dollhouse,\xa0\nfilled with little computer people. And it\'s your job to keep\xa0\nthem alive, keep them happy,\xa0\xa0 and keep their house furnished\xa0\nwith ever more expensive trinkets. But micromanaging the needs and desires of\xa0\nan entire family can get pretty tedious. So the developers at Maxis soon realised that\xa0\xa0 they\'d need to give these guys\xa0\na certain amount of autonomy. Free will. The power to think for themselves, if\xa0\nthe player stops giving direct commands. Which is actually... a pretty complicated task! These characters\xa0\nneed to read as believable humans. And they need to be able to adapt to any\xa0\npossible house you build around them,\xa0\xa0 or any social situation they find themselves in. And - crucially - they can\'t be so smart that\xa0\nthe player can just sit back and do nothing. So how do you make a video game\xa0\nAI that can do all of that? Well,\xa0that\'s what I want to explore in this video. I\'m going to br

## 3. Create Prompts

In [76]:
command = """
Write me a summary in bullet points in markdown.
I don't want the summary to be just a plain wall of bullet points.
I need you to use markdown elements like italics, bold, lists, etc. to make a nice readable experience.

---
"""

In [42]:
# Definition
def prepend_cmd_to_transcripts(cmd, transcripts):
    return [cmd + transcript for transcript in transcripts]

In [43]:
# Usage
transcripts_with_cmd = prepend_cmd_to_transcripts(command, transcripts)

In [44]:
# Verification
transcripts_with_cmd[0]

'\nWrite me a summary in bullet points in markdown.\nI don\'t want the summary to be just a plain wall of bullet points.\nI need you to use markdown elements like italics, bold, lists, etc. to make a nice readable experience.\n\n---\n The Sims is a digital dollhouse,\xa0\nfilled with little computer people. And it\'s your job to keep\xa0\nthem alive, keep them happy,\xa0\xa0 and keep their house furnished\xa0\nwith ever more expensive trinkets. But micromanaging the needs and desires of\xa0\nan entire family can get pretty tedious. So the developers at Maxis soon realised that\xa0\xa0 they\'d need to give these guys\xa0\na certain amount of autonomy. Free will. The power to think for themselves, if\xa0\nthe player stops giving direct commands. Which is actually... a pretty complicated task! These characters\xa0\nneed to read as believable humans. And they need to be able to adapt to any\xa0\npossible house you build around them,\xa0\xa0 or any social situation they find themselves in. 

## 4. Token Count

In the next step we process each prompt.

We now need to make sure that every prompt is within the limit of the model we choose.

In [70]:
def count_tokens(text):
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    return len(tokenizer.encode(text, max_length=1_000_000, truncation=True))

In [71]:
def check_prompts(prompts):
    model_limits = {
        "gpt-3.5-turbo": 4096,
        "gpt-3.5-turbo-16k": 16384,
        "gpt-4": 8192,
        "gpt-4-32k": 32768
    }
    results = {}
    for model in model_limits:
        over_limit = 0
        for prompt in tqdm(prompts, desc=f'Processing for model: {model}'):
            if count_tokens(prompt) > model_limits[model]:
                over_limit += 1
        results[model] = over_limit
    return results

In [69]:
# Usage
results = check_prompts(transcripts_with_cmd)
for model, count in results.items():
    print(f"For model '{model}', there are {count} prompts over the token limit.")


Processing for model: gpt-3.5-turbo: 100%|██████████| 130/130 [00:28<00:00,  4.51it/s]
Processing for model: gpt-3.5-turbo-16k: 100%|██████████| 130/130 [00:28<00:00,  4.59it/s]
Processing for model: gpt-4: 100%|██████████| 130/130 [00:27<00:00,  4.64it/s]
Processing for model: gpt-4-32k: 100%|██████████| 130/130 [00:28<00:00,  4.57it/s]

For model 'gpt-3.5-turbo', there are 19 prompts over the token limit.
For model 'gpt-3.5-turbo-16k', there are 0 prompts over the token limit.
For model 'gpt-4', there are 1 prompts over the token limit.
For model 'gpt-4-32k', there are 0 prompts over the token limit.





## 5. Pricing Estimate

The following will output how much it would cost to input each prompt for each model. Output costs are seperate. To get a total approximate cost estimate, multiply by 2.

In [72]:
def calculate_cost(prompts):
    model_pricing = {
        "gpt-3.5-turbo": 0.0015,
        "gpt-3.5-turbo-16k": 0.003,
        "gpt-4": 0.03,
        "gpt-4-32k": 0.06
    }
    total_cost = {}

    for model in model_pricing:
        cost = 0
        for prompt in tqdm(prompts, desc=f'Calculating cost for model: {model}'):
            token_count = count_tokens(prompt)
            cost += token_count * model_pricing[model] / 1000 # cost per 1K tokens
        total_cost[model] = round(cost, 2) # rounding to 2 decimal places for readability

    return total_cost

In [74]:
costs = calculate_cost(transcripts_with_cmd)

for model, cost in costs.items():
    print(f"For model '{model}', it would cost approximately ${cost} to process all prompts.")

Calculating cost for model: gpt-3.5-turbo: 100%|██████████| 130/130 [00:28<00:00,  4.62it/s]
Calculating cost for model: gpt-3.5-turbo-16k: 100%|██████████| 130/130 [00:27<00:00,  4.71it/s]
Calculating cost for model: gpt-4: 100%|██████████| 130/130 [00:29<00:00,  4.43it/s]
Calculating cost for model: gpt-4-32k: 100%|██████████| 130/130 [00:29<00:00,  4.46it/s]

For model 'gpt-3.5-turbo', it would cost approximately $0.57 to process all prompts.
For model 'gpt-3.5-turbo-16k', it would cost approximately $1.15 to process all prompts.
For model 'gpt-4', it would cost approximately $11.49 to process all prompts.
For model 'gpt-4-32k', it would cost approximately $22.99 to process all prompts.





## 6. Ask GPT-4

In [90]:
def ask_gpt(prompts):
    responses = []
    for i, prompt in enumerate(tqdm(prompts, desc='Processing prompts')):
        try:
            response = openai.ChatCompletion.create(
                model = "gpt-4",
                messages=[
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": prompt}
                    ]
            )
            responses.append(response['choices'][0]['message']['content'])
            
            # save each response to a new text file
            with open(f"response_{i}.txt", "w") as f:
                f.write(response['choices'][0]['message']['content'])

        except Exception as e:
            print(f"Error with prompt {i}: {e}")
            responses.append(None)
            
    return responses

In [102]:
# BEWARE, EXECUTING THE FOLLOWING CELL COSTS MONEY
# Usage
# responses = ask_gpt(transcripts_with_cmd)

In [99]:
# Verification
responses[0]

"# Summary - The AI of The Sims\n\n**Architecture**\n\n- Maxis' The Sims is a digital dollhouse powered by a complex AI system that gives Sims autonomy and free will to act believably human in any environment.\n\n**Mechanics**\n\n- Sims are driven by eight 'Motives' also called 'Need Systems', these include; Hunger, Hygiene, Fun, etc. These decay at different rates depending on the activities the characters are performing.\n- Sims decision-making is driven by these 'needs'. They don't automatically know which actions fulfill these needs, instead, objects 'advertise' their capabilities to address certain needs.\n- Based on their needs, Sims weigh the advertising scores of objects and perform interactions that yield the most benefits, but they don't pick the best options every time. This helps avoid repetitiveness.\n\n**Unique Features**\n\n- The Sims' AI was inspired by SimAnt's virtual ant colony, where ants move towards attractive pheromones.\n- The game’s design maintains rational be

## 7. Save As PDF

Iterating through every text file and putting them in a pdf, rendered Markdown.

In [101]:
import os
import markdown
from weasyprint import HTML

def markdown_to_html(markdown_text):
    html = markdown.markdown(markdown_text)
    return html

def html_to_pdf(html, output_filename):
    HTML(string=html).write_pdf(output_filename)

# Get a list of all text files in the current directory
files = [f for f in os.listdir() if f.endswith('.txt')]

# Create a string to hold the final HTML
html_string = ''

# Convert each text file to HTML and append it to the string
for file in files:
    with open(file, 'r') as f:
        markdown_text = f.read()
        html_string += markdown_to_html(markdown_text)
        # Adding a page break after each file content.
        html_string += "<div style='page-break-after: always;'></div>"

# Convert the final HTML string to a PDF
html_to_pdf(html_string, 'output.pdf')