In [75]:
import os
import sys
from tqdm import tqdm
from dotenv import load_dotenv
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi

from openai import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [29]:
load_dotenv()

api_key = os.getenv("GCP_API_KEY")
client = OpenAI(api_key = os.getenv("OPENAI_API_KEY"))

channel_name = "@hubermanlab"

In [13]:
class YouTubeHelper:
    
    def __init__(self, api_key):
        self.api_key = api_key
        self.youtube = build('youtube', 'v3', developerKey=api_key)

    def get_channel_id(self, channel_name):
        request = self.youtube.search().list(
            q=channel_name,
            type='channel',
            part='id',
            maxResults=1
        )
        response = request.execute()
        if response['items']:
            return response['items'][0]['id']['channelId']
        else:
            return None

    def get_video_ids(self, channel_id):
        channel_response = self.youtube.channels().list(
            id=channel_id,
            part='contentDetails'
        ).execute()
        upload_playlist_id = channel_response['items'][0]['contentDetails']['relatedPlaylists']['uploads']
        videos = []
        next_page_token = None
        while True:
            playlist_response = self.youtube.playlistItems().list(
                playlistId=upload_playlist_id,
                part='contentDetails',
                maxResults=50,
                pageToken=next_page_token
            ).execute()
            videos += [item['contentDetails']['videoId'] for item in playlist_response['items']]
            next_page_token = playlist_response.get('nextPageToken')
            if next_page_token is None:
                break
        return videos

    def get_transcript(self, video_id):
        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
            output = ''
            for x in transcript:
                sentence = x['text']
                output += f'{sentence}'
            return output, transcript
        except:
            return None

In [30]:
class TextProcessor:
    
    @staticmethod
    def query_chatgpt(client, prompt):
        try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}]
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            print(f"An error occurred: {e}")
            return None
    
    @staticmethod
    def split_text(chunk_size, text):
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size = chunk_size,
            chunk_overlap = 0
        )
        documents = text_splitter.create_documents([text])
        return documents

In [61]:
class Prompts:
    @staticmethod
    def transcript_fix(content):
        prompt = f"""----- INSTRUCTION -----
In the following, I will provide you a part of a \
podcast-transcript. This podcast covers health-related topics. \
However, the transcript is badly formatted. The following issues can occur: \
(1) Punctuation is missing completely (2) whitespace between words is missing. \
(3) large and lower case is oftentimes off. \
In the following I will give you chunks of this transcript that you have to correct. \
It's possible that the sentence continues after the transcript-snippet ends, \
insert a ' ... ' instead of punctuation. Here's the transcript:
----- TRANSCRIPT ----
{content}
----- CORRECTED TRANSCRIPT ----"""
        return prompt
    
    @staticmethod
    def fix_faulty_splits(content):
        prompt = f"""----- INSTRUCTION -----
In the following, I will provide you a part of a \
podcast-transcript. This podcast covers health-related topics. \
However, the transcript is badly formatted. The following issue can occur: \
When a part of the transcript ends with '...' \
you have to check whether the punctuation makes sense here. If not, delete it along \
with the '...'-indicator. Here's the transcript:
----- TRANSCRIPT ----
{content}
----- CORRECTED TRANSCRIPT ----"""
        return prompt

In [45]:
yt_helper = YouTubeHelper(api_key)

In [46]:
channel_id = yt_helper.get_channel_id(channel_name)
video_ids = yt_helper.get_video_ids(channel_id)

In [72]:
output, transcript = yt_helper.get_transcript("CQlTmOFM4Qs")

In [76]:
documents_stage_1 = TextProcessor.split_text(400, output)

In [None]:
stage_1_text = ""

for doc in tqdm(documents_stage_1):
    original = doc.page_content
    prompt = Prompts.transcript_fix(original)
    processed = TextProcessor.query_chatgpt(client, prompt)
    
    stage_1_text += processed + ".." if processed[-1] == "." and processed[-2] != "." else processed    

100%|██████████| 240/240 [15:26<00:00,  3.86s/it]


In [None]:
with open("hl_podcast_wmemory_stage_1.txt", "w") as f:
    f.write(stage_1_text)

In [83]:
documents_stage_2 = TextProcessor.split_text(750, stage_1_text)

In [84]:
stage_2_text = ""

for doc in tqdm(documents_stage_2):
    original = doc.page_content
    prompt = Prompts.fix_faulty_splits(original)
    processed = TextProcessor.query_chatgpt(client, prompt)
    processed = processed.replace("\n", "")
    
    stage_2_text += processed  

100%|██████████| 139/139 [14:51<00:00,  6.42s/it]


In [85]:
print(stage_2_text[:6000])

Welcome to the Huberman Lab Podcast, where we discuss science and science-based tools for everyday life. [Music] I'm Andrew Huberman, and I'm a professor of neurobiology and ophthalmology at Stanford School of Medicine. Today, we are discussing working memory. Working memory is a special category of memory in which we are able to hold small amounts of information in our mind for short periods of time. It is also very closely related to attention. So for any of you that are interested in how to develop better focus and attention, understanding what working memory is and some of the things that you can do to improve your working memory can be very beneficial. Today, I'm going to talk about what working memory is, including some of the things that you can do to improve it.Underlying biology, although I promise irrespective of whether or not you know any biology or are you an expert in biology, I'll make the conversation accessible to you. In addition, I will talk about tools to improve wo

In [86]:
with open("hl_podcast_wmemory_stage_2.txt", "w") as f:
    f.write(stage_2_text)

In [87]:
!gsutil cp hl_podcast_wmemory_stage_2.txt gs://legalm-staging/huberman_lab/

Copying file://hl_podcast_wmemory_stage_2.txt [Content-Type=text/plain]...
/ [1 files][ 96.8 KiB/ 96.8 KiB]                                                
Operation completed over 1 objects/96.8 KiB.                                     


In [88]:
!gsutil cp hl_podcast_wmemory_stage_1.txt gs://legalm-staging/huberman_lab/

Copying file://hl_podcast_wmemory_stage_1.txt [Content-Type=text/plain]...
/ [1 files][ 97.7 KiB/ 97.7 KiB]                                                
Operation completed over 1 objects/97.7 KiB.                                     
