In [None]:
from dotenv import load_dotenv
import os
import json
from pathlib import Path

import requests
from openai import OpenAI

from typing import List, Any, Dict
import re

In [2]:
# Path to the credentials file: `.env` located two directory levels above the
# current working directory (the `..` segments are kept as-is, not resolved).
CREDS_PATH = Path.cwd().joinpath('..', '..', '.env')

In [3]:
# Read the .env file at CREDS_PATH, then look up the OpenAI key in the process
# environment; `api_key` is None when the variable is not set.
load_dotenv(CREDS_PATH)
api_key = os.environ.get('OPENAI_API_KEY')

In [4]:
# The client is built with no explicit arguments; presumably the SDK picks up
# OPENAI_API_KEY from the environment — TODO confirm.
openai_client = OpenAI()

### Q4: How many records are there?

In [5]:
# GitHub web page for the `_podcast` directory (stray leading space removed
# from the literal so the value is a well-formed URL).
URL = 'https://github.com/DataTalksClub/datatalksclub.github.io/tree/main/_podcast'
# GitHub contents-API endpoint for the same `_podcast` directory.
DATA = 'https://api.github.com/repos/DataTalksClub/datatalksclub.github.io/contents/_podcast'

In [None]:
class TranscriptEntry:
    """One transcript line: a start offset in seconds and the text spoken there."""

    def __init__(self, start, text):
        self.start, self.text = start, text

    def __str__(self):
        # Show at most the first 50 characters, flagging truncation with '...'.
        preview = self.text[:50]
        suffix = '...' if len(self.text) > 50 else ''
        return f"TranscriptEntry(start={self.start}s, text='{preview}{suffix}')"

    def __repr__(self):
        # The repr is deliberately the same as the string form.
        return str(self)

# Parse timestamped lines into TranscriptEntry objects.
#
# NOTE(review): `lines` is not defined in this cell; it must already exist in
# the kernel (presumably the raw markdown split into lines — confirm) for this
# cell to run on a fresh kernel.
# NOTE(review): re.search accepts a timestamp anywhere in the line and only the
# text after it is kept. If the input holds quoted timestamps (e.g. a YAML
# field such as `time: '21:22'`), the captured text would be just the closing
# quote — verify the parsed entries against the real transcript layout.
transcript_entries = []
timestamp_pattern = r'(\d{1,2}:\d{2}(?::\d{2})?)'  # Matches MM:SS or HH:MM:SS

for line in lines:
    line = line.strip()
    if not line:
        continue

    match = re.search(timestamp_pattern, line)
    if not match:
        continue

    # The pattern guarantees two or three numeric fields, so folding them in
    # base 60 covers both MM:SS and HH:MM:SS without a separate fallback branch.
    seconds = 0
    for part in match.group(1).split(':'):
        seconds = seconds * 60 + int(part)

    # Keep only the text following the timestamp; skip timestamp-only lines.
    text = line[match.end():].strip()
    if text:
        transcript_entries.append(TranscriptEntry(seconds, text))

print(f"\nParsed {len(transcript_entries)} transcript entries")
if transcript_entries:
    print("First few entries:")
    for position, entry in enumerate(transcript_entries[:5], start=1):
        # Only add an ellipsis when the preview really is truncated.
        preview = entry.text[:50] + ('...' if len(entry.text) > 50 else '')
        print(f"{position}. {entry.start}s: {preview}")

# Update docs_response to be the parsed transcript entries
docs_response = transcript_entries


=== PARSING MARKDOWN TO TRANSCRIPT FORMAT ===
Total lines in markdown: 1042

First 20 lines:
 1: ---
 2: episode: 8
 3: guests:
 4: - jekaterinakokatjuhha
 5: ids:
 6:   anchor: The-Journey-of-a-Data-Generalist-From-Bioinformatics-to-Freelancing---Jekaterina-Kokatjuhha-e1upvim
 7:   youtube: FRi0SUtxdMw
 8: image: images/podcast/s12e08-journey-of-data-generalist-from-bioinformatics-to-freelancing.jpg
 9: links:
10:   anchor: https://anchor.fm/datatalksclub/episodes/The-Journey-of-a-Data-Generalist-From-Bioinformatics-to-Freelancing---Jekaterina-Kokatjuhha-e1upvim
11:   apple: https://podcasts.apple.com/us/podcast/the-journey-of-a-data-generalist-from/id1541710331?i=1000599125044
12:   spotify: https://open.spotify.com/episode/5fB185hGlGYQmdk0kbIsPv?si=YtnsaYNzTc-fl7emZ2IjEA
13:   youtube: https://www.youtube.com/watch?v=FRi0SUtxdMw
14: season: 12
15: short: 'The Journey of a Data Generalist: From Bioinformatics to Freelancing'
16: title: 'The Journey of a Data Generalist: From Bioinfor

In [9]:
# Report how many entries docs_response holds.
f'There are {len(docs_response)} records'

'There are 128 records'

In [None]:
# Spot-check a single entry by index.
docs_response[43]

TranscriptEntry(start=1282s, text=''')

### Q5: How many chunks do you have in the result?
* chunk size 30 
* overlap 15

In [11]:
def format_timestamp(seconds: float) -> str:
    """Render a duration in seconds as ``M:SS``, or ``H:MM:SS`` from one hour up.

    Fractional seconds are truncated (not rounded) before formatting.
    """
    whole = int(seconds)
    total_minutes, secs = divmod(whole, 60)
    hours, minutes = divmod(total_minutes, 60)

    if hours <= 0:
        return f"{minutes}:{secs:02}"
    return f"{hours}:{minutes:02}:{secs:02}"

In [12]:
def make_subtitles(transcript: List[Any]) -> str:
    """Render entries as subtitle text, one ``<timestamp> <text>`` line per entry.

    Each entry must expose ``start`` and ``text``. Newlines inside an entry's
    text are replaced by spaces so every entry stays on a single line.
    """
    return '\n'.join(
        format_timestamp(item.start) + ' ' + item.text.replace('\n', ' ')
        for item in transcript
    )

In [13]:
def sliding_window(seq: List[Any], size: int, step: int) -> List[List[Any]]:
    """Split ``seq`` into windows of up to ``size`` items, advancing by ``step``.

    Windows overlap whenever ``step`` is smaller than ``size``. Iteration stops
    with the first window that reaches the end of ``seq``, so the final window
    may be shorter than ``size``. An empty ``seq`` produces an empty list.

    Raises:
        ValueError: if ``size`` or ``step`` is not positive.
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    total = len(seq)
    windows = []

    for start in range(0, total, step):
        stop = start + size
        windows.append(seq[start:stop])
        # Once a window touches the end, later ones would only repeat its tail.
        if stop >= total:
            break

    return windows

In [14]:
def join_lines(transcript: List[Any]) -> str:
    """Concatenate the text of every entry into one space-separated string.

    Newlines inside an entry's text are flattened to spaces first.
    """
    return ' '.join(item.text.replace('\n', ' ') for item in transcript)

In [15]:
def format_chunk(chunk: List[Any]) -> Dict[str, str]:
    """Summarise a chunk as formatted boundary timestamps plus its joined text.

    'start' is the formatted start time of the first entry and 'end' is the
    formatted start time of the last entry. The chunk must be non-empty:
    indexing an empty chunk raises IndexError.
    """
    first, last = chunk[0], chunk[-1]

    return {
        'start': format_timestamp(first.start),
        'end': format_timestamp(last.start),
        'text': join_lines(chunk),
    }

In [16]:
# Chunking parameters: each window holds up to CHUNK_SIZE entries and starts
# CHUNK_SIZE - CHUNK_OVERLAP entries after the previous one.
CHUNK_SIZE = 30
CHUNK_OVERLAP = 15

chunks = [
    format_chunk(window)
    for window in sliding_window(docs_response, CHUNK_SIZE, CHUNK_SIZE - CHUNK_OVERLAP)
]

f'There are a total of {len(chunks)} chunks'

'There are a total of 8 chunks'

### Q6: What's the first episode in the results for "how do I make money with AI?"