In [1]:
from collections import defaultdict
from utils import get_episode_title, get_podcast_details, get_feed_details, search_for_episode, fetch_latest_episode, download_all, call_replicate_api
from sentence_transformers import SentenceTransformer
import chromadb
from transformers import T5ForConditionalGeneration, T5Tokenizer
from pydub import AudioSegment
from tqdm import tqdm
import torch
from faster_whisper import WhisperModel
import concurrent.futures

import json
import os
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize
import time

  from tqdm.autonotebook import tqdm, trange


# utils

In [2]:
def update_session(**kwargs):
    for k, v in kwargs.items():
        session_state[k] = v

In [3]:
def choose_podcast_option(**kwargs):
    sentence_encoder = session_state['sentence_encoder']
    if sentence_encoder == "1. T5":
        encoder=SentenceTransformer("sentence-transformers/sentence-t5-base")
        update_session(sentence_encoder_selected=True, sentence_encoder=sentence_encoder, encoder=encoder)

# download

In [4]:
def download_podcast(**kwargs):
    option = kwargs['episode_option']
    if option == "1. Try a sample":
        with open('sample/episode_details.json', 'r') as f:
            episode_details = json.load(f)
    elif option == "2. Provide the iTunes URL for a specific podcast episode":
        episode_url = kwargs['episode_url']
        episode_details = download_episode_from_url(
            episode_url, 
            kwargs['sentence_encoder'],
            encoder=kwargs['encoder'] if 'encoder' in kwargs.keys() else None,
            embedding_client=kwargs['embedding_client'] if 'embedding_client' in kwargs.keys() else None,
            embedding_model=kwargs['embedding_model'] if 'embedding_model' in kwargs.keys() else None
            )
    elif option == "3. Provide a name of a podcast to explore its most recent episode":
        found_podcasts = kwargs['found_podcasts']
        selected_index = kwargs['selected_index']
        episode_details = download_episode_from_name(found_podcasts[selected_index]['collectionId'], found_podcasts[selected_index]['collectionName'])
    return episode_details

In [5]:
def download_episode_from_url(url, sentence_encoder, **kwargs):
    try:
        podcast_id, episode_title = get_episode_title(url)
        podcast_details = get_podcast_details(podcast_id)
        
        feed_details = get_feed_details(
            podcast_details['feedUrl'], 
            sentence_encoder=sentence_encoder, 
            encoder=kwargs['encoder'], 
            embedding_client=kwargs['embedding_client'], 
            embedding_model=kwargs['embedding_model']
            )
        
        episode_details = search_for_episode(
            episode_title, 
            feed_details,
            sentence_encoder=sentence_encoder, 
            encoder=kwargs['encoder'], 
            embedding_client=kwargs['embedding_client'], 
            embedding_model=kwargs['embedding_model']        
            )
        
        if episode_details['cos_sim'] < 0.95:
            raise Exception
        episode_details['filenames'] = []
        episode_details['filenames'] += download_all(episode_details['audio_urls'], podcast_details['collectionName'])
        episode_details['status'] = 'Success'
        episode_details['status_message'] = f"Podcast {podcast_details['collectionName']} downloaded successfully."
    except Exception:
        episode_details = {}
        episode_details['status'] = 'Fail'
        episode_details['status_message'] = "Failed to download the podcast. Please try again."
        return episode_details

    return episode_details

# transcribe

In [6]:
def shrink_and_split_mp3(mp3_file, n_splits):
    """
    Shrinks and splits the MP3 file into `n_splits` parts and returns the split parts in a list.

    Args:
    mp3_file (str): Path to the input MP3 file.
    n_splits (int): The number of parts to split the audio file into.

    Returns:
    list: A list of file paths for the split audio parts.
    """
    directory, filename = os.path.split(mp3_file)
    basename, ext = os.path.splitext(filename)
    
    # Load the audio file
    audio = AudioSegment.from_mp3(mp3_file)
    
    # Set desired sample rate and bit depth to control size
    audio = audio.set_frame_rate(16000)
    audio = audio.set_sample_width(16 // 8)  # 16 bits = 2 bytes    
    audio = audio.set_channels(1)  # Convert to mono
    
    # Get the total length of the audio file (in milliseconds)
    audio_length = len(audio)
    
    # Calculate the duration for each split part
    split_duration = audio_length // n_splits
    
    # List to store the paths of the split audio parts
    split_paths = []
    
    # Loop to split and export each part
    for i in range(n_splits):
        start_time = i * split_duration
        # Handle the last split to include any remaining audio
        end_time = (i + 1) * split_duration if i < n_splits - 1 else audio_length
        split_audio = audio[start_time:end_time]
        
        # Generate the file name for each split part
        split_path = os.path.join(directory, f"{basename}_part_{i+1}{ext}")
        
        # Export the split part
        split_audio.export(split_path, format="mp3")
        
        # Add the split part path to the list
        split_paths.append(split_path)
    
    return split_paths
    
def store_json_data(data, file_path):
    try:
        # Ensure the directory exists
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        
        # Write the dictionary to the JSON file
        with open(file_path, 'w') as json_file:
            json.dump(data, json_file, indent=4)
        print(f"Data successfully stored in {file_path}")
    except Exception as e:
        print(f"An error occurred while storing the data: {e}")

def chunk_text_into_sentences(text):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    return sentences

def transcribe(file_path):
  # define our torch configuration
  device = "cuda" if torch.cuda.is_available() else "cpu"
  # compute_type = "float16" if torch.cuda.is_available() else "float32"
  compute_type = "int8"

  # load model on GPU if available, else cpu
  model = WhisperModel("distil-large-v3", device=device, compute_type=compute_type)
    
  # fast whisper large 3
  final_transcription = ""
  segments, info = model.transcribe(file_path, beam_size=1)

  # Initialize the progress bar
  pbar = tqdm(total=len(AudioSegment.from_file(file_path)) / 1000.0, unit='s')

  for segment in segments:
      final_transcription += segment.text
      pbar.update(segment.end - segment.start)

  # Close the progress bar
  pbar.close()

  # Chunk the transcription text into sentences
  sentences = chunk_text_into_sentences(final_transcription)

  chunks = []
  for sentence in sentences:
      tmp_dict = {}
      tmp_dict['text'] = sentence
      chunks.append(tmp_dict)
        
  print("Audio transcription complete")
  file_dir = "/".join(file_path.split("/")[:-1])
  data = {"chunks": chunks, "text": final_transcription}
  store_json_data(data, os.path.join(file_dir, "stored_data.json"))
  return data


# def transcribe_with_replicate(replicate_client, mp3_file):
def transcribe_with_whistler(filenames, n_splits=2):
    start_time = time.time()
    mp3_files = shrink_and_split_mp3(filenames[0], n_splits)
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_splits) as executor:        
        # Submit the transcription tasks to the executor
        futures = {
            executor.submit(transcribe, mp3_file): i for i, mp3_file in enumerate(mp3_files)
        }
        # Collect the results in the original order based on submission index
        results = [None] * len(mp3_files)
        for future in concurrent.futures.as_completed(futures):
            i = futures[future]  # Index of the mp3 file part
            results[i] = future.result()  # Store result in the correct index
    chunks = []
    text = ''
    for output in results:
        chunks += output['chunks']
        text = " ".join([text, output['text']])
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Time to run the command: {execution_time} seconds")
    return chunks, text

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/minasonbol/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [7]:
def transcribe_podcast(**kwargs):
    podcast_option = kwargs['episode_option']
    episode_details = kwargs['episode_details']
    
    if podcast_option == "1. Try a sample":
        chunks, text = episode_details['chunks'], episode_details['text']
    else:
        transcription_method = kwargs['transcription_method']
        if transcription_method == "1. Replicate":
            chunks, text = transcribe_with_replicate(kwargs['transcription_client'], episode_details['filenames'])
        elif transcription_method == "2. Local transcription":
            chunks, text = transcribe_with_whistler(episode_details['filenames'], n_splits=2) # 4 splits took 800 seconds / try it in streamlit
                        
    return {'chunks': chunks, 'text': text}

# encoding

In [8]:
def create_t5_embedding(encoder, chunks):
    documents = []

    for sentence in chunks:
        temp_dict = {
            'text': sentence['text'],
            'text_vector': encoder.encode(sentence["text"]).tolist()
        }
        documents.append(temp_dict)
    
    return documents

def encode_podcast(**kwargs):
    episode_details = kwargs['episode_details']
    sentence_encoder = kwargs['sentence_encoder']

    if sentence_encoder == "1. T5":
        documents = create_t5_embedding(kwargs['encoder'], episode_details['chunks'])
    elif sentence_encoder == "2. OpenAI":
        documents = create_oa_embedding(kwargs['embedding_client'], kwargs['embedding_model'], episode_details['chunks'])

    return {'documents': documents}

# indexing

In [34]:
def create_chroma_index(client, index_name):
    existing_collections = client.list_collections()

    # Check if the collection exists
    if index_name in [collection.name for collection in existing_collections]:
        # Delete the collection if it exists
        client.delete_collection(index_name)

    # Create or get a collection with cosine distance
    index = client.get_or_create_collection(
        name=index_name,
        metadata={"hnsw:space": "cosine"}
    )

    return index

def create_index(**kwargs):
    vector_db = kwargs['vector_db']

    if vector_db == "1. Minsearch":
        return create_minsearch_index(index_name=kwargs['index_name'])
    elif vector_db == "2. Elasticsearch":
        return create_es_index(client=kwargs['vector_db_client'], index_name=kwargs['index_name'])
    elif vector_db == "3. ChromaDB":
        return create_chroma_index(client=kwargs['vector_db_client'], index_name=kwargs['index_name'])

def populate_chroma_collection(documents, collection):
    ids = [str(i) for i in range(len(documents))]
    embeddings = [doc['text_vector'] for doc in documents]
    texts = [doc['text'] for doc in documents]
    
    collection.add(
        ids=ids,
        embeddings=embeddings,
        metadatas=[{"text": text} for text in texts]
    )

def index_podcast(**kwargs):
    episode_details = kwargs['episode_details']
    vector_db = kwargs['vector_db']

    if vector_db=="1. Minsearch":
        populate_minsearch_index(episode_details['chunks'], kwargs['index'])
    elif vector_db=="2. Elasticsearch":
        populate_es_index(episode_details['documents'], kwargs['index_name'], kwargs['vector_db_client'])
    elif vector_db=="3. ChromaDB":
        populate_chroma_collection(episode_details['documents'], kwargs['index'])        


# rag

In [10]:
def search(query, **kwargs):
    vector_db = kwargs['vector_db']

    if vector_db == "1. Minsearch":
        boost = {'text':3.0}
        results = kwargs['index'].search(
            query=query,
            boost_dict=boost, 
            num_results=kwargs['num_results']
        )
    elif vector_db=="2. Elasticsearch":
        # Encode the query
        if kwargs['sentence_encoder'] == "1. T5":
            query_vector = kwargs['encoder'].encode(query).tolist()
        elif kwargs['sentence_encoder'] == "2. OpenAI":
            query_vector = kwargs['embedding_client'].embeddings.create(model=kwargs['embedding_model'], input=query).data[0].embedding[:768]

        # Construct the search query
        search_query = {
            "size": kwargs['num_results'],  # Limit the number of results
            "query": {
                "script_score": {
                    "query": {
                        "match_all": {}
                    },
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, 'text_vector') + 1.0",
                        "params": {
                            "query_vector": query_vector
                        }
                    }
                }
            }
        }
        # Execute the search query
        results = kwargs['vector_db_client'].search(index=kwargs['index_name'], body=search_query)
        results = results['hits']['hits']
    elif vector_db=="3. ChromaDB":
        # Encode the query
        if kwargs['sentence_encoder'] == "1. T5":
            query_vector = kwargs['encoder'].encode(query).tolist()
        elif kwargs['sentence_encoder'] == "2. OpenAI":
            query_vector = kwargs['embedding_client'].embeddings.create(model=kwargs['embedding_model'], input=query).data[0].embedding[:768]
        
        # Perform cosine similarity search in ChromaDB
        results = kwargs['vector_db_client'].get_or_create_collection(kwargs['index_name']).query(
            query_embeddings=[query_vector],
            n_results=kwargs['num_results'],
            include=["metadatas", "documents", "distances"]
        )
        
        # return results["metadatas"]
        results = results["metadatas"][0]


    return results

# prompt
def build_prompt(query, search_results):
    prompt_template = """
    You're a podcast chat bot. Answer the QUESTION based on the CONTEXT from the RESULTS database.
    Use only the facts from the CONTEXT when answering the QUESTION.

    QUESTION: {question}
    
    CONTEXT: 
    {context}
    """.strip()
    
    context = ""
    
    for search_result in search_results:
        doc = search_result['_source']['text'] if '_source' in search_result.keys() else search_result['text']
        context = context + f"{doc}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()

    return prompt

# generate
def llm(prompt, **kwargs):
    if kwargs['llm_option'] == "1. GPT-4o":
        outputs = kwargs['llm_client'].chat.completions.create(
            model='gpt-4o',
            messages=[{'role': 'user', 'content': prompt}]
        )
        response = outputs.choices[0].message.content
    elif kwargs['llm_option'] == "2. FLAN-5":
        inputs = kwargs['llm_tokenizer'](prompt, return_tensors="pt")
        outputs = kwargs['llm_client'].generate(
            inputs["input_ids"], 
            max_length=100,
            num_beams=5,
            do_sample=True,
            temperature=1.0,
            top_k=50,
            top_p=0.95,    
            )
        response = kwargs['llm_tokenizer'].decode(outputs[0], skip_special_tokens=True)
        
    return response

# rag 
def rag(query, **kwargs):

    search_results = search(
        query, 
        vector_db=kwargs['vector_db'], 
        sentence_encoder=kwargs['sentence_encoder'], 
        encoder=kwargs['encoder'] if 'encoder' in kwargs.keys() else None, 
        index_name=kwargs['index_name'], 
        index=kwargs['index'], 
        vector_db_client=kwargs['vector_db_client'] if 'vector_db_client' in kwargs.keys() else None, 
        embedding_model=kwargs['embedding_model'] if 'embedding_model' in kwargs.keys() else None, 
        embedding_client=kwargs['embedding_client'] if 'embedding_client' in kwargs.keys() else None,
        num_results=5
        )

    prompt = build_prompt(query, search_results)

    answer = llm(
        prompt, 
        llm_option=kwargs['llm_option'], 
        llm_client=kwargs['llm_client'], 
        llm_tokenizer=kwargs['llm_tokenizer'] if 'llm_tokenizer' in kwargs.keys() else None
        )
    
    for word in answer.split():
        yield word + " "
        time.sleep(0.05)


# Main

In [11]:
session_state = defaultdict(
    episode_option = "2. Provide the iTunes URL for a specific podcast episode",
    episode_url = 'https=//podcasts.apple.com/us/podcast/election-economics-how-can-presidents-really-impact/id1256091892?i=1000671647528',
    sentence_encoder = "1. T5",
    transcription_method = "2. Local transcription",
    vector_db = "3. ChromaDB",
    llm_option = "2. FLAN-5",
)

In [12]:
choose_podcast_option(**session_state)



In [13]:
episode_details = download_podcast(**session_state)
if episode_details['status'] == 'Success':
    print(episode_details['status_message'])
    update_session(episode_details=episode_details, podcast_downloaded=True)

Audio file downloaded successfully: ./audio/NerdWallet's_Smart_Money_Podcast/episode_0.mp3
Podcast NerdWallet's Smart Money Podcast downloaded successfully.


In [15]:
# 4 splits -> 800s
if session_state['podcast_downloaded'] and not session_state.get('interaction_started', False):
    session_state['episode_details'].update(transcribe_podcast(**session_state))
    update_session(podcast_transcribed=True)

  0%|                                                                                                                                                                  | 0/380.173 [00:00<?, ?s/s]
  0%|                                                                                                                                                                  | 0/380.173 [00:00<?, ?s/s][A

  0%|                                                                                                                                                                  | 0/380.173 [00:00<?, ?s/s][A[A


  1%|█▉                                                                                                                                                     | 4.84/380.173 [00:03<05:08,  1.22s/s][A[A[A


  1%|█▍                                                                                                                                                     | 3.48/380.173 [00:03<06:36,  1.05s/s][A

Audio transcription complete
Data successfully stored in ./audio/NerdWallet's_Smart_Money_Podcast/stored_data.json




 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏               | 336.6799999999999/380.173 [12:53<01:39,  2.30s/s][A[A


Audio transcription complete
Data successfully stored in ./audio/NerdWallet's_Smart_Money_Podcast/stored_data.json



 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 353.0200000000001/380.173 [12:54<00:59,  2.19s/s][A



 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 363.44000000000017/380.173 [12:54<00:35,  2.13s/s][A[A[A

Audio transcription complete
Data successfully stored in ./audio/NerdWallet's_Smart_Money_Podcast/stored_data.json
Audio transcription complete
Data successfully stored in ./audio/NerdWallet's_Smart_Money_Podcast/stored_data.json
Time to run the command: 831.9737937450409 seconds





In [14]:
# 2 splits - 600s
if session_state['podcast_downloaded'] and not session_state.get('interaction_started', False):
    session_state['episode_details'].update(transcribe_podcast(**session_state))
    update_session(podcast_transcribed=True)

  0%|                                                                                                                                                                  | 0/760.346 [00:00<?, ?s/s]
  0%|                                                                                                                                                                  | 0/760.346 [00:00<?, ?s/s][A
  4%|█████▍                                                                                                                                                | 27.82/760.346 [00:22<09:34,  1.27s/s][A
  4%|█████▉                                                                                                                                                 | 29.9/760.346 [00:22<08:52,  1.37s/s][A
  8%|██████████▎                                                                                                                              | 57.440000000000005/760.346 [00:45<09:20,  1.25s/s][A
 11%|████████

Audio transcription complete
Data successfully stored in ./audio/NerdWallet's_Smart_Money_Podcast/stored_data.json


 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌         | 708.2600000000002/760.346 [10:10<00:44,  1.16s/s]

Audio transcription complete
Data successfully stored in ./audio/NerdWallet's_Smart_Money_Podcast/stored_data.json
Time to run the command: 644.014075756073 seconds





In [17]:
# encode
if session_state['podcast_transcribed'] and not session_state.get('interaction_started', False):
    if session_state['vector_db'] != "1. Minsearch":
        try:
            session_state['episode_details'].update(encode_podcast(**session_state))
            update_session(podcast_embedded=True)
        except:
            print("Encoding failed.")          
            update_session(podcast_embedded=False)
    else:
        update_session(podcast_embedded=True)


In [35]:
session_state['index_name'] = "podcast-transcriber"
vector_db = session_state['vector_db']
if vector_db=="3. ChromaDB":
    # chroma_client = chromadb.PersistentClient(path="./chroma_db")
    chroma_client = chromadb.EphemeralClient()
    update_session(vector_db=vector_db, vector_db_client=chroma_client)
    update_session(index=create_index(**session_state))
    update_session(vector_db_selected=True, index_created=True)
    print(f"Index {session_state['vector_db_client'].list_collections()[0].name} was created successfully.")

Index sample_index was created successfully.


In [24]:
client.get_or_create_collection(
    name='sample_index',
    metadata={"hnsw:space": "cosine"}
)

Collection(id=1682247b-21d4-4008-bbf8-ddd49efa7ab0, name=sample_index)

In [36]:
# populate index
if session_state['podcast_embedded'] and not session_state.get('interaction_started', False):
    index_podcast(**session_state)
    update_session(podcast_indexed=True)

In [38]:
# llm
llm_option = "2. FLAN-5"
llm_client = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")
llm_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
llm_option_selected=True
update_session(llm_option=llm_option, llm_client=llm_client, llm_tokenizer=llm_tokenizer, llm_option_selected=llm_option_selected)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [41]:
# interact
query = "Does the president really have power over the economy?"
response = rag(
    query, 
    **session_state
    )
print(" ".join(list(response)))

the  short  version  is  that  although  the  president  plays  a  role  in  staffing  these  positions  and  these  positions  impact  the  economy,  it  doesn't  really  equate  to  presidential  power  over  the  economy 


In [67]:
if session_state['podcast_downloaded'] and not session_state.get('interaction_started', False):
    session_state['episode_details'].update(transcribe_podcast(**session_state))
    update_session(podcast_transcribed=True)

 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 775.9200000000006/827.873 [05:22<00:21,  2.41s/s]

Audio transcription complete
./audio/NerdWallet's_Smart_Money_Podcast
 Today's episode is supported by Money Girl. Let's be honest. Nothing kills your productivity faster than worrying about debt, wondering if you are managing your money right, or getting that cold sweat when someone mentions retirement planning. Yikes. But don't worry, there is a fix. If you're always looking for ideas to help you stop stressing about your finances, then it's time to check out a podcast called Money Girl. Money Girl host Laura Adams has been helping listeners tackle their finances for over 15 years. In that time, she's seen it all. Recessions, pandemics, and everything in between. So you can bet there's no money questions she hasn't heard or solved. Whether you're trying to budget, save, or just figure out how to stop impulse buying things at 3 a.m., Laura's got you covered. Each episode is under 20 minutes, perfect for when you need a quick dose of financial wisdom between meetings or while waiting f




TypeError: 'NoneType' object is not iterable

In [96]:
if session_state['podcast_downloaded'] and not session_state.get('interaction_started', False):
    session_state['episode_details'].update(transcribe_podcast(**session_state))
    update_session(podcast_transcribed=True)

  0%|                                                                                                                                                                  | 0/413.937 [00:00<?, ?s/s]
  1%|█▋                                                                                                                                                     | 4.66/413.937 [00:02<03:39,  1.87s/s][A
  7%|██████████▏                                                                                                                              | 30.740000000000006/413.937 [00:19<04:05,  1.56s/s][A
 15%|███████████████████▉                                                                                                                     | 60.320000000000014/413.937 [00:37<03:36,  1.63s/s][A
 21%|████████████████████████████▉                                                                                                             | 86.78000000000002/413.937 [00:56<03:35,  1.52s/s][A
 28%|████████

Audio transcription complete
 Today's episode is supported by Money Girl. Let's be honest. Nothing kills your productivity faster than worrying about debt, wondering if you are managing your money right, or getting that cold sweat when someone mentions retirement planning. Yikes. But don't worry, there is a fix. If you're always looking for ideas to help you stop stressing about your finances, then it's time to check out a podcast called Money Girl. Money Girl host Laura Adams has been helping listeners tackle their finances for over 15 years. In that time, she's seen it all, recessions, pandemics, and everything in between. So you can bet there's no money questions she hasn't heard or solved. Whether you're trying to budget, save, or just figure out how to stop impulsifying things at 3 a.m., Laura's got you covered. Each episode is under 20 minutes, perfect for when you need a quick dose of financial wisdom between meetings or while waiting for your coffee. And while we answer your mo

 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 399.1/413.937 [04:49<00:10,  1.38s/s]

Audio transcription complete
 but we do have a house together, and there are a lot of expenses that come with that. There's property tax. There's water, electric, all sorts of things that pop up. And what that ends up meaning is that while I have some bills in my name and he has some bills in his name, we just end up Venmoing each other a lot. And it's maybe not the most graceful thing when I think, okay, I'm sending him 50 bucks for this. He's sending me 60 bucks for that. It seems a little silly, but it just works out for us, and that's where we are right now, and that's fine. Yeah, absolutely. There are a lot of different ways to handle the mostly separate setup. You can each contribute a certain percentage of income or a certain dollar amount. But again, it's totally fine to keep things completely separate if that's what works best for you. If you do have joint accounts or mostly joint accounts, there usually needs to be some sort of what we call an ask limit. In other words, if yo




TypeError: 'NoneType' object is not iterable

In [103]:
# session_state['text']

In [78]:
session_state.keys()

dict_keys(['episode_option', 'episode_url', 'sentence_encoder', 'transcription_method', 'vector_db', 'llm_option', 'sentence_encoder_selected', 'encoder', 'episode_details', 'podcast_downloaded', 'transcription_result'])

In [70]:
with open("./audio/NerdWallet's_Smart_Money_Podcast/stored_data.json") as f:
    transcript = json.load(f)

In [72]:
transcript['transcription'][:100]

" Today's episode is supported by Money Girl. Let's be honest. Nothing kills your productivity faster"

In [88]:
chunks = []
for sentence in sentences:
    tmp_dict = {}
    tmp_dict['text'] = sentence
    chunks.append(tmp_dict)

In [89]:
chunks

[{'text': " Today's episode is supported by Money Girl."},
 {'text': "Let's be honest."},
 {'text': 'Nothing kills your productivity faster'}]

In [85]:
session_state['transcription_result'].keys()

dict_keys(["NerdWallet's_Smart_Money_Podcast_0"])