## YouTube video search PoC

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import json
import pytube
import os
from pathlib import Path
from moviepy.editor import *

from tqdm.auto import tqdm  # !pip install tqdm

import whisper
import torch  # pytorch install steps: pytorch.org

import pinecone # pip install --upgrade pinecone-client
from sentence_transformers import SentenceTransformer

## Inputs

In [2]:
# Channels to index. Each entry is inserted verbatim into
# https://www.youtube.com/<entry>/videos and is also used as the local
# directory name for that channel's downloads.
channel_names = ['@FinancialTimes']
#channel_names = ['UCfkpRl1dykCRyyspOB0hffQ', 'ProjectFarm', 'PaintballOO7', 'StrangeLoopConf', 'FarmCraft101', 'standupmaths', 'NDCConferences', 'TheRoyalInstitution', 'bobreynolds', 'abadcliche', 'flutterdev', 'UCCkL8DHm5KSE5MiDWepOHUw', 'MIT', 'msadaghd', 'HooverInstitution', 'Kurzgesagt', 'UCm9K6rby98W8JigLoZOh6FQ', 'Luapper', 'shanselman', 'destinws2', 'steventhebrave', '1veritasium', 'TheoBrowne1017']

# The Pinecone API key is a credential and must not be stored in the notebook.
# Export PINECONE_API_KEY before starting the kernel (keys are managed at
# https://app.pinecone.io/projects). Any key that was previously committed in
# this file should be treated as leaked and revoked.
pinecone_api_key = os.environ.get('PINECONE_API_KEY', '')
if not pinecone_api_key:
    print('PINECONE_API_KEY is not set; pinecone.init() will not be able to authenticate.')

youtube_dl_options = {
    'skip_download': True,
    'ignoreerrors': True
}

## Helper functions

In [14]:
def extract_video_ids(text):
    """Collect every video ID that follows ``watch?v=`` in ``text``.

    An ID is the run of word characters and hyphens immediately after
    ``watch?v=``. IDs are returned in order of appearance; repeated IDs are
    kept, and an empty list is returned when nothing matches.
    """
    id_after_watch = re.compile(r'(?<=watch\?v=)[\w-]+')
    return id_after_watch.findall(text)

def extract_video_titles(html):
    """Pull sanitised video titles out of a channel page's source text.

    Each ``"title":{"runs":[{"text":"..."`` fragment found in ``html``
    contributes one title. The characters ``: . ? ' \\ | ,`` are stripped from
    the title, and the character sequence ``xe2x80x99`` (what remains of an
    escaped right single quote once the backslashes are gone) is turned back
    into ``’``. Titles are returned in order of appearance.
    """
    title_run = re.compile(r'\"title":{"runs":\[{"text":".*?\"')
    text_field = re.compile(r'\"text":".*?\"')
    # Removed one at a time, in this order, before the quote is restored.
    stripped_chars = (":", ".", "?", "'", "\\", "|", ",")

    titles = []
    for run in title_run.finditer(html):
        for field in text_field.findall(run.group(0)):
            # Drop the leading `"text":"` marker and the closing quote.
            title = field.replace("\"text\":\"", "")[:-1]
            for char in stripped_chars:
                title = title.replace(char, "")
            titles.append(title.replace("xe2x80x99", "’"))
    return titles

def download_youtube_video(url, filepath):
    """Download the stream picked by ``get_highest_resolution()`` for ``url``.

    ``filepath`` is passed straight through as the first argument of the
    stream's ``download`` call. Nothing is returned.
    """
    best_stream = pytube.YouTube(url).streams.get_highest_resolution()
    best_stream.download(filepath)
    
def download_channel_videos(channel_name):
    """Download every video listed on a channel's ``/videos`` page.

    The page source is scraped for video IDs and titles, and each video is
    downloaded at the lowest available resolution into a directory named
    after ``channel_name`` (created if it does not exist).

    Args:
        channel_name: Path segment identifying the channel; it is inserted
            into ``https://www.youtube.com/<channel_name>/videos`` and also
            used as the download directory.

    Returns:
        dict mapping each scraped (sanitised) title to its video ID. Empty
        when the channel page could not be fetched.
    """
    video_index = {}
    # Create a directory to store the videos (no error if it already exists).
    os.makedirs(channel_name, exist_ok=True)

    # Get the HTML content of the channel page.
    url = f"https://www.youtube.com/{channel_name}/videos"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"{channel_name} does not exist or has no videos.")
        # Do not try to scrape an error page.
        return video_index

    # str() of the raw bytes keeps escape sequences as literal text, which is
    # what extract_video_titles() expects.
    page_source = str(response.content)
    results = extract_video_ids(page_source)
    titles = extract_video_titles(page_source)
    if len(titles) != len(results):
        print(f"warning: found {len(titles)} titles but {len(results)} video ids "
              f"for {channel_name}; the title/id pairing may be misaligned.")
    # Pair by position; zip() stops at the shorter list instead of raising
    # IndexError when fewer titles than IDs were scraped.
    video_index = dict(zip(titles, results))

    failed = 0
    for video_id in results:
        video_url = f"https://www.youtube.com/watch?v={video_id}"
        print(video_url, channel_name)
        try:
            video = pytube.YouTube(video_url).streams.get_lowest_resolution()
            video.download(channel_name)
        except Exception as exc:
            # One unavailable video must not abort the whole channel or lose
            # the index that callers rely on.
            failed += 1
            print(f"could not download {video_url}: {exc}")

    if failed:
        print(f"{failed} of {len(results)} videos from {channel_name} could not be downloaded.")
    else:
        print(f"All videos from {channel_name} have been downloaded.")
    return video_index

def get_mp4_files(directory):
    """List the ``.mp4`` entries directly inside ``directory``.

    Each returned path is ``directory`` joined with the entry name, in the
    order ``os.listdir`` yields them. Sub-directories are not searched.
    """
    return [
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.endswith(".mp4")
    ]

def save_audio_from_videoIDs(directory):
    """Write an MP3 next to every MP4 in ``directory`` that lacks one.

    The MP3 path is the MP4 path with ``.mp4`` replaced by ``.mp3``. Files
    whose MP3 already exists are reported and skipped.
    """
    for mp4_path in tqdm(get_mp4_files(directory)):
        mp3_path = mp4_path.replace('.mp4', '.mp3')
        if Path(mp3_path).exists():
            print('skipping file ', mp3_path)
        else:
            MP4ToMP3(mp4_path, mp3_path)

def MP4ToMP3(mp4, mp3):
    """Extract the audio track of the video at ``mp4`` and write it to ``mp3``.

    Args:
        mp4: Path of the source video file.
        mp3: Path of the audio file to write.

    Any exception raised while writing propagates to the caller, but the
    clip is always closed first so its resources are not leaked.
    """
    clip = AudioFileClip(mp4)
    try:
        clip.write_audiofile(mp3)
    finally:
        clip.close()
    
def get_text_from_data(start, end, data):
    """Concatenate the ``'text'`` of ``data[start]`` .. ``data[end - 1]``.

    Every piece is followed by a single space, so a non-empty result ends
    with a trailing space. Returns ``""`` when ``start >= end``.
    """
    return ''.join(data[pos]['text'] + ' ' for pos in range(start, end))

## Use Whisper to transcribe the audio extracted from the videos

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# Speech-to-text model: Whisper "small", moved onto the selected device.
model = whisper.load_model("small").to(device)

# Sentence-embedding model used to encode transcript text; `dim` is the
# embedding dimension it reports and is used as the index dimension below.
model_id = "multi-qa-mpnet-base-dot-v1"
model_embed = SentenceTransformer(model_id)
dim = model_embed.get_sentence_embedding_dimension()

# Connect to Pinecone and create the "audio" index on first use.
index_id = "audio"
pinecone.init(
    api_key=pinecone_api_key,  # app.pinecone.io
    environment="us-west1-gcp"
)
if index_id not in pinecone.list_indexes():
    # Positional arguments are the index name and the vector dimension.
    pinecone.create_index(
        index_id,
        dim,
        metric="dotproduct"
    )

# Handle to the index. The stats call below is not the last expression of the
# cell, so its return value is not displayed.
index = pinecone.Index(index_id)
index.describe_index_stats()

# we encode and insert in batches of 64
batch_size = 64

cuda


## Write embeddings to Pinecone as each file is processed

In [None]:
# Pipeline, per channel: download videos -> extract MP3 audio -> transcribe
# each MP3 with Whisper -> merge segments into overlapping text chunks ->
# embed the chunks and upsert them into Pinecone -> delete the MP3.

# The index handle does not change between channels or files; open it once.
index = pinecone.Index(index_id)

for channel_name in channel_names:
    # Start from an empty title -> video ID map so a failed download can never
    # leave this name undefined or pointing at a previous channel's map.
    video_index = {}
    try:
        video_index = download_channel_videos(channel_name)
        save_audio_from_videoIDs(channel_name)
    except Exception as exc:
        # Best effort: report the failure and still process whatever MP3
        # files already exist on disk.
        print("error getting video", exc)

    # get list of MP3 audio files
    paths = [str(x) for x in Path(channel_name).glob('*.mp3')]

    transcriptions = []
    for path in tqdm(paths):
        # The MP3 file name without its extension; it is used as the key into
        # video_index. Path.stem is independent of the path separator.
        _id = Path(path).stem

        # Without a video ID no usable URL can be stored, so skip the file
        # (and keep it on disk) before spending time on transcription.
        video_id = video_index.get(_id)
        if video_id is None:
            print('no video id found, skipping file ', path)
            continue

        # transcribe to get speech-to-text data (once per file)
        print(path)
        try:
            result = model.transcribe(path)
        except Exception as exc:
            print('error, removing file ', path, exc)
            os.remove(path)
            continue
        segments = result['segments']
        # add results to data list
        transcriptions.extend(segments)

        # set window (length of text chunk) and stride
        window = 1
        stride = 1  # smaller stride creates overlap

        data = []
        # NOTE(review): opened with "w" for every file, so this file only ever
        # holds the rows of the most recently processed audio file — confirm
        # that is intended.
        with open("transcription.jsonl", "w", encoding="utf-8") as fp:
            for j in range(0, len(segments), stride):
                # j_end is exclusive, so the final segment is included and the
                # end time comes from the last segment actually in the row.
                j_end = min(j + window, len(segments))
                text = ''.join(x["text"] for x in segments[j:j_end])
                # NOTE(review): the id embeds the file-name title, which may
                # contain non-ASCII characters — confirm Pinecone accepts them.
                meta = {
                    "id": f"{_id}-t{segments[j]['start']}",
                    "text": text.strip(),
                    "start": segments[j]['start'],
                    "end": segments[j_end - 1]['end'],
                    "url": f"https://youtu.be/{video_id}",
                    "name": _id,
                    "title": _id,
                }
                data.append(meta)
                json.dump(meta, fp)
                fp.write('\n')

        new_data = []

        window = 6  # number of sentences to combine
        stride = 3  # number of sentences to 'stride' over, used to create overlap

        # `data` only holds rows of the current file, so a chunk can never
        # span two videos and no cross-video check is needed.
        for i in range(0, len(data), stride):
            # i_end is exclusive; get_text_from_data() joins rows i..i_end-1.
            i_end = min(len(data), i + window)
            new_data.append({
                'start': data[i]['start'],
                'end': data[i_end - 1]['end'],
                'text': get_text_from_data(i, i_end, data),
                'id': data[i]['id'],
                # Deep link to the chunk's start time (whole seconds).
                'url': data[i]['url'] + '?t=' + str(int(data[i]['start'])),
                "name": data[i]['name'],
                "title": data[i]['title'],
            })

        # loop through in batches of `batch_size`
        for j in tqdm(range(0, len(new_data), batch_size)):
            # Slicing clamps at the end of the list, so the last (possibly
            # partial) batch is included in full.
            batch = new_data[j:j + batch_size]
            # extract the metadata like text, start/end positions, etc
            batch_meta = [{
                "text": row["text"],
                "start": row["start"],
                "end": row["end"],
                "url": row["url"],
                "name": row["name"],
                "title": row["title"]
            } for row in batch]
            # extract only text to be encoded by embedding model
            batch_text = [row['text'] for row in batch]
            # create the embedding vectors
            batch_embeds = model_embed.encode(batch_text).tolist()
            # extract IDs to be attached to each embedding and metadata
            batch_ids = [row['id'] for row in batch]
            # 'upsert' (eg insert) IDs, embeddings, and metadata to index
            try:
                index.upsert(list(zip(batch_ids, batch_embeds, batch_meta)))
            except Exception as exc:
                # Keep going with the remaining batches, but make the loss visible.
                print('error upserting batch starting at ', j, ' for ', path, exc)
                continue
        print('removing file ',path)
        os.remove(path)

https://www.youtube.com/watch?v=RETO42jnuJM @FinancialTimes
https://www.youtube.com/watch?v=yGGzimG8VMQ @FinancialTimes
https://www.youtube.com/watch?v=6cxalUYGU8g @FinancialTimes
https://www.youtube.com/watch?v=TXB7HIkQtwA @FinancialTimes
https://www.youtube.com/watch?v=Rj8qVwe_g1Y @FinancialTimes
https://www.youtube.com/watch?v=dKEY71Dpo1w @FinancialTimes
https://www.youtube.com/watch?v=hdPpsHqONEg @FinancialTimes
https://www.youtube.com/watch?v=xWIdu0B1Arc @FinancialTimes
https://www.youtube.com/watch?v=95jabRNJ6Ng @FinancialTimes
https://www.youtube.com/watch?v=6UY2HOpuTlk @FinancialTimes
https://www.youtube.com/watch?v=hW_liASsmrc @FinancialTimes
https://www.youtube.com/watch?v=DKYKT4pvYYA @FinancialTimes
https://www.youtube.com/watch?v=EQw1lajDICY @FinancialTimes
https://www.youtube.com/watch?v=jvFBl9Lv_YA @FinancialTimes
https://www.youtube.com/watch?v=uhS4n2EO7q0 @FinancialTimes
https://www.youtube.com/watch?v=nrk-8KfLY98 @FinancialTimes
https://www.youtube.com/watch?v=f-aHbL2m

  0%|          | 0/30 [00:00<?, ?it/s]


[A                                                                                                                     
chunk:  73%|████████████████████████████████████████████                | 4844/6595 [01:17<00:00, 2709.07it/s, now=None][A

skipping file  @FinancialTimes/Capture whos looking after the children  FT Film Standpoint.mp3
skipping file  @FinancialTimes/First look at celebrity chef Marcus Samuelssons new NYC restaurant  FT Globetrotter.mp3
MoviePy - Writing audio in @FinancialTimes/How the buyout of Morrisons turned into a costly blunder  FT Due Diligence.mp3




chunk:   0%|                                                                         | 0/3246 [00:00<?, ?it/s, now=None][A[A

chunk:   5%|██▉                                                          | 157/3246 [00:00<00:01, 1568.83it/s, now=None][A[A

chunk:  10%|██████▏                                                      | 327/3246 [00:00<00:01, 1636.15it/s, now=None][A[A

chunk:  15%|█████████▎                                                   | 495/3246 [00:00<00:01, 1655.37it/s, now=None][A[A

chunk:  20%|████████████▍                                                | 661/3246 [00:00<00:01, 1612.53it/s, now=None][A[A

chunk:  26%|███████████████▌                                             | 831/3246 [00:00<00:01, 1643.35it/s, now=None][A[A

chunk:  31%|██████████████████▋                                          | 996/3246 [00:00<00:01, 1615.09it/s, now=None][A[A

chunk:  36%|█████████████████████▍                                      | 1158/3246 [00:00<00:01, 1572

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/US bank branch closures widen social inequality  FT Film.mp3




chunk:   0%|                                                                        | 0/42889 [00:00<?, ?it/s, now=None][A[A

chunk:   0%|▎                                                           | 179/42889 [00:00<00:24, 1779.16it/s, now=None][A[A

chunk:   1%|▌                                                           | 377/42889 [00:00<00:22, 1886.31it/s, now=None][A[A

chunk:   1%|▊                                                           | 566/42889 [00:00<00:24, 1745.48it/s, now=None][A[A

chunk:   2%|█                                                           | 742/42889 [00:00<00:24, 1719.92it/s, now=None][A[A

chunk:   2%|█▎                                                          | 915/42889 [00:00<00:24, 1716.97it/s, now=None][A[A

chunk:   3%|█▍                                                         | 1088/42889 [00:00<00:24, 1699.09it/s, now=None][A[A

chunk:   3%|█▋                                                         | 1272/42889 [00:00<00:23, 1743

chunk:  29%|████████████████▌                                         | 12236/42889 [00:06<00:17, 1748.82it/s, now=None][A[A

chunk:  29%|████████████████▊                                         | 12412/42889 [00:06<00:17, 1736.02it/s, now=None][A[A

chunk:  29%|█████████████████                                         | 12592/42889 [00:06<00:17, 1748.68it/s, now=None][A[A

chunk:  30%|█████████████████▎                                        | 12767/42889 [00:06<00:17, 1722.52it/s, now=None][A[A

chunk:  30%|█████████████████▌                                        | 12946/42889 [00:07<00:17, 1741.82it/s, now=None][A[A

chunk:  31%|█████████████████▋                                        | 13121/42889 [00:07<00:17, 1716.19it/s, now=None][A[A

chunk:  31%|█████████████████▉                                        | 13293/42889 [00:07<00:17, 1716.43it/s, now=None][A[A

chunk:  31%|██████████████████▎                                       | 13497/42889 [00:07<00:16, 1810.7

chunk:  87%|██████████████████████████████████████████████████▍       | 37325/42889 [00:19<00:03, 1780.22it/s, now=None][A[A

chunk:  87%|██████████████████████████████████████████████████▋       | 37516/42889 [00:19<00:02, 1817.56it/s, now=None][A[A

chunk:  88%|██████████████████████████████████████████████████▉       | 37701/42889 [00:19<00:02, 1826.78it/s, now=None][A[A

chunk:  88%|███████████████████████████████████████████████████▎      | 37901/42889 [00:20<00:02, 1877.62it/s, now=None][A[A

chunk:  89%|███████████████████████████████████████████████████▌      | 38094/42889 [00:20<00:02, 1892.56it/s, now=None][A[A

chunk:  89%|███████████████████████████████████████████████████▊      | 38291/42889 [00:20<00:02, 1914.99it/s, now=None][A[A

chunk:  90%|████████████████████████████████████████████████████      | 38485/42889 [00:20<00:02, 1916.87it/s, now=None][A[A

chunk:  90%|████████████████████████████████████████████████████▎     | 38677/42889 [00:20<00:02, 1916.6

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/Celebrating 130 years of FT pink.mp3




chunk:   0%|                                                                         | 0/2409 [00:00<?, ?it/s, now=None][A[A

chunk:   8%|████▉                                                        | 193/2409 [00:00<00:01, 1924.69it/s, now=None][A[A

chunk:  16%|█████████▊                                                   | 388/2409 [00:00<00:01, 1937.73it/s, now=None][A[A

chunk:  24%|██████████████▊                                              | 586/2409 [00:00<00:00, 1954.81it/s, now=None][A[A

chunk:  32%|███████████████████▊                                         | 782/2409 [00:00<00:00, 1946.31it/s, now=None][A[A

chunk:  41%|████████████████████████▊                                    | 980/2409 [00:00<00:00, 1953.37it/s, now=None][A[A

chunk:  49%|█████████████████████████████▎                              | 1176/2409 [00:00<00:00, 1904.89it/s, now=None][A[A

chunk:  57%|██████████████████████████████████                          | 1367/2409 [00:00<00:00, 1890

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/Is stakeholder capitalism building a better world or just empowering the rich  FT Moral Money.mp3




chunk:   0%|                                                                         | 0/3827 [00:00<?, ?it/s, now=None][A[A

chunk:   4%|██▎                                                          | 142/3827 [00:00<00:02, 1419.29it/s, now=None][A[A

chunk:   8%|████▉                                                        | 306/3827 [00:00<00:02, 1515.61it/s, now=None][A[A

chunk:  13%|███████▊                                                     | 488/3827 [00:00<00:02, 1644.86it/s, now=None][A[A

chunk:  17%|██████████▍                                                  | 653/3827 [00:00<00:01, 1602.87it/s, now=None][A[A

chunk:  21%|████████████▉                                                | 814/3827 [00:00<00:01, 1564.64it/s, now=None][A[A

chunk:  26%|███████████████▊                                             | 991/3827 [00:00<00:01, 1628.56it/s, now=None][A[A

chunk:  30%|██████████████████▏                                         | 1162/3827 [00:00<00:01, 1647

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/A bespoke approach to digital learning and development  FT Tech.mp3




chunk:   0%|                                                                         | 0/3866 [00:00<?, ?it/s, now=None][A[A

chunk:   6%|███▌                                                         | 229/3866 [00:00<00:01, 2289.57it/s, now=None][A[A

chunk:  12%|███████▏                                                     | 458/3866 [00:00<00:01, 2033.08it/s, now=None][A[A

chunk:  17%|██████████▍                                                  | 664/3866 [00:00<00:01, 1976.32it/s, now=None][A[A

chunk:  22%|█████████████▌                                               | 863/3866 [00:00<00:01, 1977.79it/s, now=None][A[A

chunk:  27%|████████████████▍                                           | 1062/3866 [00:00<00:01, 1956.74it/s, now=None][A[A

chunk:  33%|███████████████████▌                                        | 1258/3866 [00:00<00:01, 1939.20it/s, now=None][A[A

chunk:  38%|██████████████████████▌                                     | 1453/3866 [00:00<00:01, 1855

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/Fusion power how close are we  FT Film.mp3




chunk:   0%|                                                                        | 0/37051 [00:00<?, ?it/s, now=None][A[A

chunk:   0%|▏                                                           | 153/37051 [00:00<00:24, 1489.52it/s, now=None][A[A

chunk:   1%|▌                                                           | 342/37051 [00:00<00:21, 1713.02it/s, now=None][A[A

chunk:   1%|▊                                                           | 525/37051 [00:00<00:20, 1764.80it/s, now=None][A[A

chunk:   2%|█▏                                                          | 719/37051 [00:00<00:19, 1821.90it/s, now=None][A[A

chunk:   2%|█▍                                                          | 902/37051 [00:00<00:19, 1816.83it/s, now=None][A[A

chunk:   3%|█▊                                                         | 1107/37051 [00:00<00:19, 1886.81it/s, now=None][A[A

chunk:   4%|██▏                                                        | 1355/37051 [00:00<00:17, 2076

chunk:  38%|██████████████████████                                    | 14112/37051 [00:06<00:12, 1849.23it/s, now=None][A[A

chunk:  39%|██████████████████████▍                                   | 14314/37051 [00:06<00:11, 1896.89it/s, now=None][A[A

chunk:  39%|██████████████████████▋                                   | 14527/37051 [00:06<00:11, 1961.82it/s, now=None][A[A

chunk:  40%|███████████████████████                                   | 14759/37051 [00:07<00:10, 2066.04it/s, now=None][A[A

chunk:  40%|███████████████████████▍                                  | 14996/37051 [00:07<00:10, 2152.11it/s, now=None][A[A

chunk:  41%|███████████████████████▊                                  | 15234/37051 [00:07<00:09, 2216.70it/s, now=None][A[A

chunk:  42%|████████████████████████▏                                 | 15457/37051 [00:07<00:10, 2141.88it/s, now=None][A[A

chunk:  42%|████████████████████████▌                                 | 15688/37051 [00:07<00:09, 2187.1

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/Trees farmers and the Brazilian ecosystem  FT Food Revolution.mp3




chunk:   0%|                                                                         | 0/3510 [00:00<?, ?it/s, now=None][A[A

chunk:   6%|███▊                                                         | 220/3510 [00:00<00:01, 2190.24it/s, now=None][A[A

chunk:  13%|███████▋                                                     | 440/3510 [00:00<00:01, 1902.37it/s, now=None][A[A

chunk:  18%|███████████                                                  | 633/3510 [00:00<00:01, 1820.29it/s, now=None][A[A

chunk:  23%|██████████████▏                                              | 817/3510 [00:00<00:01, 1761.75it/s, now=None][A[A

chunk:  28%|█████████████████                                           | 1000/3510 [00:00<00:01, 1782.64it/s, now=None][A[A

chunk:  34%|████████████████████▏                                       | 1181/3510 [00:00<00:01, 1784.00it/s, now=None][A[A

chunk:  39%|███████████████████████▏                                    | 1360/3510 [00:00<00:01, 1759

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/The blight that could decimate the world’s banana crop  FT Food Revolution.mp3




chunk:   0%|                                                                         | 0/4197 [00:00<?, ?it/s, now=None][A[A

chunk:   4%|██▏                                                          | 153/4197 [00:00<00:02, 1506.00it/s, now=None][A[A

chunk:   8%|████▊                                                        | 329/4197 [00:00<00:02, 1651.69it/s, now=None][A[A

chunk:  13%|███████▊                                                     | 534/4197 [00:00<00:02, 1820.16it/s, now=None][A[A

chunk:  17%|██████████▍                                                  | 716/4197 [00:00<00:01, 1819.99it/s, now=None][A[A

chunk:  21%|█████████████                                                | 898/4197 [00:00<00:01, 1810.48it/s, now=None][A[A

chunk:  26%|███████████████▍                                            | 1080/4197 [00:00<00:01, 1786.67it/s, now=None][A[A

chunk:  30%|█████████████████▉                                          | 1259/4197 [00:00<00:01, 1774

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/The Carbon Market Opportunity  FT Rethink.mp3




chunk:   0%|                                                                         | 0/3466 [00:00<?, ?it/s, now=None][A[A

chunk:   7%|████                                                         | 229/3466 [00:00<00:01, 2287.32it/s, now=None][A[A

chunk:  13%|████████                                                     | 458/3466 [00:00<00:01, 1966.01it/s, now=None][A[A

chunk:  19%|███████████▌                                                 | 658/3466 [00:00<00:01, 1882.43it/s, now=None][A[A

chunk:  24%|██████████████▉                                              | 848/3466 [00:00<00:01, 1840.58it/s, now=None][A[A

chunk:  30%|█████████████████▉                                          | 1033/3466 [00:00<00:01, 1838.02it/s, now=None][A[A

chunk:  35%|█████████████████████                                       | 1218/3466 [00:00<00:01, 1821.37it/s, now=None][A[A

chunk:  40%|████████████████████████▎                                   | 1401/3466 [00:00<00:01, 1781

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/The story behind Jake Freeman’s meme-stock bonanza  FT Due Diligence.mp3




chunk:   0%|                                                                         | 0/3453 [00:00<?, ?it/s, now=None][A[A

chunk:   6%|███▊                                                         | 219/3453 [00:00<00:01, 2166.92it/s, now=None][A[A

chunk:  13%|███████▋                                                     | 436/3453 [00:00<00:01, 2023.96it/s, now=None][A[A

chunk:  19%|███████████▎                                                 | 639/3453 [00:00<00:01, 2023.42it/s, now=None][A[A

chunk:  25%|███████████████                                              | 850/3453 [00:00<00:01, 2047.19it/s, now=None][A[A

chunk:  31%|██████████████████▎                                         | 1055/3453 [00:00<00:01, 2018.35it/s, now=None][A[A

chunk:  36%|█████████████████████▊                                      | 1258/3453 [00:00<00:01, 2009.33it/s, now=None][A[A

chunk:  42%|█████████████████████████▎                                  | 1460/3453 [00:00<00:00, 1998

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/The business of Formula 1 inside McLaren HQ  FT Scoreboard.mp3




chunk:   0%|                                                                        | 0/13110 [00:00<?, ?it/s, now=None][A[A

chunk:   1%|▊                                                           | 178/13110 [00:00<00:07, 1771.69it/s, now=None][A[A

chunk:   3%|█▋                                                          | 368/13110 [00:00<00:06, 1845.36it/s, now=None][A[A

chunk:   4%|██▌                                                         | 564/13110 [00:00<00:06, 1897.14it/s, now=None][A[A

chunk:   6%|███▍                                                        | 759/13110 [00:00<00:06, 1900.48it/s, now=None][A[A

chunk:   7%|████▍                                                       | 973/13110 [00:00<00:06, 1983.08it/s, now=None][A[A

chunk:   9%|█████▎                                                     | 1172/13110 [00:00<00:06, 1953.40it/s, now=None][A[A

chunk:  11%|██████▎                                                    | 1410/13110 [00:00<00:05, 2082

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/India and the business of womens cricket  FT Scoreboard.mp3




chunk:   0%|                                                                        | 0/19001 [00:00<?, ?it/s, now=None][A[A

chunk:   1%|▌                                                           | 175/19001 [00:00<00:10, 1749.35it/s, now=None][A[A

chunk:   2%|█▏                                                          | 364/19001 [00:00<00:10, 1831.89it/s, now=None][A[A

chunk:   3%|█▋                                                          | 548/19001 [00:00<00:10, 1793.79it/s, now=None][A[A

chunk:   4%|██▎                                                         | 728/19001 [00:00<00:10, 1785.66it/s, now=None][A[A

chunk:   5%|██▊                                                         | 907/19001 [00:00<00:10, 1778.20it/s, now=None][A[A

chunk:   6%|███▍                                                       | 1106/19001 [00:00<00:09, 1835.43it/s, now=None][A[A

chunk:   7%|████                                                       | 1309/19001 [00:00<00:09, 1898

chunk:  67%|██████████████████████████████████████▊                   | 12728/19001 [00:06<00:03, 1863.01it/s, now=None][A[A

chunk:  68%|███████████████████████████████████████▍                  | 12916/19001 [00:06<00:03, 1849.46it/s, now=None][A[A

chunk:  69%|███████████████████████████████████████▉                  | 13102/19001 [00:06<00:03, 1839.03it/s, now=None][A[A

chunk:  70%|████████████████████████████████████████▌                 | 13287/19001 [00:06<00:03, 1818.55it/s, now=None][A[A

chunk:  71%|█████████████████████████████████████████                 | 13470/19001 [00:07<00:03, 1815.04it/s, now=None][A[A

chunk:  72%|█████████████████████████████████████████▊                | 13682/19001 [00:07<00:02, 1904.28it/s, now=None][A[A

chunk:  73%|██████████████████████████████████████████▍               | 13904/19001 [00:07<00:02, 1997.09it/s, now=None][A[A

chunk:  74%|███████████████████████████████████████████               | 14107/19001 [00:07<00:02, 1991.1

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/Full Harvest the online marketplace that is reinventing the supply chain  FT Food Revolution.mp3




chunk:   0%|                                                                         | 0/3722 [00:00<?, ?it/s, now=None][A[A

chunk:   5%|██▉                                                          | 182/3722 [00:00<00:01, 1818.73it/s, now=None][A[A

chunk:  10%|█████▉                                                       | 364/3722 [00:00<00:01, 1812.60it/s, now=None][A[A

chunk:  15%|█████████▏                                                   | 558/3722 [00:00<00:01, 1854.82it/s, now=None][A[A

chunk:  20%|████████████▍                                                | 756/3722 [00:00<00:01, 1903.67it/s, now=None][A[A

chunk:  25%|███████████████▌                                             | 947/3722 [00:00<00:01, 1864.07it/s, now=None][A[A

chunk:  30%|██████████████████▎                                         | 1134/3722 [00:00<00:01, 1811.90it/s, now=None][A[A

chunk:  35%|█████████████████████▏                                      | 1316/3722 [00:00<00:01, 1760

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/Battling the avian flu epidemic  FT Food Revolution.mp3




chunk:   0%|                                                                         | 0/8271 [00:00<?, ?it/s, now=None][A[A

chunk:   2%|█▎                                                           | 181/8271 [00:00<00:04, 1804.88it/s, now=None][A[A

chunk:   5%|██▊                                                          | 385/8271 [00:00<00:04, 1938.90it/s, now=None][A[A

chunk:   7%|████▎                                                        | 585/8271 [00:00<00:03, 1957.88it/s, now=None][A[A

chunk:   9%|█████▊                                                       | 781/8271 [00:00<00:03, 1884.79it/s, now=None][A[A

chunk:  12%|███████▎                                                     | 986/8271 [00:00<00:03, 1935.14it/s, now=None][A[A

chunk:  14%|████████▋                                                   | 1189/8271 [00:00<00:03, 1961.89it/s, now=None][A[A

chunk:  17%|██████████                                                  | 1386/8271 [00:00<00:03, 1958

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/Multi-club ownership is rising fast but not everyone’s a fan  FT Scoreboard.mp3




chunk:   0%|                                                                         | 0/3221 [00:00<?, ?it/s, now=None][A[A

chunk:   6%|███▍                                                         | 181/3221 [00:00<00:01, 1797.85it/s, now=None][A[A

chunk:  11%|██████▉                                                      | 366/3221 [00:00<00:01, 1827.82it/s, now=None][A[A

chunk:  17%|██████████▍                                                  | 549/3221 [00:00<00:01, 1819.26it/s, now=None][A[A

chunk:  23%|█████████████▊                                               | 731/3221 [00:00<00:01, 1812.10it/s, now=None][A[A

chunk:  28%|█████████████████▎                                           | 915/3221 [00:00<00:01, 1819.02it/s, now=None][A[A

chunk:  34%|████████████████████▍                                       | 1097/3221 [00:00<00:01, 1803.15it/s, now=None][A[A

chunk:  40%|████████████████████████                                    | 1290/3221 [00:00<00:01, 1833

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/Falling wind speeds could affect green energy strategy  FT Rethink.mp3




chunk:   0%|                                                                         | 0/3991 [00:00<?, ?it/s, now=None][A[A

chunk:   5%|██▊                                                          | 185/3991 [00:00<00:02, 1820.21it/s, now=None][A[A

chunk:   9%|█████▋                                                       | 369/3991 [00:00<00:01, 1823.90it/s, now=None][A[A

chunk:  14%|████████▌                                                    | 561/3991 [00:00<00:01, 1848.96it/s, now=None][A[A

chunk:  19%|███████████▌                                                 | 753/3991 [00:00<00:01, 1866.20it/s, now=None][A[A

chunk:  24%|██████████████▋                                              | 962/3991 [00:00<00:01, 1937.92it/s, now=None][A[A

chunk:  29%|█████████████████▍                                          | 1156/3991 [00:00<00:01, 1936.30it/s, now=None][A[A

chunk:  34%|████████████████████▎                                       | 1350/3991 [00:00<00:01, 1895

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/The 2023 Tech M&A Outlook  FT Due Diligence.mp3




chunk:   0%|                                                                         | 0/5186 [00:00<?, ?it/s, now=None][A[A

chunk:   4%|██▋                                                          | 226/5186 [00:00<00:02, 2222.10it/s, now=None][A[A

chunk:   9%|█████▎                                                       | 449/5186 [00:00<00:02, 2116.75it/s, now=None][A[A

chunk:  13%|███████▊                                                     | 661/5186 [00:00<00:02, 2074.90it/s, now=None][A[A

chunk:  17%|██████████▏                                                  | 869/5186 [00:00<00:02, 1882.95it/s, now=None][A[A

chunk:  20%|████████████▎                                               | 1060/5186 [00:00<00:02, 1834.06it/s, now=None][A[A

chunk:  24%|██████████████▍                                             | 1245/5186 [00:00<00:02, 1779.61it/s, now=None][A[A

chunk:  27%|████████████████▍                                           | 1424/5186 [00:00<00:02, 1765

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/Can UK childcare be fixed  FT Women In Business.mp3




chunk:   0%|                                                                         | 0/6300 [00:00<?, ?it/s, now=None][A[A

chunk:   4%|██▏                                                          | 232/6300 [00:00<00:02, 2302.95it/s, now=None][A[A

chunk:   7%|████▍                                                        | 463/6300 [00:00<00:02, 2212.94it/s, now=None][A[A

chunk:  11%|██████▋                                                      | 685/6300 [00:00<00:02, 2148.58it/s, now=None][A[A

chunk:  15%|████████▉                                                    | 918/6300 [00:00<00:02, 2199.09it/s, now=None][A[A

chunk:  18%|██████████▊                                                 | 1139/6300 [00:00<00:02, 2192.96it/s, now=None][A[A

chunk:  22%|████████████▉                                               | 1363/6300 [00:00<00:02, 2204.25it/s, now=None][A[A

chunk:  25%|███████████████                                             | 1584/6300 [00:00<00:02, 2200

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/Planetary Boundaries What are they and why shouldnt we cross them  FT Rethink.mp3




chunk:   0%|                                                                         | 0/5026 [00:00<?, ?it/s, now=None][A[A

chunk:   6%|███▍                                                         | 281/5026 [00:00<00:01, 2790.98it/s, now=None][A[A

chunk:  11%|██████▊                                                      | 561/5026 [00:00<00:02, 2117.03it/s, now=None][A[A

chunk:  16%|█████████▌                                                   | 783/5026 [00:00<00:02, 2045.64it/s, now=None][A[A

chunk:  20%|████████████                                                 | 993/5026 [00:00<00:02, 1994.32it/s, now=None][A[A

chunk:  24%|██████████████▎                                             | 1195/5026 [00:00<00:01, 1960.90it/s, now=None][A[A

chunk:  28%|████████████████▋                                           | 1393/5026 [00:00<00:01, 1930.19it/s, now=None][A[A

chunk:  32%|██████████████████▉                                         | 1587/5026 [00:00<00:01, 1907

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/Fractured markets the big threats to the financial system  FT Film.mp3




chunk:   0%|                                                                        | 0/34244 [00:00<?, ?it/s, now=None][A[A

chunk:   1%|▎                                                           | 183/34244 [00:00<00:18, 1817.56it/s, now=None][A[A

chunk:   1%|▋                                                           | 368/34244 [00:00<00:18, 1831.11it/s, now=None][A[A

chunk:   2%|▉                                                           | 557/34244 [00:00<00:18, 1835.92it/s, now=None][A[A

chunk:   2%|█▎                                                          | 749/34244 [00:00<00:17, 1868.13it/s, now=None][A[A

chunk:   3%|█▋                                                          | 936/34244 [00:00<00:18, 1808.27it/s, now=None][A[A

chunk:   3%|█▉                                                         | 1118/34244 [00:00<00:19, 1728.64it/s, now=None][A[A

chunk:   4%|██▏                                                        | 1292/34244 [00:00<00:19, 1690

chunk:  36%|█████████████████████                                     | 12424/34244 [00:06<00:11, 1833.82it/s, now=None][A[A

chunk:  37%|█████████████████████▎                                    | 12613/34244 [00:06<00:11, 1850.11it/s, now=None][A[A

chunk:  37%|█████████████████████▋                                    | 12811/34244 [00:06<00:11, 1888.16it/s, now=None][A[A

chunk:  38%|██████████████████████                                    | 13021/34244 [00:06<00:10, 1942.33it/s, now=None][A[A

chunk:  39%|██████████████████████▍                                   | 13230/34244 [00:06<00:10, 1979.75it/s, now=None][A[A

chunk:  39%|██████████████████████▊                                   | 13432/34244 [00:07<00:10, 1989.13it/s, now=None][A[A

chunk:  40%|███████████████████████                                   | 13632/34244 [00:07<00:10, 1984.24it/s, now=None][A[A

chunk:  40%|███████████████████████▍                                  | 13831/34244 [00:07<00:10, 1975.3

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/How technology is helping protect the wine sector from climate change  FT Energy Source.mp3




chunk:   0%|                                                                         | 0/3904 [00:00<?, ?it/s, now=None][A[A

chunk:   5%|██▊                                                          | 179/3904 [00:00<00:02, 1769.06it/s, now=None][A[A

chunk:   9%|█████▌                                                       | 356/3904 [00:00<00:02, 1761.51it/s, now=None][A[A

chunk:  14%|████████▎                                                    | 533/3904 [00:00<00:01, 1750.09it/s, now=None][A[A

chunk:  18%|███████████                                                  | 709/3904 [00:00<00:01, 1662.98it/s, now=None][A[A

chunk:  23%|██████████████                                               | 903/3904 [00:00<00:01, 1758.34it/s, now=None][A[A

chunk:  28%|████████████████▉                                           | 1102/3904 [00:00<00:01, 1829.33it/s, now=None][A[A

chunk:  33%|████████████████████                                        | 1305/3904 [00:00<00:01, 1883

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/Could battery storage help with the US energy crisis  FT Energy Source.mp3




chunk:   0%|                                                                         | 0/3993 [00:00<?, ?it/s, now=None][A[A

chunk:   6%|███▉                                                         | 255/3993 [00:00<00:01, 2539.23it/s, now=None][A[A

chunk:  13%|███████▊                                                     | 509/3993 [00:00<00:01, 2318.26it/s, now=None][A[A

chunk:  19%|███████████▎                                                 | 743/3993 [00:00<00:01, 2235.13it/s, now=None][A[A

chunk:  24%|██████████████▊                                              | 968/3993 [00:00<00:01, 2020.71it/s, now=None][A[A

chunk:  29%|█████████████████▋                                          | 1173/3993 [00:00<00:01, 1963.99it/s, now=None][A[A

chunk:  34%|████████████████████▋                                       | 1373/3993 [00:00<00:01, 1974.08it/s, now=None][A[A

chunk:  39%|███████████████████████▌                                    | 1572/3993 [00:00<00:01, 1965

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/Are we too obsessed with our personal carbon footprints  FT Rethink.mp3




chunk:   0%|                                                                         | 0/5740 [00:00<?, ?it/s, now=None][A[A

chunk:   5%|██▉                                                          | 273/5740 [00:00<00:02, 2727.58it/s, now=None][A[A

chunk:  10%|█████▊                                                       | 546/5740 [00:00<00:02, 2132.73it/s, now=None][A[A

chunk:  13%|████████▏                                                    | 768/5740 [00:00<00:02, 2079.07it/s, now=None][A[A

chunk:  17%|██████████▍                                                  | 982/5740 [00:00<00:02, 2090.25it/s, now=None][A[A

chunk:  21%|████████████▍                                               | 1194/5740 [00:00<00:02, 2095.72it/s, now=None][A[A

chunk:  24%|██████████████▋                                             | 1406/5740 [00:00<00:02, 2076.63it/s, now=None][A[A

chunk:  28%|████████████████▉                                           | 1616/5740 [00:00<00:01, 2083

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/FTX the legend of Sam Bankman-Fried  FT Film.mp3




chunk:   0%|                                                                        | 0/39535 [00:00<?, ?it/s, now=None][A[A

chunk:   0%|▎                                                           | 189/39535 [00:00<00:21, 1872.55it/s, now=None][A[A

chunk:   1%|▌                                                           | 383/39535 [00:00<00:20, 1909.97it/s, now=None][A[A

chunk:   1%|▊                                                           | 575/39535 [00:00<00:20, 1873.70it/s, now=None][A[A

chunk:   2%|█▏                                                          | 766/39535 [00:00<00:20, 1880.03it/s, now=None][A[A

chunk:   2%|█▍                                                          | 955/39535 [00:00<00:20, 1848.65it/s, now=None][A[A

chunk:   3%|█▋                                                         | 1146/39535 [00:00<00:20, 1861.47it/s, now=None][A[A

chunk:   3%|█▉                                                         | 1333/39535 [00:00<00:20, 1853

chunk:  32%|██████████████████▎                                       | 12508/39535 [00:06<00:13, 1950.81it/s, now=None][A[A

chunk:  32%|██████████████████▋                                       | 12704/39535 [00:06<00:13, 1919.85it/s, now=None][A[A

chunk:  33%|██████████████████▉                                       | 12897/39535 [00:06<00:13, 1904.84it/s, now=None][A[A

chunk:  33%|███████████████████▏                                      | 13088/39535 [00:06<00:13, 1900.39it/s, now=None][A[A

chunk:  34%|███████████████████▍                                      | 13284/39535 [00:06<00:13, 1910.40it/s, now=None][A[A

chunk:  34%|███████████████████▊                                      | 13476/39535 [00:07<00:13, 1894.31it/s, now=None][A[A

chunk:  35%|████████████████████                                      | 13673/39535 [00:07<00:13, 1905.74it/s, now=None][A[A

chunk:  35%|████████████████████▎                                     | 13873/39535 [00:07<00:13, 1932.6

chunk:  96%|███████████████████████████████████████████████████████▌  | 37911/39535 [00:19<00:00, 1748.15it/s, now=None][A[A

chunk:  96%|███████████████████████████████████████████████████████▉  | 38089/39535 [00:19<00:00, 1749.55it/s, now=None][A[A

chunk:  97%|████████████████████████████████████████████████████████▏ | 38265/39535 [00:19<00:00, 1743.69it/s, now=None][A[A

chunk:  97%|████████████████████████████████████████████████████████▍ | 38440/39535 [00:20<00:00, 1729.70it/s, now=None][A[A

chunk:  98%|████████████████████████████████████████████████████████▋ | 38614/39535 [00:20<00:00, 1719.11it/s, now=None][A[A

chunk:  98%|████████████████████████████████████████████████████████▉ | 38786/39535 [00:20<00:00, 1717.55it/s, now=None][A[A

chunk:  99%|█████████████████████████████████████████████████████████▏| 38960/39535 [00:20<00:00, 1716.90it/s, now=None][A[A

chunk:  99%|█████████████████████████████████████████████████████████▍| 39146/39535 [00:20<00:00, 1758.9

MoviePy - Done.
MoviePy - Writing audio in @FinancialTimes/Recycling the world’s hard drive waste  FT Rethink.mp3




chunk:   0%|                                                                         | 0/3996 [00:00<?, ?it/s, now=None][A[A

chunk:   4%|██▋                                                          | 179/3996 [00:00<00:02, 1766.57it/s, now=None][A[A

chunk:   9%|█████▋                                                       | 375/3996 [00:00<00:01, 1860.98it/s, now=None][A[A

chunk:  14%|████████▊                                                    | 575/3996 [00:00<00:01, 1915.99it/s, now=None][A[A

chunk:  19%|███████████▋                                                 | 769/3996 [00:00<00:01, 1920.98it/s, now=None][A[A

chunk:  24%|██████████████▋                                              | 962/3996 [00:00<00:01, 1883.38it/s, now=None][A[A

chunk:  29%|█████████████████▎                                          | 1151/3996 [00:00<00:01, 1882.69it/s, now=None][A[A

chunk:  34%|████████████████████▎                                       | 1351/3996 [00:00<00:01, 1913

## Query the Pinecone index for the answer to a question (with video link in URL)

In [60]:
# Open a Pinecone session; the API key is the one set in the notebook's Inputs cell.
pinecone.init(api_key=pinecone_api_key, environment="us-west1-gcp")  # app.pinecone.io

# Name of the index that the query cell below searches.
index_id = "audio"

# Create the index only when it does not exist yet. `dim` is expected to be
# defined by an earlier cell -- presumably the embedding size (TODO confirm).
existing_indexes = pinecone.list_indexes()
if index_id not in existing_indexes:
    pinecone.create_index(index_id, dim, metric="dotproduct")

# Bind a handle to the index and show its stats as the cell output.
index = pinecone.Index(index_id)
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 14005}},
 'total_vector_count': 14005}

In [61]:
# Free-text question to look up in the index.
query = "How to make a coating that lowers temperature?"

# Embed the question with `model_embed` (defined in an earlier cell) and turn
# the result into a plain Python list before handing it to Pinecone.
query_embedding = model_embed.encode(query)
xq = query_embedding.tolist()

# Ask for the top 5 matches along with their stored metadata; the result is
# displayed as the cell output.
index.query(xq, include_metadata=True, top_k=5)

{'matches': [{'id': 'Make Our Best Thermal Paste YOURSELF!-t633.04',
              'metadata': {'end': 680.2,
                           'name': 'Make Our Best Thermal Paste YOURSELF!',
                           'start': 633.04,
                           'text': 'that you could do that is to simply take a '
                                   'drop of oil or liquid, put it between the '
                                   'two surfaces like this and eliminate the '
                                   'air and you will substantially improve the '
                                   "thermal conductivity. Nevertheless, you're "
                                   "limited to a material, the oil, that's "
                                   'about a thousand times worse than the '
                                   'thermal conductivity of the bulk '
                                   'materials. Now, if you could apply a '
                                   'sufficient amount of pressure, la