In [1]:
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize
import pandas as pd
from torch import randn
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [40]:
# Episode identifier; interpolated into the input parquet path and both output paths below.
episode_num = "E134"

In [2]:
# Checkpoint identifier for the sentence-embedding model used in this notebook.
model_name = "sentence-transformers/all-mpnet-base-v2"

# Instantiate the encoder from that checkpoint.
model = SentenceTransformer(model_name_or_path=model_name)

In [4]:
# Read the sections parquet file for the selected episode.
transcript_path = (
    f"data/all-in-transcripts/cleaned/{episode_num}_sections_full_cleaned.parquet"
)
df = pd.read_parquet(transcript_path)

# Keep only the descriptive columns that get copied onto every sentence row later.
metadata_columns = [
    "section_title",
    "section_time_stamp",
    "episode_title",
    "episode_date",
]
df_metadata = df.loc[:, metadata_columns]
df_metadata

Unnamed: 0,section_title,section_time_stamp,episode_title,episode_date
0,"Bestie intros!: Bad conference lunches, hair f...",0:00 - 12:36,"E134: Ukraine counteroffensive, China tensions...","Jun 23, 2023"
1,Zuck vs. Elon cage match,12:36 - 15:50,"E134: Ukraine counteroffensive, China tensions...","Jun 23, 2023"
2,Ukraine / Russia updates: underwhelming counte...,15:50 - 33:25,"E134: Ukraine counteroffensive, China tensions...","Jun 23, 2023"
3,"Blinken's China visit, Biden's \""dictator\"" ga...",33:25 - 51:49,"E134: Ukraine counteroffensive, China tensions...","Jun 23, 2023"
4,"RFK Jr's Rogan appearance, big pharma's impact...",51:49 - 1:09:59,"E134: Ukraine counteroffensive, China tensions...","Jun 23, 2023"
5,Secondary market for depressed startup shares ...,1:09:59 - 1:16:36,"E134: Ukraine counteroffensive, China tensions...","Jun 23, 2023"
6,Ford to receive $9.2B federal loan to build ou...,1:16:36 - end,"E134: Ukraine counteroffensive, China tensions...","Jun 23, 2023"


In [5]:
# Column-oriented accumulator: one empty list per metadata column, followed by
# a final "sentences" list that will hold the sentence text itself.
sentence_embeddings_metadata = {
    name: [] for name in [*df_metadata.columns, "sentences"]
}
sentence_embeddings_metadata

{'section_title': [],
 'section_time_stamp': [],
 'episode_title': [],
 'episode_date': [],
 'sentences': []}

In [6]:
# Expand every section into its sentences, repeating the section's metadata on
# each sentence so the accumulator ends up with one entry per sentence.

# Empty the accumulator lists first: this cell only appends, so without the
# reset a second run would duplicate every row.
for column_values in sentence_embeddings_metadata.values():
    column_values.clear()

for i_row, section in enumerate(df["section_dialogue"]):
    # `i_row` is a position produced by enumerate, so look the metadata up
    # positionally (.iloc) instead of by index label; this stays correct even
    # when the frame does not carry a default 0..n-1 index.
    row_metadata = {col: df_metadata[col].iloc[i_row] for col in df_metadata.columns}

    for sent in sent_tokenize(section):
        for col, value in row_metadata.items():
            sentence_embeddings_metadata[col].append(value)
        sentence_embeddings_metadata["sentences"].append(sent)

In [8]:
# Turn the column -> list accumulator into a frame with one row per sentence.
sentence_embeddings_metadata_df = pd.DataFrame(data=sentence_embeddings_metadata)
sentence_embeddings_metadata_df

Unnamed: 0,section_title,section_time_stamp,episode_title,episode_date,sentences
0,"Bestie intros!: Bad conference lunches, hair f...",0:00 - 12:36,"E134: Ukraine counteroffensive, China tensions...","Jun 23, 2023","So wait a second, you guys."
1,"Bestie intros!: Bad conference lunches, hair f...",0:00 - 12:36,"E134: Ukraine counteroffensive, China tensions...","Jun 23, 2023",I saw that you were at a Kotu conference or a ...
2,"Bestie intros!: Bad conference lunches, hair f...",0:00 - 12:36,"E134: Ukraine counteroffensive, China tensions...","Jun 23, 2023","Sax, Brad, and I were both at the Coaching Sum..."
3,"Bestie intros!: Bad conference lunches, hair f...",0:00 - 12:36,"E134: Ukraine counteroffensive, China tensions...","Jun 23, 2023","CO2 Kotu is, uh, and a large investor."
4,"Bestie intros!: Bad conference lunches, hair f...",0:00 - 12:36,"E134: Ukraine counteroffensive, China tensions...","Jun 23, 2023","Is it a hedge fund, private equity?"
...,...,...,...,...,...
1136,Ford to receive $9.2B federal loan to build ou...,1:16:36 - end,"E134: Ukraine counteroffensive, China tensions...","Jun 23, 2023",Bye-bye.
1137,Ford to receive $9.2B federal loan to build ou...,1:16:36 - end,"E134: Ukraine counteroffensive, China tensions...","Jun 23, 2023","Love you, boys."
1138,Ford to receive $9.2B federal loan to build ou...,1:16:36 - end,"E134: Ukraine counteroffensive, China tensions...","Jun 23, 2023",We'll let your winners ride.
1139,Ford to receive $9.2B federal loan to build ou...,1:16:36 - end,"E134: Ukraine counteroffensive, China tensions...","Jun 23, 2023","We open source it to the fans, and they've jus..."


In [16]:
# Spot-check the sentence text stored under index label 1140.
sentence_embeddings_metadata_df.loc[1140, "sentences"]

'Somehow, waiting to get mercies.'

In [9]:
# Pre-allocate one embedding row per sentence. The row width is read from the
# loaded model instead of being hard-coded, so changing `model_name` cannot
# leave the array with the wrong number of columns.
n_sentences = len(sentence_embeddings_metadata_df)
embedding_dim = model.get_sentence_embedding_dimension()
corpus_emb = np.zeros((n_sentences, embedding_dim))
corpus_emb

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
# Fill `corpus_emb` with one embedding per sentence.
#
# The sentences are taken from the metadata frame rather than re-tokenizing the
# transcript, so row i of `corpus_emb` always describes row i of
# `sentence_embeddings_metadata_df`. They are passed to `encode` as a single
# list so the model batches them internally instead of running one forward
# pass per sentence.
sentences = sentence_embeddings_metadata_df["sentences"].tolist()

# Fail loudly if the pre-allocated array and the metadata frame have drifted
# apart (e.g. cells were re-run out of order).
assert len(sentences) == corpus_emb.shape[0], (
    f"{len(sentences)} sentences but {corpus_emb.shape[0]} embedding rows"
)

if sentences:  # nothing to encode (or assign) for an empty transcript
    corpus_emb[:] = model.encode(sentences, convert_to_tensor=False)

In [12]:
# Display the embedding matrix dimensions as (rows, columns).
corpus_emb.shape

(1141, 768)

In [41]:
# Write the embedding matrix to disk as a plain .npy file (pickling disabled).
embeddings_npy_path = f"../embeddings/{episode_num}_sentence_embeddings.npy"
np.save(file=embeddings_npy_path, arr=corpus_emb, allow_pickle=False)

In [42]:
# Write the per-sentence metadata frame to a parquet file in the same
# directory as the embedding matrix.
metadata_parquet_path = (
    f"../embeddings/{episode_num}_sentence_embeddings_metadata.parquet"
)
sentence_embeddings_metadata_df.to_parquet(path=metadata_parquet_path)