In [15]:
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize
import pandas as pd
from torch import randn
import numpy as np

In [16]:
episode_num = "E135"

In [17]:
model_name = "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(model_name)

In [18]:
df = pd.read_parquet(
    f"data/all-in-transcripts/cleaned/{episode_num}_sections_full_cleaned.parquet"
)
df

Unnamed: 0,section_title,section_dialogue,section_time_stamp,episode_title,episode_date
0,Bestie intros: Friedberg fills in as moderator!,This is going to be a feisty episode. Is it tw...,0:00 - 2:45,"E135: Wagner rebels, SCOTUS ends AA, AI M&A, s...","Jun 30, 2023"
1,Wagner Group rebellion,"The Wagner Group attempted coup, or potential ...",2:45 - 23:15,"E135: Wagner rebels, SCOTUS ends AA, AI M&A, s...","Jun 30, 2023"
2,SCOTUS strikes down Affirmative Action,"1997 fall in '97, and it was the last year tha...",23:15 - 51:03,"E135: Wagner rebels, SCOTUS ends AA, AI M&A, s...","Jun 30, 2023"
3,"Databricks acquires MosaicML for $1.3B, Inflec...",The AI frenzy continues here in Silicon Valley...,51:03 - 1:09:35,"E135: Wagner rebels, SCOTUS ends AA, AI M&A, s...","Jun 30, 2023"
4,"IRL shuts down after faking 95% of users, Byju...","As Ukraine is Saks, is because you've talked a...",1:09:35 - 1:26:38,"E135: Wagner rebels, SCOTUS ends AA, AI M&A, s...","Jun 30, 2023"
5,Science Corner: Understanding the NANOGrav fin...,"That came out yesterday. Okay, I'll cover this...",1:26:38 - end,"E135: Wagner rebels, SCOTUS ends AA, AI M&A, s...","Jun 30, 2023"


In [19]:
# add special starting sentence with section num and title to dialogue
for i in range(len(df)):
    row = df.loc[i]
    starter_sentence = f"Section {i+1}. "
    row.section_dialogue = starter_sentence + row.section_dialogue

# df.section_dialogue.loc[6]
df

In [20]:
df_metadata = df[
    ["section_title", "section_time_stamp", "episode_title", "episode_date"]
]
df_metadata

Unnamed: 0,section_title,section_time_stamp,episode_title,episode_date
0,Bestie intros: Friedberg fills in as moderator!,0:00 - 2:45,"E135: Wagner rebels, SCOTUS ends AA, AI M&A, s...","Jun 30, 2023"
1,Wagner Group rebellion,2:45 - 23:15,"E135: Wagner rebels, SCOTUS ends AA, AI M&A, s...","Jun 30, 2023"
2,SCOTUS strikes down Affirmative Action,23:15 - 51:03,"E135: Wagner rebels, SCOTUS ends AA, AI M&A, s...","Jun 30, 2023"
3,"Databricks acquires MosaicML for $1.3B, Inflec...",51:03 - 1:09:35,"E135: Wagner rebels, SCOTUS ends AA, AI M&A, s...","Jun 30, 2023"
4,"IRL shuts down after faking 95% of users, Byju...",1:09:35 - 1:26:38,"E135: Wagner rebels, SCOTUS ends AA, AI M&A, s...","Jun 30, 2023"
5,Science Corner: Understanding the NANOGrav fin...,1:26:38 - end,"E135: Wagner rebels, SCOTUS ends AA, AI M&A, s...","Jun 30, 2023"


In [21]:
sentence_embeddings_metadata = {}
for col in df_metadata.columns:
    sentence_embeddings_metadata[col] = []
sentence_embeddings_metadata["sentences"] = []
sentence_embeddings_metadata
i_sentence = 0
for i_row, section in enumerate(df["section_dialogue"]):
    sentences = sent_tokenize(section)
    # print(sentences)
    for sent in sentences:
        for col in df_metadata.columns:
            sentence_embeddings_metadata[col].append(df_metadata[col][i_row])
        sentence_embeddings_metadata["sentences"].append(sent)

        i_sentence += 1
    i_sentence = 0

In [22]:
sentence_embeddings_metadata_df = pd.DataFrame.from_dict(sentence_embeddings_metadata)

sentence_embeddings_metadata_df

Unnamed: 0,section_title,section_time_stamp,episode_title,episode_date,sentences
0,Bestie intros: Friedberg fills in as moderator!,0:00 - 2:45,"E135: Wagner rebels, SCOTUS ends AA, AI M&A, s...","Jun 30, 2023",This is going to be a feisty episode.
1,Bestie intros: Friedberg fills in as moderator!,0:00 - 2:45,"E135: Wagner rebels, SCOTUS ends AA, AI M&A, s...","Jun 30, 2023",Is it two of us are on Greenwich Mean Time?
2,Bestie intros: Friedberg fills in as moderator!,0:00 - 2:45,"E135: Wagner rebels, SCOTUS ends AA, AI M&A, s...","Jun 30, 2023",Two of us are in Pacific J Cal.
3,Bestie intros: Friedberg fills in as moderator!,0:00 - 2:45,"E135: Wagner rebels, SCOTUS ends AA, AI M&A, s...","Jun 30, 2023","Still, would sleep in his head."
4,Bestie intros: Friedberg fills in as moderator!,0:00 - 2:45,"E135: Wagner rebels, SCOTUS ends AA, AI M&A, s...","Jun 30, 2023","I'm good, actually."
...,...,...,...,...,...
973,Science Corner: Understanding the NANOGrav fin...,1:26:38 - end,"E135: Wagner rebels, SCOTUS ends AA, AI M&A, s...","Jun 30, 2023","We'll see you next time, KCRW."
974,Science Corner: Understanding the NANOGrav fin...,1:26:38 - end,"E135: Wagner rebels, SCOTUS ends AA, AI M&A, s...","Jun 30, 2023",I can do any of these radio bits.
975,Science Corner: Understanding the NANOGrav fin...,1:26:38 - end,"E135: Wagner rebels, SCOTUS ends AA, AI M&A, s...","Jun 30, 2023",Love you guys.
976,Science Corner: Understanding the NANOGrav fin...,1:26:38 - end,"E135: Wagner rebels, SCOTUS ends AA, AI M&A, s...","Jun 30, 2023",We'll let your winners ride.


In [23]:
sentence_embeddings_metadata_df.loc[0]["sentences"]

'This is going to be a feisty episode.'

In [24]:
corpus_emb = np.zeros([len(sentence_embeddings_metadata_df.section_title), 768])
corpus_emb.shape
corpus_emb

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [25]:
i_sentence = 0
for i_row, section in enumerate(df["section_dialogue"]):
    sentences = sent_tokenize(section)
    for sent in sentences:
        corpus_emb[i_sentence, :] = model.encode(sent, convert_to_tensor=False)
        i_sentence += 1

In [26]:
corpus_emb.shape

(978, 768)

In [27]:
np.save(
    f"../embeddings/{episode_num}_sentence_embeddings.npy",
    corpus_emb,
    allow_pickle=False,
)

In [28]:
sentence_embeddings_metadata_df.to_parquet(
    f"../embeddings/{episode_num}_sentence_embeddings_metadata.parquet"
)