In [1]:
import pandas as pd
import numpy as np
import os
from sentence_transformers import SentenceTransformer

# Note: scipy's `cosine` returns cosine *distance*, the complement of cosine similarity (i.e., cos_similarity = 1 - cos_distance)
from scipy.spatial.distance import cosine

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Some example transcript data to illustrate.
# The frame is built from a list of records in a single constructor call
# instead of being grown row by row through the private `DataFrame._append`
# method (not public API, and it copies the whole frame on every call).
transcripts = pd.DataFrame(
    [
        {'Text': 'Ok this is the answer', 'Speaker': 'A'},
        {'Text': 'are you sure', 'Speaker': 'B'},
        {'Text': 'yea yea Im sure', 'Speaker': 'A'},
        {'Text': 'No not like that', 'Speaker': 'C'},
    ]
)

transcripts

Unnamed: 0,Text,Speaker
0,Ok this is the answer,A
1,are you sure,B
2,yea yea Im sure,A
3,No not like that,C


In [5]:
# Load the pre-trained 'all-mpnet-base-v2' sentence-transformer model once at
# notebook level; get_embedding_bert below reads this `bert_model` global.
bert_model = SentenceTransformer('all-mpnet-base-v2')

#Function to get embedding for text from bert model
def get_embedding_bert(text):
    """Return the embedding of `text` produced by the global `bert_model`.

    Newline characters in `text` are replaced with single spaces before the
    string is handed to `bert_model.encode`. The encoded result is converted
    with `list(...)`, so the caller receives a plain Python list.
    """
    single_line = text.replace("\n", " ")
    return list(bert_model.encode(single_line))

In [6]:
# Embed every utterance and store the result in a new 'bert_embedding' column.
# The function is passed to `apply` directly; wrapping it in
# `lambda x: get_embedding_bert(x)` added an extra call layer for no benefit.
transcripts['bert_embedding'] = transcripts['Text'].apply(get_embedding_bert)

In [9]:
# Sanity check: length of the embedding returned for a short input
len(get_embedding_bert('ok'))

768

In [7]:
# Display the frame again to inspect the 'bert_embedding' column next to the text
transcripts

Unnamed: 0,Text,Speaker,bert_embedding
0,Ok this is the answer,A,"[-0.049901105, -0.065872766, -0.027497113, -0...."
1,are you sure,B,"[0.047631353, -0.0055989586, -0.01333505, 0.05..."
2,yea yea Im sure,A,"[0.0428967, -0.0346221, -0.0034513443, 0.04122..."
3,No not like that,C,"[-0.0017823151, -0.026810594, -0.018305428, 0...."


In [10]:
# Pull the stored embedding of each of the four utterances (rows 0-3) out of
# the frame so they can be compared with cosine distance.
emb0, emb1, emb2, emb3 = (
    transcripts.loc[row, 'bert_embedding'] for row in range(4)
)

# Inspect the first embedding
print(len(emb0))       # 768-dim vector
print(type(emb0))      # saved as a list
print(type(emb0[0]))   # of floats

768
<class 'list'>
<class 'numpy.float32'>


In [7]:
# Cosine distance between the embeddings of utterance 0 and utterance 1
distance_0_1 = cosine(emb0, emb1) #Use Cosine function to get distance
print(distance_0_1) #So distance between text 0 and text 1 is ~.84

0.8356765955686569


In [8]:
# Cosine distance between the embeddings of utterance 1 and utterance 2
distance_1_2 = cosine(emb1, emb2)
print(distance_1_2) #And distance between text 1 and text 2 is ~.54

0.5398077964782715


So, as expected, the distance between transcripts 0 and 1 ("Ok this is the answer" -> "are you sure") is much greater than the distance between transcripts 1 and 2 ("are you sure" -> "yea yea Im sure"), which makes sense because the last two utterances are much closer in meaning.