### 1. First, we create a `dataframe` containing quotes and their sources

In [1]:
import pandas as pd
df_name = "/content/alan_watts.json"

In [2]:
df = pd.read_json(df_name)

In [4]:
df.head(1)

Unnamed: 0,speech,source
0,A person who thinks all the time has nothing t...,https://www.organism.earth/library/document/ar...


In [5]:
speechs = df['speech']

In [7]:
speechs

0      A person who thinks all the time has nothing t...
1      Now, today, we are living in an age which is q...
2                                                       
3      This seminar about birth, death, and the unbor...
4      Now, I hope you remember that, this morning, I...
                             ...                        
129    This morning I was discussing with you some of...
130    It would be, of course, much better if this oc...
131                                                     
132    When I last talked about Zen I was discussing ...
133    The subject which has most interested me for m...
Name: speech, Length: 134, dtype: object

In [8]:
large_string = ""
for string in speechs:
  large_string += string

In [9]:
print(len(large_string))

4885368


In [10]:
%%capture
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

# Turning into sentences 
sentences = sent_tokenize(large_string)

In [11]:
dictionary = {"quote": sentences}

In [12]:
df = pd.DataFrame(dictionary)

### 2. Next we create add an embeddings column to dataframe

In [13]:
# enter name of df to save_embeddings to here
name = "alan-watts"
df_name = name + "_df.json"

In [14]:
%%capture
import numpy as np

# Installing Hugginface sentence transformers
!pip install sentence_transformers          # to load SBert models from Huggingface repository
from sentence_transformers import SentenceTransformer, util

In [15]:
%%capture
# Loading a pre-trained symmetric semantic search model
model = SentenceTransformer("all-mpnet-base-v2") # best performing model

In [16]:
# Getting embeddings for each passage and adding it to dataframe.

# We will normalize these embedding so that we can just use the dot product to measure similarity
embeddings = model.encode(list(df['quote']), show_progress_bar=True, normalize_embeddings=True)
embeddings = [embeddings[i] for i in range(len(embeddings))]
df['Embedding'] = embeddings

Batches:   0%|          | 0/1628 [00:00<?, ?it/s]

In [17]:
# Saving Embeddings to Json
df.to_json(df_name)

In [18]:
from google.colab import files
files.download(df_name) 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>