**Installing Dependencies**

In [10]:
%%capture
import requests                     # To make 'get' requests through chrome browser
from bs4 import BeautifulSoup       # To parse html file in python tree object
import pandas as pd                 # To create Dataframe and save data into JSON file

import random
import time
from pprint import pprint

from google.colab import data_table
from vega_datasets import data
data_table.enable_dataframe_formatter()

import numpy as np

# Installing Hugginface sentence transformers
!pip install sentence_transformers          # to load SBert models from Huggingface repository
from sentence_transformers import SentenceTransformer, util

# Install Huggingface Transformers models and gradio
!pip install transformers
!pip install gradio
from transformers import pipeline # to load in configuration and weights of fine-tuned model
import gradio as gr # to create UI

## Web Scraping (Done)

In [None]:
# Creating a list of 112 query pages
query_pages = [f"https://jkrishnamurti.org/jksearch?keyword=&page={i}&type=16618" for i in range(1, 113)]

In [None]:
# Use Beautifulsoup to scrape query pages for links to quotes pages
quote_pages = []

for page in query_pages:
  page = requests.get(page)
  soup = BeautifulSoup(page.content, 'html.parser')
  anchor_tags = soup.select(".quote-icon > a")
  paths = [anchor_tag['href'] for anchor_tag in anchor_tags]
  quote_pages.append(paths)


In [None]:
# Turning paths into hyperlinks
host_name = "https://jkrishnamurti.org/"
urls = []

for paths in quote_pages:
  for path in paths:
    urls.append(host_name + path)

In [None]:
# Now that we have all the urls we are interested in let's try to scrape one site
first_url = urls[0]

# Making a 'get' request
page = requests.get(first_url)
soup = BeautifulSoup(page.content, 'html.parser')

# It looks like all the quotes are in <p> tags contained with a <group-left> class
text = [paragraph.get_text() for paragraph in soup.select(".group-left p")]

# Creating speech string
speech = ' '.join(text)
speech


'Is it possible for the mind to empty itself totally of fear? Fear of any kind breeds illusion; it makes the mind dull, shallow. Where there is fear there is obviously no freedom, and without freedom there is no love at all. And most of us have some form of fear; fear of darkness, fear of public opinion, fear of snakes, fear of physical pain, fear of old age, fear of death. We have literally dozens of fears. And is it possible to be completely free of fear? We can see what fear does to each one of us. It makes one tell lies; it corrupts one in various ways; it makes the mind empty, shallow. There are dark corners in the mind which can never be investigated and exposed as long as one is afraid. Physical self-protection, the instinctive urge to keep away from the venomous snake, to draw back from the precipice, to avoid falling under the tramcar, and so on, is sane, normal, healthy. But I am asking about the psychological self-protectiveness which makes one afraid of disease, of death, o

In [None]:
quote_collection = []

# Make sure to print out everytime
for url in urls:
  # Making a 'get' request
  page = requests.get(url)
  soup = BeautifulSoup(page.content, 'html.parser')

  # It looks like all the quotes are in <p> tags contained with a <group-left> class
  text = [paragraph.get_text() for paragraph in soup.select(".group-left p")]

  # Creating speech string
  quote = ' '.join(text)
  quote_collection.append(quote)

In [None]:
# Creating dataframe of quotes and links
import pandas as pd

data = [(quote_collection[i], urls[i]) for i in range(len(quote_collection))]
df = pd.DataFrame(data, columns=["Quotes", "Links"])

In [None]:
# Saving this quote dataframe
df.to_json("krishanmurti_quotes.json")

## Quote Embeddings

In [11]:
# Loading back in our dataFrame
df = pd.read_json("krishnamurti_quotes.json")

In [12]:
%%capture
# Loading a pre-trained asymmetric semantic search model
model = SentenceTransformer("msmarco-roberta-base-v3") # best performing model

In [None]:
df.iat[0,0]

In [15]:
# Testing similirity between query and answer
query_embedding = model.encode('How do I get rid of mumps?')
passage_embedding = model.encode(df.iat[0,0])

print("Similarity:", util.cos_sim(query_embedding, passage_embedding))

Similarity: tensor([[0.1265]])


In [20]:
# Getting embeddings for each passage and adding it to dataframe.

# We will normalize these embedding so that we can just use the dot product to measure similarity
embeddings = model.encode(list(df['Quotes']), show_progress_bar=True, normalize_embeddings=True)
embeddings = [embeddings[i] for i in range(len(embeddings))]
df['Embedding'] = embeddings

Batches:   0%|          | 0/35 [00:00<?, ?it/s]

In [21]:
# Saving Embeddings to Json
df.to_json("krishnamurti_df.json")

In [None]:
# Finding most similar passage to question
question = "What is the meaning of life"

question_embedding = model.encode(question)

sims = [util.dot_score(question_embedding, quote_embedding) for quote_embedding in df['Embedding']]

# Finding top 5 similarities and their indices
ind = np.argpartition(sims, -5)[-5:]

similar_sentences = [df['Quotes'][i] for i in ind]

  result = getattr(asarray(obj), method)(*args, **kwds)
  result = getattr(asarray(obj), method)(*args, **kwds)


In [None]:
print_df = pd.DataFrame(data = similar_sentences, columns=["Quotes"], index=range(1,6))
print_df

Unnamed: 0,Quotes
1,Living is not possible without dying. The two ...
2,Dying every day to everything that we know is ...
3,One observes the storms and crises that are mu...
4,"Living, love and death are one, not three sepa..."
5,"Death means coming to an end, coming to end of..."


In [None]:
print(max_sim, df['Quotes'][max_index])

tensor([[9.0123]]) Death means coming to an end, coming to end of all the things you are attached to: your Gods, beliefs, Church, culture, everything, and your relationship physically comes to an end. But there is always this longing, the hope that even though the physical organism comes to an end, there must be some element, some essence of me that will continue. And that fear that you may not continue must naturally affect your daily life. Therefore it is imperative, if one is at all serious, to understand what it means to live and to die.


In [None]:
# Getting the top 5 most simlar quotes

## Creating a MVP

In [None]:
# Loading in Language Model
my_model = "eliwill/distilgpt2-finetuned-final-project"
krishnamurti_generator = pipeline("text-generation",my_model)


Downloading config.json:   0%|          | 0.00/982 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/779k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/2.01M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

###### **Gradio Interface**
In this section, we will use Gradio's high-level `Interface` class to create the framework of our demo.

The interface class requires three parameters:
>
>1. fn = Function UI built around
2. Inputs = List of input elements
3. Outputs = List of output elements


**fn**

In [None]:
"""This function will return:
(1) An answer to the question with the syntax of Krishnamurti
(2) Dataframe containing related quotes of Krishnamurti"""

def ask_krishnamurti(question):
  answer = krishnamurti_generator(question)[0]['generated_text']
  list_of_quotes = get_similar_quotes(question)
  return answer, list_of_quotes

In [None]:
def get_similar_quotes(question):
  question_embedding = model.encode(question)
  sims = [util.dot_score(question_embedding, quote_embedding) for quote_embedding in df['Embedding']]
  ind = np.argpartition(sims, -5)[-5:]
  similar_sentences = [df['Quotes'][i] for i in ind]
  top5quotes = pd.DataFrame(data = similar_sentences, columns=["Quotes"], index=range(1,6))
  return top5quotes



**outputs**

In [None]:
demo = gr.Interface(fn=ask_krishnamurti, 
                    title="Ask Krishnamurti",
                    description="A Q/A language model that responds to queries using the syntax of Jiddu Krishnamurti",
                    inputs=[gr.Textbox(
                                        placeholder="Ask question here!", 
                                        lines=5, 
                                        max_lines=10,
                                        label="Question")],
                    
                    outputs=[gr.Textbox(
                        lines=3, 
                        max_lines=10,
                        label="Answer"
                    ),  
                        gr.DataFrame(
                        headers=["Quotes"],
                        max_rows=5,
                        interactive=False,
                        wrap=True)]
                    )

In [None]:
# Launching demo
demo.launch(inbrowser=True)

Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
Running on public URL: https://14858.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces: https://huggingface.co/spaces


(<gradio.routes.App at 0x7f8a86de1690>,
 'http://127.0.0.1:7865/',
 'https://14858.gradio.app')