### Installing the Necessary Libraries

In [1]:
!pip install -qq datasets
!pip install -qq faiss-gpu

### Importing all the libraries

In [3]:
import numpy as np 
import pandas as pd 
from huggingface_hub import hf_hub_url
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel
import torch
from datasets import load_dataset

### Data Preprocessing

In [4]:

# Read the episode table, discard the "Unnamed: 0" column and every row that
# has a missing value, then merge title and captions into one "full" text
# column (formatted as "<title>. <captions>") and drop the two source columns.
df = (
    pd.read_csv("/input/lexallepisodes/lexAllEpisodes.csv")
    .drop(columns=["Unnamed: 0"])
    .dropna()
)
df = df.assign(full=df["title"] + ". " + df["captions"]).drop(
    columns=["captions", "title"]
)

In [5]:
# Wrap the cleaned DataFrame in a Hugging Face `Dataset`.
# NOTE(review): the output lists an '__index_level_0__' feature next to 'full';
# presumably this is the DataFrame index carried over after dropna() — confirm.
datasetndf = Dataset.from_pandas(df)
datasetndf

Dataset({
    features: ['full', '__index_level_0__'],
    num_rows: 317
})

### Helper Functions

In [6]:
def concatenate_text(datasetndf):
    """Build the ``text`` field for one example.

    Args:
        datasetndf: Mapping that exposes a ``"full"`` entry.

    Returns:
        dict: A single-key dict, ``{"text": <value of "full">}``.
    """
    full_text = datasetndf["full"]
    return dict(text=full_text)

def get_embeddings(text_list):
    """Embed text using the notebook-level ``tokenizer`` and ``model``.

    Args:
        text_list: Text (a string or a list of strings) passed straight to the
            tokenizer, which pads, truncates, and returns PyTorch tensors.

    Returns:
        The result of ``cls_pooling`` on the model output: one vector per
        input sequence, living on ``device`` and carrying no autograd history.
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    # Move every tokenizer tensor to the same device as the model.
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: disabling autograd avoids building (and holding on to)
    # a gradient graph for every call. Callers that still invoke `.detach()`
    # on the result keep working unchanged.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)

def cls_pooling(model_output):
    """Select the first-position hidden state of every sequence.

    Args:
        model_output: Object exposing ``last_hidden_state``, indexable as
            ``[:, 0]``.

    Returns:
        ``last_hidden_state`` sliced at index 0 along its second axis.
    """
    hidden_states = model_output.last_hidden_state
    first_position = hidden_states[:, 0]
    return first_position



In [7]:
# Run concatenate_text over the dataset, adding a "text" column that mirrors
# the "full" column.
maindataset = datasetndf.map(concatenate_text)

  0%|          | 0/317 [00:00<?, ?ex/s]

In [8]:
# Load the pretrained sentence-embedding checkpoint and its tokenizer.
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Smoke test: embed the first episode only. Calling str() on the whole "full"
# column would turn the list of all episodes into a single list-repr string,
# which is then embedded (after truncation) as if it were one document.
embedding = get_embeddings([datasetndf[0]["full"]])

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

In [9]:
# Sanity check: show the dimensions of the sample embedding tensor.
embedding.shape

torch.Size([1, 768])

In [10]:
def _embed_example(example):
    """Return the embedding of one example's "full" text as a NumPy vector."""
    vectors = get_embeddings(str(example["full"]))
    # One text in, so only the first row of the result is needed; detach it
    # and move it to host memory before converting to NumPy.
    return {"embeddings": vectors.detach().cpu().numpy()[0]}


# Add an "embeddings" column holding one vector per episode.
embeddings_dataset = maindataset.map(_embed_example)

  0%|          | 0/317 [00:00<?, ?ex/s]

In [11]:
# Attach a FAISS index over the "embeddings" column; the query cells below
# use it for nearest-neighbour lookups.
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['full', '__index_level_0__', 'text', 'embeddings'],
    num_rows: 317
})

In [12]:
# Query the index: embed the question and fetch its five nearest episodes.
question = "Reinforcement Learning"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

# Tabulate the hits with their scores, lowest score first.
samples_df = (
    pd.DataFrame.from_dict(samples)
    .assign(scores=scores)
    .sort_values("scores", ascending=True)
)

# Print each hit: its score, then the text of "full" up to the first period.
for _, row in samples_df.iterrows():
    print(
        f"SCORE: {row.scores}",
        f"TITLE: {row.full.split('.')[0]}",
        "=" * 50,
        "",
        sep="\n",
    )

SCORE: 33.8927001953125
TITLE: Leslie Kaelbling: Reinforcement Learning, Planning, and Robotics | Lex Fridman Podcast #15

SCORE: 34.33152389526367
TITLE: Michael Littman: Reinforcement Learning and the Future of AI | Lex Fridman Podcast #144

SCORE: 35.982906341552734
TITLE: Sergey Levine: Robotics and Machine Learning | Lex Fridman Podcast #108

SCORE: 38.34904479980469
TITLE: Pieter Abbeel: Deep Reinforcement Learning | Lex Fridman Podcast #10

SCORE: 38.94475555419922
TITLE: David Silver: AlphaGo, AlphaZero, and Deep Reinforcement Learning | Lex Fridman Podcast #86



In [13]:
# Query the index: embed the question and fetch its five nearest episodes.
question = "Chess"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

# Tabulate the hits with their scores, lowest score first.
samples_df = (
    pd.DataFrame.from_dict(samples)
    .assign(scores=scores)
    .sort_values("scores", ascending=True)
)

# Print each hit: its score, then the text of "full" up to the first period.
for _, row in samples_df.iterrows():
    print(
        f"SCORE: {row.scores}",
        f"TITLE: {row.full.split('.')[0]}",
        "=" * 50,
        "",
        sep="\n",
    )

SCORE: 35.87500762939453
TITLE: Garry Kasparov: Chess, Deep Blue, AI, and Putin | Lex Fridman Podcast #46

SCORE: 41.47279739379883
TITLE: Stuart Russell: Long-Term Future of Artificial Intelligence | Lex Fridman Podcast #9

SCORE: 41.61048126220703
TITLE: Botez Sisters: Chess, Streaming, and Fame | Lex Fridman Podcast #319

SCORE: 41.697288513183594
TITLE: Magnus Carlsen: Greatest Chess Player of All Time | Lex Fridman Podcast #315

SCORE: 42.767642974853516
TITLE: Liv Boeree: Poker, Game Theory, AI, Simulation, Aliens & Existential Risk | Lex Fridman Podcast #314



In [14]:
# Query the index: embed the question and fetch its five nearest episodes.
question = "Vaccines"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

# Tabulate the hits with their scores, lowest score first.
samples_df = (
    pd.DataFrame.from_dict(samples)
    .assign(scores=scores)
    .sort_values("scores", ascending=True)
)

# Print each hit: its score, then the text of "full" up to the first period.
for _, row in samples_df.iterrows():
    print(
        f"SCORE: {row.scores}",
        f"TITLE: {row.full.split('.')[0]}",
        "=" * 50,
        "",
        sep="\n",
    )

SCORE: 30.132251739501953
TITLE: Vincent Racaniello: Viruses and Vaccines | Lex Fridman Podcast #216

SCORE: 41.365169525146484
TITLE: Albert Bourla: Pfizer CEO | Lex Fridman Podcast #249

SCORE: 46.23126983642578
TITLE: Dmitry Korkin: Computational Biology of Coronavirus | Lex Fridman Podcast #90

SCORE: 46.48963165283203
TITLE: Manolis Kellis: Biology of Disease | Lex Fridman Podcast #133

SCORE: 49.99861145019531
TITLE: Michael Mina: Rapid COVID Testing | Lex Fridman Podcast #235



In [15]:
# Query the index: embed the question and fetch its five nearest episodes.
question = "GAN"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

# Tabulate the hits with their scores, lowest score first.
samples_df = (
    pd.DataFrame.from_dict(samples)
    .assign(scores=scores)
    .sort_values("scores", ascending=True)
)

# Print each hit: its score, then the text of "full" up to the first period.
for _, row in samples_df.iterrows():
    print(
        f"SCORE: {row.scores}",
        f"TITLE: {row.full.split('.')[0]}",
        "=" * 100,
        "",
        sep="\n",
    )

SCORE: 45.31462478637695
TITLE: Ian Goodfellow: Generative Adversarial Networks (GANs) | Lex Fridman Podcast #19

SCORE: 54.730838775634766
TITLE: Jitendra Malik: Computer Vision | Lex Fridman Podcast #110

SCORE: 57.5673942565918
TITLE: Greg Brockman: OpenAI and AGI | Lex Fridman Podcast #17

SCORE: 57.982486724853516
TITLE: Rajat Monga: TensorFlow | Lex Fridman Podcast #22

SCORE: 58.483333587646484
TITLE: Ben Goertzel: Artificial General Intelligence | Lex Fridman Podcast #103

