In [1]:
import pandas as pd
import requests
import numpy as np
import faiss 
import ollama

In [None]:
# Load the raw book data from the CSV file into a DataFrame.
df = pd.read_csv('Jsons/books.csv')

# Next step: turn each row into a vector (the embedding process).

In [None]:
# Display the loaded DataFrame as this cell's output.
df

In [None]:
def textual_representation(row):
    """Render one book record as a labelled, multi-line text block.

    Args:
        row: Any mapping-like object (for example a DataFrame row) that can be
            indexed by the keys "title", "authors", "categories",
            "description", "published_year", "average_rating" and "num_pages".

    Returns:
        A string with one "Label: value" line per field, followed by a line
        of four spaces and a final newline.
    """
    labelled_columns = (
        ("Title", "title"),
        ("Authors", "authors"),
        ("Categories", "categories"),
        ("Description", "description"),
        ("Published Year", "published_year"),
        ("Rating", "average_rating"),
        ("Pages", "num_pages"),
    )
    field_lines = [f"{label}: {row[column]}" for label, column in labelled_columns]
    # The text ends with a whitespace-only line (four spaces) plus a newline;
    # it is kept exactly so the generated strings stay unchanged.
    return "\n".join(field_lines) + "\n    \n"

In [None]:
# .iloc selects by position and accepts both row and column slices, e.g. [:5, :3].
# .apply passes each row through the function when axis=1 (axis=0 would pass columns).
first_five_rows = df.iloc[:5]
preview_texts = first_five_rows.apply(textual_representation, axis=1)
# Print the second of the five generated texts as a sanity check.
print(preview_texts.values[1])



In [None]:
# Run every row of df through textual_representation, then store the
# resulting strings in a new column called "textual_representation".
row_texts = df.apply(textual_representation, axis=1)
df["textual_representation"] = row_texts

In [None]:
# Dimension of each embedding vector: an N-dimensional vector is just a list
# with N numbers inside. The model used below is expected to return 1024
# numbers per text; every response is checked against this value.
# FAISS = Facebook AI Similarity Search, a library built by Meta (Facebook)
# for very fast searching of vectors (embeddings).
# "L2" = the index uses L2 (Euclidean) distance to measure similarity.
dim = 1024
index = faiss.IndexFlatL2(dim)

# np.zeros creates an array filled with 0: one row per entry of
# df['textual_representation'] and one column per embedding dimension.
x = np.zeros((len(df['textual_representation']), dim), dtype='float32')

# A single Session reuses the HTTP connection instead of reconnecting for
# every row.
with requests.Session() as session:
    for i, representation in enumerate(df['textual_representation']):
        # POST to the api/embeddings route of the local model server, i.e.
        # ask the model: "please give me the embedding for this text".
        res = session.post('http://localhost:11434/api/embeddings',
                           json={
                               # Request payload: model name and text to embed.
                               'model': 'mxbai-embed-large',
                               'prompt': representation,
                           },
                           # Fail instead of hanging forever if the server stalls.
                           timeout=120)
        # Raise on HTTP errors here rather than hitting a confusing KeyError
        # on the missing "embedding" field below.
        res.raise_for_status()
        embeded = res.json()["embedding"]

        if len(embeded) != dim:
            raise ValueError(
                f"Embedding for row {i} has {len(embeded)} values, expected {dim}"
            )
        x[i] = np.asarray(embeded, dtype='float32')

# Add all row vectors to the index in one call.
index.add(x)

In [8]:
# Save the FAISS index to a file named "index".
faiss.write_index(index, "index")

In [9]:
# Load the FAISS index from the "index" file, replacing the in-memory `index`.
index = faiss.read_index("index")

In [None]:
# List the books whose title contains "Friends".
# na=False counts a missing title as "no match"; without it a NaN title makes
# the mask non-boolean and the row selection raises.
df[df["title"].str.contains("Friends", na=False)]

In [None]:
# Pick the row at position 4533 as the query ("favourite") book.
# NOTE(review): 4533 is hard-coded — presumably taken from the title search
# above; confirm it still points at the intended book if the CSV changes.
fav_book = df.iloc[4533]

In [None]:
# Display the selected row as this cell's output.
fav_book

In [None]:
# Print the generated text of the selected book.
print(fav_book['textual_representation'])

In [None]:
# Ask the local model server for the embedding of the selected book's text.
res = requests.post('http://localhost:11434/api/embeddings',
                    json={
                        # Request payload: model name and the text to embed.
                        'model': 'mxbai-embed-large',
                        'prompt': fav_book['textual_representation'],
                    },
                    # Fail instead of hanging forever if the server stalls.
                    timeout=120)
# Raise on HTTP errors in this cell, instead of a KeyError later when the
# "embedding" field is read from the response.
res.raise_for_status()

In [None]:
# Wrap the returned vector in an outer list so the array is a single-row
# float32 matrix rather than a flat vector.
query_vector = res.json()['embedding']
embedding = np.array([query_vector], dtype='float32')


In [None]:
# Query the index for the 10 entries closest to the query embedding.
# `I` holds row positions (it is used to index df['textual_representation']
# afterwards); `D` presumably holds the matching L2 distances — confirm
# against the FAISS documentation.
D, I = index.search(embedding, 10)


In [None]:
# Flatten the search result into a 1-D list of row positions, then pick the
# corresponding texts out of the full text column.
match_positions = I.ravel()
all_texts = np.array(df['textual_representation'])
best_matches = all_texts[match_positions]

In [None]:
# Print each matched text followed by a blank separator line.
for match in best_matches:
    print(match)
    # A bare `print` only names the function without calling it, so it
    # emitted nothing; `print()` actually writes the separator line.
    print()