In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import mlflow
import mlflow.sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np

In [0]:
# Obtain the session before its first use so this cell also works on a fresh
# kernel where no `spark` global has been injected by the runtime.
# getOrCreate() returns the already-active session when one exists.
spark = SparkSession.builder.getOrCreate()

# Select the default catalog and schema for subsequent unqualified table names.
spark.sql("USE CATALOG workspace")
spark.sql("USE SCHEMA med")

In [0]:
# Read the doc_chunks table, keeping only rows whose chunk_text is
# present (non-null) and non-empty.
has_text = F.col("chunk_text").isNotNull() & (F.length("chunk_text") > 0)
chunks_df = spark.table("workspace.med.doc_chunks").filter(has_text)

In [0]:
# Collect the selected columns into a pandas DataFrame.
selected_columns = ["chunk_id", "chunk_text", "source", "category", "title"]
chunks_pdf = chunks_df.select(*selected_columns).toPandas()

# Preview the first rows as the cell output.
chunks_pdf.head()

In [0]:
# Parallel lists taken from the same frame: chunk_ids[i] identifies texts[i].
chunk_ids, texts = (
    chunks_pdf[column].tolist() for column in ("chunk_id", "chunk_text")
)

In [0]:
# TF-IDF hyperparameters, gathered in one place for easy tuning.
tfidf_params = {
    "ngram_range": (1, 2),   # unigrams and bigrams
    "min_df": 2,             # ignore very rare terms
    "max_df": 0.8,           # ignore overly common terms
    "max_features": 50000,   # cap on vocabulary size
}
vectorizer = TfidfVectorizer(**tfidf_params)

# Learn the vocabulary from the chunk texts and vectorize them in one pass.
matrix = vectorizer.fit_transform(texts)

# Display the matrix dimensions as the cell output.
matrix.shape

In [0]:
# Nearest-neighbour search index over the TF-IDF matrix.
knn_index = NearestNeighbors(
    algorithm="brute",  # brute force: compare the query vector to all vectors
    metric="cosine",    # cosine distance
    n_neighbors=10,     # default number of neighbours per query
    n_jobs=1,
)

# fit() returns the estimator itself; assigning it keeps the cell output empty.
knn = knn_index.fit(matrix)