In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os

# Move the working directory one level above the directory the kernel started
# in. The starting directory is recorded the first time this cell runs, so
# re-running the cell always lands in the same place instead of climbing one
# more level on every execution.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir))

In [None]:
import polars as pl
from dotenv import load_dotenv
from pypi_scout.config import Config
from pypi_scout.data.description_cleaner import DescriptionCleaner, CLEANING_FAILED
from pypi_scout.data.reader import DataReader
from sentence_transformers import SentenceTransformer
from pypi_scout.vector_database import VectorDatabaseInterface

# Load variables from a .env file, then build the project configuration
# (presumably Config reads its settings from the environment -- confirm).
load_dotenv()
config = Config()

# Read the processed dataset and instantiate the sentence-embedding model.
dataset_path = config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME
df = pl.read_csv(dataset_path)
model = SentenceTransformer(config.EMBEDDINGS_MODEL_NAME)

# Build the vector database client from the Pinecone settings in the config,
# handing it the embedding model created above.
vector_database_interface = VectorDatabaseInterface(
    pinecone_token=config.PINECONE_TOKEN,
    pinecone_index_name=config.PINECONE_INDEX_NAME,
    embeddings_model=model,
    pinecone_namespace=config.PINECONE_NAMESPACE,
)

In [None]:
# Preview the first ten rows; string columns are rendered with up to 100
# characters for the duration of the context manager only.
preview_rows = df.head(10)
with pl.Config(fmt_str_lengths=100):
    display(preview_rows)

In [None]:
# Free-text search query passed to the vector database lookup below.
query = "find unused packages"

In [None]:
# Fetch the 100 nearest packages for the query, attach the dataset columns for
# each match by package name (left join keeps every match), and order the
# result from most to least similar.
df_matches = (
    vector_database_interface.find_similar(query, top_k=100)
    .join(df, how="left", on="name")
    .sort("similarity", descending=True)
)

In [None]:
# Show the matches as ordered by similarity, before any popularity weighting.
df_matches

In [None]:
# Relative weight of each signal in the combined score.
SIMILARITY_WEIGHT = 0.5
WEEKLY_DOWNLOADS_WEIGHT = 0.5


def _normalized_rank(frame: pl.DataFrame, rank_column: str) -> pl.Expr:
    """Return an expression scaling a dense-rank column linearly onto [0, 1].

    Rank 1 maps to 0.0 and the highest rank in ``frame`` maps to 1.0.

    Args:
        frame: DataFrame that already contains ``rank_column``.
        rank_column: Name of a column holding dense ranks (starting at 1).

    Returns:
        A polars expression producing the normalised rank. When the frame is
        empty, or every row shares the same rank, there is no spread to
        normalise, so a constant 0.0 is returned instead of dividing by zero.
        Rows whose rank is null also receive 0.0, so they never end up with a
        null score.
    """
    max_rank = frame[rank_column].max()
    # max() is None for an empty or all-null column; a max of 1 means all tied.
    if max_rank is None or max_rank <= 1:
        return pl.lit(0.0)
    return ((pl.col(rank_column) - 1) / (max_rank - 1)).fill_null(0.0)


# Dense-rank both signals in ascending order, so a higher rank means a more
# similar / more downloaded package.
df_matches = df_matches.with_columns(
    rank_similarity=pl.col("similarity").rank("dense", descending=False),
    rank_weekly_downloads=pl.col("weekly_downloads").rank("dense", descending=False),
)

# Bring both ranks onto the same 0..1 scale so they can be blended.
df_matches = df_matches.with_columns(
    normalized_similarity=_normalized_rank(df_matches, "rank_similarity"),
    normalized_weekly_downloads=_normalized_rank(df_matches, "rank_weekly_downloads"),
)

# Weighted blend of the two normalised ranks.
df_matches = df_matches.with_columns(
    score=SIMILARITY_WEIGHT * pl.col("normalized_similarity")
    + WEEKLY_DOWNLOADS_WEIGHT * pl.col("normalized_weekly_downloads")
)

# Sort the DataFrame by the combined score in descending order
df_matches = df_matches.sort("score", descending=True)

In [None]:
# The scoring cell already stores df_matches sorted by the combined score in
# descending order, so display it directly instead of sorting a second time.
df_matches