In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
from sentence_transformers import SentenceTransformer
import torch
import json
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

In [None]:
def get_device():
    """Pick the torch device used to run the embedding model.

    Preference order is CUDA, then the MPS backend, then CPU.

    Returns:
        torch.device: ``cuda``, ``mps`` or ``cpu``.
    """
    # ``is_available`` is a function and has to be *called*: the bare function
    # object is always truthy, which would select an accelerator even on
    # machines that do not have one.
    if torch.cuda.is_available():
        return torch.device("cuda")

    # Look the backend up defensively so a torch build without an ``mps``
    # module falls through to CPU instead of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")

    return torch.device("cpu")

In [None]:
def read_Lightcast_catalogue(data_path):
    """Load a Lightcast catalogue from a JSON file.

    The file may hold either a regular JSON document or a JSON document that
    was serialised twice (i.e. the file contains a JSON *string* whose text is
    itself JSON). Both forms decode to the same Python object.

    Args:
        data_path: Path of the JSON file to read.

    Returns:
        The decoded catalogue.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file (or the embedded string) is not valid JSON.
    """
    with open(data_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Only decode a second time when the first pass produced a string;
    # calling json.loads on an already-decoded dict/list raises TypeError.
    if isinstance(data, str):
        data = json.loads(data)
    return data

In [None]:
def make_vectors(Lightcast_data, model):
    """Embed the ``"name"`` of every catalogue entry.

    Args:
        Lightcast_data: Iterable of dicts, each providing a ``"name"`` key.
        model: Object exposing ``encode(list_of_str, show_progress_bar=...)``.

    Returns:
        Whatever ``model.encode`` returns for the list of names, in input order.
    """
    entry_names = []
    for entry in Lightcast_data:
        entry_names.append(entry["name"])

    # The progress bar stays on: encoding a full catalogue can take a while.
    return model.encode(entry_names, show_progress_bar=True)

In [None]:
def vectorize_data(Lightcast_data, collection_name, model):
    """Embed catalogue entries and upload them to a recreated Qdrant collection.

    Args:
        Lightcast_data: Sequence of dicts. Every entry needs ``"name"`` and
            ``"id"``; entries for a skills collection also need ``"infoUrl"``.
        collection_name: Target collection. It must contain ``"skills"`` or
            ``"jobs"``, which decides the fields stored in the payload
            (``"skills"`` wins when both substrings are present).
        model: Embedding model; used through ``make_vectors`` and queried for
            its embedding size via ``get_sentence_embedding_dimension``.

    Raises:
        ValueError: If ``collection_name`` contains neither ``"skills"`` nor
            ``"jobs"``.
        KeyError: If an entry lacks one of the required payload fields.
    """
    if "skills" in collection_name:
        payload_fields = ("name", "id", "infoUrl")
    elif "jobs" in collection_name:
        payload_fields = ("name", "id")
    else:
        # Fail before any work is done instead of hitting an unbound
        # ``payload`` after the collection has already been recreated.
        raise ValueError(
            f"Cannot derive payload fields for collection {collection_name!r}: "
            "the name must contain 'skills' or 'jobs'"
        )

    # Build the payload first so a malformed entry (missing key) is reported
    # before the expensive encoding and before the collection is replaced.
    payload = [
        {field: datum[field] for field in payload_fields}
        for datum in Lightcast_data
    ]

    qdrant_client = QdrantClient(os.getenv("QDRANT_URL"))

    vectors = make_vectors(Lightcast_data, model)

    # NOTE(review): recreate_collection replaces an existing collection of the
    # same name; newer qdrant-client releases appear to mark it as deprecated
    # in favour of collection_exists/create_collection - confirm against the
    # installed version before switching.
    qdrant_client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=model.get_sentence_embedding_dimension(),
                                    distance=Distance.COSINE)
    )

    qdrant_client.upload_collection(
        collection_name=collection_name,
        vectors=vectors,
        payload=payload,
        ids=None,
        batch_size=1024
    )


In [None]:
# Load the embedding model named by the EMBEDDING_MODEL environment variable
# onto the device chosen by get_device().
model = SentenceTransformer(
    os.environ.get("EMBEDDING_MODEL"),
    device=get_device(),
)

## Vectorize skills and jobs

In [None]:
def _vectorize_catalogue(kind, data_env_var):
    """Load one Lightcast catalogue and upload its embeddings to Qdrant.

    Args:
        kind: Label used in the progress messages (``"skills"`` or ``"jobs"``).
        data_env_var: Name of the environment variable holding the path of
            the catalogue JSON file.

    Raises:
        RuntimeError: If ``data_env_var`` or ``EMBEDDING_MODEL`` is not set.
    """
    print(f"Vectorizing {kind}")

    data_path = os.getenv(data_env_var)
    if not data_path:
        raise RuntimeError(f"Environment variable {data_env_var} is not set")
    embedding_model = os.getenv("EMBEDDING_MODEL")
    if not embedding_model:
        raise RuntimeError("Environment variable EMBEDDING_MODEL is not set")

    entries = read_Lightcast_catalogue(data_path)["data"]

    # "org/model" -> "model". A name without an organisation prefix is used
    # as-is instead of raising IndexError.
    model_parts = embedding_model.split('/')
    model_name = model_parts[1] if len(model_parts) > 1 else model_parts[0]

    # basename also understands backslash separators on Windows; the text
    # before the first dot of the file name identifies the dataset.
    dataset_name = os.path.basename(data_path).split('.')[0]

    collection_name = '_'.join([dataset_name, model_name])
    print(f"""Vectorizing {kind} to collection {collection_name}""")
    vectorize_data(entries, collection_name, model)


def main(skills=True, jobs=False):
    """Vectorize the selected Lightcast catalogues into Qdrant collections.

    Each collection is named ``<data file stem>_<embedding model name>``.
    The data file paths come from the ``DATA_SKILLS`` / ``DATA_JOBS``
    environment variables and the model name from ``EMBEDDING_MODEL``.

    Args:
        skills: Whether to process the skills catalogue.
        jobs: Whether to process the jobs catalogue.
    """
    if skills:
        _vectorize_catalogue("skills", "DATA_SKILLS")

    if jobs:
        _vectorize_catalogue("jobs", "DATA_JOBS")


In [None]:
# Run the pipeline for both catalogues: skills first, then jobs.
main(skills=True, jobs=True)