In [None]:
import sys
sys.path.append("../../digitech_classify")

import numpy as np
import pandas as pd
import faiss
from sklearn.preprocessing import normalize

from digitech_classify.config import INTERIM_DATA_DIR, PROCESSED_DATA_DIR
from digitech_classify.features import build_faiss_index, search_top_k, build_keyword_tag_df



In [None]:
# Load the precomputed company embeddings together with their organisation IDs.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only point this at archives produced by this project.
with np.load(INTERIM_DATA_DIR / "company_embeddings_all-MiniLM-L6-v2.npz", allow_pickle=True) as comp_data:
    # Indexing an NpzFile reads the array into memory, so both arrays stay
    # usable after the archive's file handle is closed on leaving the block.
    # FAISS routines expect C-contiguous float32 input; ascontiguousarray is a
    # no-op (no copy) when the stored array already has that layout and dtype.
    company_vectors = np.ascontiguousarray(comp_data["embeddings"], dtype=np.float32)
    org_ids = comp_data["org_ID"]

# Sanity check: dtype and contiguity are guaranteed above; the shape is printed for inspection.
print(company_vectors.dtype, company_vectors.shape, company_vectors.flags['C_CONTIGUOUS'])


In [None]:

# Load the precomputed keyword embeddings together with the keyword strings.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only point this at archives produced by this project.
with np.load(INTERIM_DATA_DIR / "keywords_embeddings_all-MiniLM-L6-v2.npz", allow_pickle=True) as kw_data:
    # Indexing an NpzFile reads the array into memory, so both arrays stay
    # usable after the archive's file handle is closed on leaving the block.
    # FAISS routines expect C-contiguous float32 input; ascontiguousarray is a
    # no-op (no copy) when the stored array already has that layout and dtype.
    keyword_vectors = np.ascontiguousarray(kw_data["embeddings"], dtype=np.float32)
    keyword_texts = kw_data["keywords"]

# Sanity check: dtype and contiguity are guaranteed above; the shape is printed for inspection.
print(keyword_vectors.dtype, keyword_vectors.shape, keyword_vectors.flags['C_CONTIGUOUS'])

In [None]:
# L2-normalise both embedding matrices. The return value is unused because
# faiss.normalize_L2 rewrites the array it is given.
# NOTE(review): presumably done so that inner-product search behaves as cosine
# similarity -- confirm against build_faiss_index.
for vectors in (company_vectors, keyword_vectors):
    faiss.normalize_L2(vectors)

In [None]:

# Build the search index over the keyword embeddings with the project helper.
# NOTE(review): the index type and metric are chosen inside build_faiss_index -- confirm there.
keyword_index = build_faiss_index(keyword_vectors)

In [None]:
# Search settings, named so they can be tuned in one place.
TOP_K = 5
SEARCH_BATCH_SIZE = 10000

# Query the keyword index with every company vector.
# NOTE(review): presumably D holds similarity scores and I the matching keyword
# row indices (the usual FAISS naming) -- confirm in search_top_k.
D, I = search_top_k(
    keyword_index,
    company_vectors,
    top_k=TOP_K,
    batch_size=SEARCH_BATCH_SIZE,
)
print("Similarity search shapes:", D.shape, I.shape)   # num_companies and TOP_K results

In [None]:
# Similarity cut-off handed to build_keyword_tag_df.
# NOTE(review): presumably matches scoring below it are discarded -- confirm in the helper.
SIM_THRESHOLD = 0.6

# Turn the raw search results into a table of keyword tags.
tagged_df = build_keyword_tag_df(D, I, org_ids, keyword_texts, sim_threshold=SIM_THRESHOLD)

# A bare last expression lets Jupyter render the preview as a rich table
# instead of the plain-text dump produced by print().
tagged_df.head()

In [None]:
# (rows, columns) of the tag table, shown as the cell's output.
tagged_df.shape

In [None]:
# Only these columns are read from the cleaned company text file.
DESCRIPTION_COLUMNS = ['org_ID', 'organisation_name', 'search_text']

company_path = INTERIM_DATA_DIR / "cleaned_companies_text.csv"
descriptions_df = pd.read_csv(company_path, usecols=DESCRIPTION_COLUMNS)

# Show which columns were actually loaded.
print(list(descriptions_df.columns))

In [None]:
# Attach the company name and text to each tag row, keeping only org_IDs that
# appear in both tables (inner join).
# This cell reassigns tagged_df from itself, so running it a second time would
# merge again and produce *_x / *_y duplicate columns. The guard below skips
# the merge once every description column is already present.
missing_description_cols = descriptions_df.columns.difference(tagged_df.columns)
if len(missing_description_cols) > 0:
    tagged_df = tagged_df.merge(descriptions_df, on='org_ID', how='inner')

In [None]:
# Write the tagged companies to the processed-data directory as CSV.
save_path = PROCESSED_DATA_DIR / "company_tagged_all-MiniLM-L6-v2.csv"

# to_csv does not create missing directories, so make sure the target exists
# (e.g. on a fresh checkout). Assumes PROCESSED_DATA_DIR is a pathlib.Path.
save_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame's row index out of the file.
tagged_df.to_csv(save_path, index=False)