In [None]:
import sys
sys.path.append("../../digitech_classify")

import numpy as np
import pandas as pd
import faiss
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt 
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D


from digitech_classify.pipeline.config import INTERIM_DATA_DIR, PROCESSED_DATA_DIR, DATA_DIR
from digitech_classify.pipeline.data_engineering.features import build_faiss_index, search_top_k, build_keyword_tag_df, apply_sector_threshold, pool_embeddings



In [None]:
# --- Company descriptions and the curated keyword sheet -------------------------------
company_path = INTERIM_DATA_DIR / "cleaned_companies_text.csv"
descriptions_df = pd.read_csv(company_path, usecols=['org_ID', 'organisation_name', 'search_text'])

kw_df = pd.read_excel(DATA_DIR / "keywords_combined_digital/Keywords_Combined_v2.xlsx", sheet_name="Sheet1")

# --- Pre-computed embeddings ------------------------------------------------------------
# SECURITY: allow_pickle=True lets np.load unpickle object arrays, which can execute
# arbitrary code. Only point these loads at .npz files produced by this project.
# The `with` blocks close the underlying archives once the arrays have been read into
# memory, instead of leaving the file handles open for the lifetime of the kernel.
with np.load(PROCESSED_DATA_DIR / "company_embeddings_mpnet.npz", allow_pickle=True) as comp_data:
    company_vectors = comp_data["embeddings"]
    org_ids = comp_data["org_ID"]

# Report the layout exactly as stored on disk (before any conversion below).
print(company_vectors.dtype, company_vectors.shape, company_vectors.flags['C_CONTIGUOUS'])

with np.load(INTERIM_DATA_DIR / "keywords_semantic_all-mpnet-base-v2.npz", allow_pickle=True) as kw_data:
    keyword_vectors = kw_data["embeddings"]
    keyword_texts = kw_data["keywords"]

print(keyword_vectors.dtype, keyword_vectors.shape, keyword_vectors.flags['C_CONTIGUOUS'])

# faiss.normalize_L2 rescales rows in place and expects float32, C-contiguous input.
# Enforce that layout rather than only printing it; np.ascontiguousarray returns the
# same array (no copy) when the data already has the required dtype and layout.
company_vectors = np.ascontiguousarray(company_vectors, dtype=np.float32)
keyword_vectors = np.ascontiguousarray(keyword_vectors, dtype=np.float32)

faiss.normalize_L2(company_vectors)
faiss.normalize_L2(keyword_vectors)
# NOTE(review): with unit-length rows an inner-product index yields cosine similarity;
# confirm which metric build_faiss_index actually uses.
keyword_index = build_faiss_index(keyword_vectors)

In [None]:
# Value passed as `top_k` to the similarity search below.
TOP_K = 20
# Query the keyword index with all company vectors (batch_size=10000 is forwarded to the helper).
# NOTE(review): D is presumably the similarity scores and I the matching keyword row
# indices (the usual FAISS naming) — confirm against search_top_k.
D, I = search_top_k(keyword_index, company_vectors, top_k=TOP_K, batch_size=10000)
print("Similarity search shapes:", D.shape, I.shape)   # both expected to be (num_companies, TOP_K)

In [None]:
# Turn the search output into a tag table, passing a similarity threshold of 0.4.
# Later cells read the 'org_ID', 'keyword' and 'similarity' columns of this frame.
# NOTE(review): how sim_threshold is applied is decided inside build_keyword_tag_df — confirm.
tagged_df = build_keyword_tag_df(D, I, org_ids, keyword_texts, sim_threshold=0.4)
print(tagged_df.head())
# Last expression of the cell, so the shape is rendered as the cell output.
tagged_df.shape

In [None]:
# Count the distinct org_ID values present in the tag table (missing IDs are not counted).
unique_companies = tagged_df['org_ID'].nunique()
print("Number of unique companies:", unique_companies)

In [None]:
# Attach each company's name and search text to its tag rows. The inner join keeps only
# tag rows whose org_ID also appears in descriptions_df.
# NOTE: this rebinds tagged_df, so running the cell twice joins the description columns
# a second time (pandas then suffixes the duplicated column names with _x/_y).
tagged_df = pd.merge(
    tagged_df,
    descriptions_df,
    how='inner',
    on='org_ID',
)
print(tagged_df.shape)

In [None]:
# Clean the curated keyword sheet in one pass:
#   1. lower-case and trim the 'Keyword' labels,
#   2. keep only rows whose 'yes/no' flag is exactly 'yes',
#   3. drop the flag column,
#   4. lower-case and trim the 'sector' labels.
# NOTE: because 'yes/no' is dropped, re-running this cell without reloading kw_df
# raises a KeyError.
kw_df = (
    kw_df
    .assign(Keyword=lambda frame: frame['Keyword'].astype(str).str.strip().str.lower())
    .loc[lambda frame: frame['yes/no'] == 'yes']
    .drop(columns=['yes/no'])
    .assign(sector=lambda frame: frame['sector'].astype(str).str.strip().str.lower())
)

# Map each 'semantic search' phrase to its sector (a repeated phrase keeps its last sector).
kw_map = {
    phrase: sector
    for phrase, sector in zip(kw_df['semantic search'], kw_df['sector'])
}
# One sector label per embedded keyword text; texts absent from the sheet become 'other'.
sectors = [kw_map.get(text, 'other') for text in keyword_texts]
unique_sectors = sorted(set(sectors))
# Empty placeholders; both names are rebound when pool_embeddings is called later on.
sector_vectors, sector_names = [], []

In [None]:
# Pool the keyword embeddings using the per-keyword sector labels built from keyword_texts.
# NOTE(review): presumably returns one pooled vector per sector plus the matching sector
# names, and assumes `sectors` is aligned row-for-row with `keyword_vectors` — confirm
# against pool_embeddings.
sector_vectors, sector_names = pool_embeddings(keyword_vectors, sectors)

In [None]:
# Enrich every tag row with the curated keyword label and sector by matching the tagged
# 'keyword' text against the sheet's 'semantic search' phrase. The left join keeps tag
# rows that have no match in the sheet (their 'Keyword'/'sector' values are missing).
tagged_df = pd.merge(
    tagged_df,
    kw_df.loc[:, ['semantic search', 'Keyword', 'sector']],
    how='left',
    left_on='keyword',
    right_on='semantic search',
)
print(tagged_df.shape)

In [None]:
# The raw 'keyword' column duplicates 'semantic search' after the merge, so remove it.
# errors='ignore' keeps this cell re-runnable: once the column is gone, running the cell
# again is a no-op instead of raising KeyError.
tagged_df = tagged_df.drop(columns=['keyword'], errors='ignore')

In [None]:
# Number of tag rows per sector, as a two-column table ('sector', 'count').
sector_counts_table = (
    tagged_df['sector']
    .value_counts()
    .rename_axis('sector')
    .reset_index(name='count')
)
print(sector_counts_table)

# Number of distinct companies tagged in each sector ('sector', 'unique_company_count').
sector_company = (
    tagged_df
    .groupby('sector')['org_ID']
    .nunique()
    .reset_index(name='unique_company_count')
)
print(sector_company)

In [None]:
# Evaluate apply_sector_threshold once per row and keep only the rows it accepts.
passes_threshold = tagged_df.apply(apply_sector_threshold, axis=1)
tagged_df = tagged_df[passes_threshold]

# Tag rows remaining per sector after the filter.
sector_counts_post_filter = tagged_df['sector'].value_counts()
print(sector_counts_post_filter)

In [None]:
# Keep the first row for each (company, sector) pair and renumber the index from 0.
tagged_df_unique = tagged_df.drop_duplicates(
    subset=['org_ID', 'sector'],
    ignore_index=True,
)

In [None]:
# Number of distinct sectors each company (org_ID) is tagged with, indexed by org_ID.
sector_counts = tagged_df_unique.groupby('org_ID')['sector'].nunique()
print(sector_counts)


In [None]:
# Rows of companies that are tagged in more than one sector.
# The per-company distinct-sector count is computed as a row-aligned mask, so no helper
# column has to be added and removed again.
multi_sector_companies = (
    tagged_df_unique
    .drop_duplicates(subset=['org_ID', 'sector'])   # one row per (company, sector) pair
    .loc[lambda rows: rows.groupby('org_ID')['sector'].transform('nunique').gt(1)]
    .reset_index(drop=True)
)

In [None]:
# Mean similarity of the tag rows in each sector, highest first.
avg_similarity_per_sector = (
    tagged_df
    .groupby('sector')['similarity']
    .mean()
    .reset_index()
    .sort_values(by='similarity', ascending=False)
)
print(avg_similarity_per_sector)

In [None]:
# Output locations: one workbook for all (company, sector) tags, one for the subset of
# companies that fall into several sectors.
save_path_1 = PROCESSED_DATA_DIR / "company_tagged_mpnet.xlsx"
save_path_2 = PROCESSED_DATA_DIR / "company_tagged_mpnet_multi_sector.xlsx"

# Write each frame to its workbook without the DataFrame index, in the same order as before.
for frame, destination in (
    (tagged_df_unique, save_path_1),
    (multi_sector_companies, save_path_2),
):
    frame.to_excel(destination, index=False)