In [None]:
import os 
from dotenv import load_dotenv
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
from import_keywords import clean_text
import numpy as np 

In [None]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Root directory for every data file read or written by this notebook.
DATA_PATH = os.getenv("DATA_DIR")
if DATA_PATH is None:
    # Fail here with an actionable message: os.getenv returns None for an unset
    # variable, and later path construction would otherwise die with an opaque
    # "unsupported operand type(s) for +: 'NoneType' and 'str'".
    raise RuntimeError(
        "Environment variable DATA_DIR is not set; "
        "define it in the environment or in a .env file."
    )
print(f"DATA_PATH: {DATA_PATH}")

In [None]:
# Read the keyword sheet, keep only the rows flagged 'yes', and normalise the
# 'Keyword' column (string, stripped, lower-cased) exactly once.
# Built as one chain so the result is a fresh frame rather than a filtered
# slice that is assigned into afterwards (avoids SettingWithCopyWarning).
keywords_df = (
    pd.read_excel(
        DATA_PATH + "/keywords_combined_digital/Keywords_Combined.xlsx",
        sheet_name="Sheet1",
    )
    .loc[lambda frame: frame['yes/no'] == 'yes']
    .drop(columns=['yes/no', 'Subcluster', 'Cluster'])
    # Drop missing keywords before casting: astype(str) would otherwise turn
    # NaN into the literal keyword "nan".
    .dropna(subset=['Keyword'])
    .assign(
        Keyword=lambda frame: frame['Keyword'].astype(str).str.strip().str.lower()
    )
)


# Read only the columns needed for text matching; the id and text columns are
# forced to pandas' nullable string dtype.
companies_df = pd.read_csv(
    DATA_PATH + "/cb_net0_companies_concat.csv",
    usecols=['org_ID', 'organisation_name', 'short_description', 'description', 'data_source'],
    dtype={
        'org_ID': 'string',
        'organisation_name': 'string',
        'short_description': 'string',
        'description': 'string',
    },
    index_col=False,
)

# Exclude rows whose data_source is 'net0' (rows with a missing data_source are
# kept, since NaN != 'net0' evaluates to True).
# .copy() makes the result an independent frame, so the later column
# assignment / drop on it does not raise SettingWithCopyWarning.
companies_df = companies_df[companies_df['data_source'] != 'net0'].copy()

print(companies_df.shape)

In [None]:
# Concatenate both description fields (missing values become empty strings),
# separated by a single space.
combined_descriptions = (
    companies_df['short_description'].fillna('')
    + ' '
    + companies_df['description'].fillna('')
)

# Build the normalised search text and drop the raw description columns.
# assign/drop return a new frame instead of mutating the filtered frame in
# place, which avoids chained-assignment warnings and `inplace=True`.
companies_df = companies_df.assign(
    search_text=(
        combined_descriptions
        .str.lower()
        # Replace every character that is neither a word character nor
        # whitespace with a space (underscores count as word characters).
        .str.replace(r'[^\w\s]', ' ', regex=True)
        # Collapse runs of whitespace into one space, then trim the ends.
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )
).drop(columns=['short_description', 'description'])

In [None]:
nlp = spacy.load('en_core_web_md')

# Embed each distinct keyword; nlp.pipe processes the texts in batches instead
# of invoking the pipeline once per keyword.
keyword_texts = keywords_df['Keyword'].unique()
keyword_docs = list(nlp.pipe(list(keyword_texts)))

# Keep only keywords with a non-zero vector. Texts and vectors are filtered in
# a single pass so row i of keyword_vectors always belongs to keyword_texts[i].
embedded_keywords = [
    (kw, doc.vector)
    for kw, doc in zip(keyword_texts, keyword_docs)
    if doc.has_vector and doc.vector_norm > 0
]
if not embedded_keywords:
    # np.vstack([]) would raise a ValueError with a message unrelated to the cause.
    raise ValueError("None of the keywords has a usable spaCy vector.")

keyword_vectors = np.vstack([vector for _, vector in embedded_keywords])
keyword_texts = np.array([kw for kw, _ in embedded_keywords])

In [None]:
# Embed every company's search text, in row order. Entries are the document
# vector, or None where spaCy reports no vector or a zero-norm vector, so the
# list stays the same length as `texts`.
texts = companies_df['search_text'].tolist()
company_vectors = []

for doc in nlp.pipe(texts, batch_size=1000, n_process=1):
    is_usable = doc.has_vector and doc.vector_norm > 0
    company_vectors.append(doc.vector if is_usable else None)

In [None]:
# Write the keyword vectors and keyword strings into one .npz archive, stored
# under the keys 'vectors' and 'texts'.
keyword_embeddings_path = DATA_PATH + '/tech_keyword_filtering/keyword_spacey_embeddings.npz'
np.savez(
    keyword_embeddings_path,
    vectors=keyword_vectors,
    texts=keyword_texts,
)
