In [None]:
import os
from functools import lru_cache

import pandas as pd
from dotenv import load_dotenv
from nltk.stem import PorterStemmer

from import_keywords import clean_text

In [None]:
# Pull settings from a .env file (if one is present) into the environment.
load_dotenv()

# Root directory that every input and output file below is resolved against.
DATA_PATH = os.getenv("DATA_DIR")
if not DATA_PATH:
    # Fail here with an actionable message: otherwise the first
    # `DATA_PATH + "/..."` further down raises an opaque TypeError (None + str).
    raise RuntimeError(
        "DATA_DIR is not set; define it in the environment or in a .env file."
    )
print(f"DATA_PATH: {DATA_PATH}")


In [None]:
# Load the keyword workbook and keep only the rows flagged 'yes'.
# NOTE(review): the flag comparison is an exact, case-sensitive match on 'yes';
# confirm the sheet never contains variants such as 'Yes' or ' yes'.
keywords_path = os.path.join(DATA_PATH, "keywords_combined_digital", "Keywords_Combined.xlsx")
keywords_raw = pd.read_excel(keywords_path, sheet_name="Sheet1")

keywords_df = (
    keywords_raw.loc[keywords_raw['yes/no'] == 'yes']
    .drop(columns=['yes/no', 'Subcluster', 'Cluster'])
    # Drop missing keywords *before* casting: astype(str) would otherwise turn
    # NaN into the literal token "nan", which could then match descriptions.
    .dropna(subset=['Keyword'])
    .assign(Keyword=lambda df: df['Keyword'].astype(str).str.strip().str.lower())
    # A keyword that is blank after stripping can never be a meaningful match.
    .loc[lambda df: df['Keyword'] != '']
)

In [None]:
# Only these four columns are read from the companies CSV, each one with the
# pandas 'string' dtype.
company_columns = ['org_ID', 'organisation_name', 'short_description', 'description']
dtype_dict = {column: 'string' for column in company_columns}

companies_csv = DATA_PATH + "/cb_net0_companies_concat.csv"
companies_df = pd.read_csv(
    companies_csv,
    usecols=company_columns,
    dtype=dtype_dict,
    index_col=False,
)

In [None]:
# Sanity check of the loaded companies table: columns, shape and memory use.
# The column list is printed explicitly; as a bare expression in the middle of
# a cell it was evaluated and silently discarded.
print(f"companies_df columns: {companies_df.columns.to_list()}")
print(f"companies_df shape:{companies_df.shape}")

# deep=True makes pandas measure the actual string payloads, not just pointers.
# Computed once and reused for both the per-column and the total report.
companies_memory = companies_df.memory_usage(deep=True)
print(companies_memory)
print(f"Total memory: {companies_memory.sum() / (1024 ** 2):.2f} MB")

In [None]:
# Build one normalised text field per company from both description columns.
short_text = companies_df['short_description'].fillna('')
long_text = companies_df['description'].fillna('')
combined_text = short_text + ' ' + long_text

normalised_text = combined_text.str.lower()
# Anything that is neither a word character nor whitespace becomes a space ...
normalised_text = normalised_text.str.replace(r'[^\w\s]', ' ', regex=True)
# ... then runs of whitespace collapse to a single space and the ends are trimmed.
normalised_text = normalised_text.str.replace(r'\s+', ' ', regex=True)
companies_df['search_text'] = normalised_text.str.strip()

# The raw description columns are no longer needed once search_text exists.
# Note: because they are removed in place, this cell cannot be re-run without
# reloading companies_df first.
companies_df.drop(columns=['short_description', 'description'], inplace=True)


In [None]:
stemmer = PorterStemmer()


@lru_cache(maxsize=None)
def _stem_word(word):
    """Return the Porter stem of one token, memoised per distinct word.

    The same words recur across many descriptions, so each distinct token is
    stemmed only once. The cache is unbounded; it grows with the number of
    distinct tokens seen.
    """
    return stemmer.stem(word)


def stem_text(text):
    """Lower-case ``text``, split it on whitespace and Porter-stem every token.

    Args:
        text: The string to stem. Any non-string value (None, NaN, pd.NA, ...)
            is treated as empty text instead of raising AttributeError.

    Returns:
        The stemmed tokens joined by single spaces; '' when there are none.
    """
    if not isinstance(text, str):
        return ''
    return ' '.join(_stem_word(word) for word in text.lower().split())


# Stem both sides with the same function so keywords and descriptions are comparable.
keywords_df['keyword_stemmed'] = keywords_df['Keyword'].apply(stem_text)
companies_df['description_stemmed'] = companies_df['search_text'].apply(stem_text)

In [None]:
# Distinct stemmed tokens of each company description.
companies_df['desc_stemmed_set'] = companies_df['description_stemmed'].str.split().apply(set)

# One row per distinct (Keyword, stem) pair, so that repeated keyword rows do
# not multiply identical match rows in the join below.
keyword_lookup = keywords_df[['Keyword', 'keyword_stemmed']].drop_duplicates()
keyword_stems = set(keyword_lookup['keyword_stemmed'])

# NOTE(review): matching is token-by-token, so a keyword whose stemmed form
# contains a space (a multi-word keyword) can never match here. Confirm that
# this is intended.

# Keep only the tokens that are keyword stems *before* exploding: the inner
# join would discard every other token anyway, and exploding all of them
# repeats each company's row (including its full stemmed description) once per
# distinct token. Sorting the kept tokens makes the row order reproducible;
# iteration order of a set of strings can differ between kernel sessions.
companies_exploded = (
    companies_df[['org_ID', 'organisation_name', 'description_stemmed']]
    .assign(
        keyword_stemmed=companies_df['desc_stemmed_set'].apply(
            lambda tokens: sorted(tokens & keyword_stems)
        )
    )
    .explode('keyword_stemmed')
    # Companies without any keyword token explode to a single NaN row.
    .dropna(subset=['keyword_stemmed'])
)

# One output row per (company, matching keyword).
matches = companies_exploded.merge(
    keyword_lookup,
    how='inner',
    on='keyword_stemmed'
)

In [None]:
# Report the size of the match table: its shape and its deep memory use in MB.
matches_shape = matches.shape
matches_bytes = matches.memory_usage(deep=True).sum()
print(f"Matches shape: {matches_shape}")
print(f"Matches memory: {matches_bytes / (1024 ** 2):.2f} MB")

In [None]:

# Eyeball a seeded random sample of the matches. The sample size is capped at
# the table length because `sample` raises ValueError when asked for more rows
# than exist (sampling without replacement).
display(matches.sample(n=min(100, len(matches)), random_state=42))

In [None]:
# Write the match table to a CSV file under DATA_PATH, without the index column.
matches_csv_path = DATA_PATH + '/tech_keyword_matches_spacey.csv'
matches.to_csv(matches_csv_path, index=False)