In [11]:
import os 
from dotenv import load_dotenv
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
from import_keywords import clean_text

In [12]:
# Pull in variables from a local .env file (if any), then resolve the data directory.
load_dotenv()

DATA_PATH = os.getenv("DATA_DIR")
# Fail fast: the cells below build file paths from DATA_PATH, so an unset
# variable would otherwise only surface later as a TypeError on `None + str`.
if not DATA_PATH:
    raise RuntimeError(
        "Environment variable DATA_DIR is not set; define it in the environment or in a .env file."
    )
print(f"DATA_PATH: {DATA_PATH}")


DATA_PATH: /Users/danielbivol/Library/CloudStorage/OneDrive-SharedLibraries-TechnopolisGroupLtd/4050 Monitoring industrial ecosystems - TGBE internal - Crunchbase/Merged databases


In [None]:
# Load the curated keyword list and keep only the rows flagged 'yes'.
keywords_path = os.path.join(DATA_PATH, "keywords_combined_digital", "Keywords_Combined.xlsx")
keywords_df = (
    pd.read_excel(keywords_path, sheet_name="Sheet1")
    .loc[lambda df: df['yes/no'] == 'yes']
    .drop(columns=['yes/no', 'Subcluster', 'Cluster'])
    # Drop missing keywords BEFORE casting: astype(str) would otherwise turn
    # NaN into the literal keyword 'nan', which would then be matched in text.
    .dropna(subset=['Keyword'])
    .assign(Keyword=lambda df: df['Keyword'].astype(str).str.strip().str.lower())
    # A cell holding only whitespace is empty after strip() and cannot be a pattern.
    .loc[lambda df: df['Keyword'] != '']
)


# Read only the columns this notebook uses; every column except data_source
# is loaded with the pandas 'string' dtype.
company_columns = ['org_ID', 'organisation_name', 'short_description', 'description', 'data_source']
string_columns = company_columns[:-1]

companies_df = pd.read_csv(
    DATA_PATH + "/cb_net0_companies_concat.csv",
    usecols=company_columns,
    dtype={column: 'string' for column in string_columns},
    index_col=False,
)

# Keep every record whose data_source is anything other than 'net0'.
not_net0 = companies_df['data_source'] != 'net0'
companies_df = companies_df[not_net0]

print(companies_df.shape)

(4049929, 5)


In [16]:
# Build one normalised text field per company from both description columns:
# missing values become '', the two texts are joined with a space, lower-cased,
# every character that is neither a word character nor whitespace is replaced
# by a space, whitespace runs are collapsed, and the ends are trimmed.
combined_text = (
    companies_df['short_description'].fillna('')
    + ' '
    + companies_df['description'].fillna('')
)
combined_text = combined_text.str.lower()
combined_text = combined_text.str.replace(r'[^\w\s]', ' ', regex=True)
combined_text = combined_text.str.replace(r'\s+', ' ', regex=True)
companies_df['search_text'] = combined_text.str.strip()
del combined_text  # do not keep the intermediate series alive in the kernel

# The raw description columns are not used once search_text exists.
companies_df.drop(columns=['short_description', 'description'], inplace=True)

In [17]:
# Only tokenisation is needed: the PhraseMatcher below compares the LOWER token
# attribute, so no trained or rule-based component has to run. Disabling just
# 'ner', 'parser' and 'tagger' would leave any other component of the loaded
# pipeline (in spaCy v3 models: tok2vec, attribute_ruler, lemmatizer) running
# on every nlp(text) call.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner'],
)
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')

In [18]:
# Tokenise each distinct keyword once and register all of them under a single
# match label.
# NOTE(review): search_text has punctuation replaced by spaces while keywords
# are only stripped and lower-cased, so a keyword containing punctuation may
# never match -- confirm against the keyword list.
unique_keywords = keywords_df['Keyword'].unique()
patterns = list(map(nlp.make_doc, unique_keywords))
matcher.add("KEYWORDS", patterns)

In [19]:
def extract_matches(text):
    """Return the distinct keyword phrases found in ``text``.

    Parameters
    ----------
    text : str
        Text to scan. Any value that is not a non-empty string (``None``,
        a missing value, ``''``) yields an empty list instead of raising.

    Returns
    -------
    list of str
        Lower-cased text of every span reported by the module-level
        ``matcher``, de-duplicated and sorted so that repeated runs produce
        the same order.
    """
    if not isinstance(text, str) or not text:
        return []
    # make_doc only tokenises; the matcher is configured on the LOWER token
    # attribute, so running the rest of the pipeline per row adds cost without
    # affecting which spans match.
    doc = nlp.make_doc(text)
    found = {doc[start:end].text.lower() for _, start, end in matcher(doc)}
    # Sorting removes the arbitrary iteration order of a set of strings.
    return sorted(found)

# Run the keyword matcher over every company's normalised text; each row
# receives the list returned by extract_matches.
search_texts = companies_df['search_text']
companies_df['matched_keywords'] = search_texts.apply(extract_matches)



In [21]:
# keywords_df is not de-duplicated when it is loaded, so a Keyword value that
# appears on several rows would multiply every matching company row in the
# merge below. Join against the distinct keywords instead.
keyword_lookup = keywords_df[['Keyword']].drop_duplicates()

# One row per (company, matched keyword): explode the per-company lists, drop
# the companies whose list was empty (explode leaves a missing value for
# them), then attach the keyword column.
matches_exploded = (
    companies_df[['org_ID', 'organisation_name', 'search_text', 'matched_keywords']]
    .explode('matched_keywords')
    .dropna(subset=['matched_keywords'])
    .merge(keyword_lookup, left_on='matched_keywords', right_on='Keyword', how='left')
)


In [None]:
# Report how many (company, keyword) rows and columns were produced.
matches_shape = matches_exploded.shape
print(f"Matches shape: {matches_shape}")

Matches shape: (371005, 5)


In [23]:
# Write the match table next to the input data, without the index column.
matches_csv_path = DATA_PATH + '/tech_keyword_matches_spacey.csv'
matches_exploded.to_csv(matches_csv_path, index=False)