In [1]:
import os 
from dotenv import load_dotenv
import pandas as pd
from nltk.stem import PorterStemmer
from import_keywords import clean_text

In [2]:
load_dotenv()

DATA_PATH = os.getenv("DATA_DIR")
print(f"DATA_PATH: {DATA_PATH}")


DATA_PATH: /Users/danielbivol/Library/CloudStorage/OneDrive-SharedLibraries-TechnopolisGroupLtd/4050 Monitoring industrial ecosystems - TGBE internal - Crunchbase/Merged databases


In [3]:
keywords_df = pd.read_excel(DATA_PATH + "/keywords_combined_digital/Keywords_Combined.xlsx", sheet_name="Sheet1" )
keywords_df = keywords_df[keywords_df['yes/no'] == 'yes']
keywords_df = keywords_df.drop(columns=['yes/no', 'Subcluster', 'Cluster'])
keywords_df['Keyword'] = (
    keywords_df['Keyword']
    .astype(str)           
    .str.strip()          
    .str.lower()          
)



In [4]:
dtype_dict = {'org_ID': 'string', 'organisation_name': 'string', 'short_description': 'string', 'description': 'string'}
companies_df = pd.read_csv(DATA_PATH + "/cb_net0_companies_concat.csv", usecols=['org_ID', 'organisation_name', 'short_description', 'description'], dtype=dtype_dict, index_col=False)

In [5]:
companies_df.columns.to_list()
print(f"companies_df shape:{companies_df.shape}")
print(companies_df.memory_usage(deep=True))         
print(f"Total memory: {companies_df.memory_usage(deep=True).sum() / (1024 ** 2):.2f} MB")

companies_df shape:(4049929, 4)
Index                       132
org_ID                340457469
organisation_name     265320659
short_description     657755994
description          1109379608
dtype: int64
Total memory: 2262.99 MB


In [6]:
companies_df['search_text'] = (
    (companies_df['short_description'].fillna('') + ' ' + companies_df['description'].fillna(''))
    .str.lower()
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

companies_df.drop(['short_description', 'description'], axis=1, inplace=True)


In [7]:
stemmer = PorterStemmer()

def stem_text(text):
    return ' '.join([stemmer.stem(word) for word in text.lower().split()])

keywords_df['keyword_stemmed'] = keywords_df['Keyword'].apply(stem_text)
companies_df['description_stemmed'] = companies_df['search_text'].apply(stem_text)

In [8]:
companies_df['desc_stemmed_set'] = companies_df['description_stemmed'].str.split().apply(set)


companies_exploded = companies_df[['org_ID', 'organisation_name', 'description_stemmed', 'desc_stemmed_set']].explode('desc_stemmed_set')

companies_exploded = companies_exploded.rename(columns={'desc_stemmed_set': 'keyword_stemmed'})

matches = companies_exploded.merge(
    keywords_df[['Keyword', 'keyword_stemmed']],
    how='inner',
    on='keyword_stemmed'
)

In [9]:
print(f"Matches shape: {matches.shape}")
print(f"Matches memory: {matches.memory_usage(deep=True).sum() / (1024 ** 2):.2f} MB")

Matches shape: (188098, 5)
Matches memory: 137.72 MB


In [12]:

display(matches.sample(n=100, random_state=42))

Unnamed: 0,org_ID,organisation_name,description_stemmed,keyword_stemmed,Keyword
169819,162175.0,Sungrow Asia Pacific,sungrow manufactur energi solut for mobil sung...,photovolta,photovoltaic
12290,a7d038ee-b2bb-664b-9a92-323c572d861a,Divoom,divoom is a consum electron compani that manuf...,bluetooth,bluetooth
51554,442a5ce8-a6f2-4e34-b581-39330a6fb4fc,MARL 5G Accelerator,marl 5g is an acceler focus on deeptech produc...,robot,robotic
112090,f151cc83-5e14-4b6d-9661-48b9565781df,SAI Systems,sai system offer staf servic for the engin and...,firewal,firewall
97334,a436672e-1721-47cb-8878-12adabf39148,SolChicks,solchick is an entertain provid firm make nft ...,nft,nft
...,...,...,...,...,...
17932,f3efcaa0-8086-bed2-1978-486be28c1240,Tend.ai,tend ai is a smart cloud robot softwar platfor...,robot,robot
153524,a19bb91a-0e23-4c9f-bbf9-9816605f7a3c,Cryptobase,cryptobas develop secur softwar and oper bitco...,bitcoin,bitcoin
82258,9617ad20-443c-4412-bcca-3ec371137090,Mercurius Crypto,mercuriu crypto is a research hous that offer ...,crypto,crypto
76049,ecc7cbbc-3bbb-4130-b60f-90608576dde8,Cipholio Ventures,cipholio is a team of profession investor and ...,cryptocurr,cryptocurrencies


In [None]:
matches.to_csv(DATA_PATH + '/keyword_matches_test_cbnet0.csv', index = False)