In [1]:
import os 
from dotenv import load_dotenv
import pandas as pd

In [2]:
def _require_env(name):
    """Return the value of environment variable ``name``, failing fast if unset.

    ``os.getenv`` returns ``None`` for a missing variable, which would only
    surface later as an opaque ``TypeError`` when the path is concatenated
    with a file name. Checking here gives an actionable error instead.

    Raises:
        RuntimeError: if the variable is missing or empty.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; "
            "define it in the environment or in the .env file."
        )
    return value


# Pull variables from a .env file into the process environment before reading them.
load_dotenv()

# Base directory for the Excel inputs/outputs, taken from DATA_DIR.
DATA_PATH = _require_env("DATA_DIR")
print(f"DATA_PATH: {DATA_PATH}")

# Base directory for the funding_rounds / organizations CSVs, taken from SHARE_POINT_DIR.
FUNDING_PATH = _require_env("SHARE_POINT_DIR")
print(f"FUNDING_PATH: {FUNDING_PATH}")


DATA_PATH: /Users/danielbivol/Library/CloudStorage/OneDrive-TechnopolisGroupLtd/Documents/data/CB_data
FUNDING_PATH: /Users/danielbivol/Library/CloudStorage/OneDrive-SharedLibraries-TechnopolisGroupLtd/DSU one-stop shop - Crunchbase superusers - Feb2025


In [None]:
# Peek at the columns available in funding_rounds.csv.
# nrows=0 parses only the header row, so the full file is not loaded just to
# list its column names. The result is kept in its own variable so this
# exploratory cell never replaces the `funding_df` used by the analysis.
funding_header = pd.read_csv(FUNDING_PATH + "/funding_rounds.csv", nrows=0)
print(funding_header.columns.tolist())

In [3]:
# --- Company table (Excel) -------------------------------------------------
data_path = DATA_PATH + "/PSE_Crunchbase.xlsx"

# Only these columns are read from the workbook.
company_cols = [
    'Organization Name',
    'Organization Name URL',
    'Founded Date',
    'Founded Date Precision',
    'Industries',
    'Headquarters Location',
    'Description',
    'Total Patents',
    'Full Description',
    'Number of Employees',
]
# header=1: the column names are taken from the second row of the sheet.
company_df = pd.read_excel(data_path, usecols=company_cols, header=1)

# --- Funding rounds (CSV) --------------------------------------------------
funding_cols = [
    'org_uuid',
    'org_name',
    'lead_investor_uuids',
    'uuid',
    'investment_type',
    'announced_on',
    'raised_amount_usd',
    'investor_count',
    'raised_amount',
    'raised_amount_currency_code',
    'post_money_valuation_usd',
    'post_money_valuation',
    'post_money_valuation_currency_code',
]
funding_df = pd.read_csv(
    FUNDING_PATH + "/funding_rounds.csv",
    usecols=funding_cols,
)

# --- Organisations (CSV) ---------------------------------------------------
# Just the uuid -> cb_url mapping is needed from this file.
org_cols = ['uuid', 'cb_url']
org_df = pd.read_csv(
    FUNDING_PATH + "/organizations.csv",
    usecols=org_cols,
)


In [6]:
# Single source of truth for the file name, so the log line below cannot drift
# from the file that is actually opened.
URLS_FILE = "cb_urls.txt"

# One URL per line; blank lines are skipped and entries are stripped/lowercased.
with open(URLS_FILE, 'r', encoding="utf-8") as file:
    urls_list = [line.strip().lower() for line in file if line.strip()]

print(f"Loaded {len(urls_list)} URLs from {URLS_FILE}")

# Normalise cb_url the same way as the list entries before comparing; otherwise
# a difference in case or surrounding whitespace silently drops a match.
# Missing cb_url values stay NaN through .str and never match.
org_url_normalized = org_df['cb_url'].str.strip().str.lower()
matching_org_uuids = org_df.loc[org_url_normalized.isin(urls_list), 'uuid']

# Keep only the funding rounds that belong to the listed organisations.
filtered_funding_df = funding_df[funding_df['org_uuid'].isin(matching_org_uuids)]
print(f"Filtered funding_df shape: {filtered_funding_df.shape}")

Loaded 210 URLs from urls.txt
Filtered funding_df shape: (17, 13)


In [7]:
# Attach funding rounds to organisations. The left join keeps every org_df row,
# including organisations that have no funding round. Both inputs carry a
# 'uuid' column, so pandas suffixes them: uuid_x (organisation) and
# uuid_y (funding round).
# NOTE(review): this rebinds `funding_df`, so the cell is not safe to run twice
# without re-running the load cell first.
funding_df = pd.merge(
    org_df,
    funding_df,
    how='left',
    left_on='uuid',
    right_on='org_uuid',
)

In [8]:
# Show the column names of both frames, one list per line, ahead of the join.
print(
    company_df.columns.to_list(),
    funding_df.columns.to_list(),
    sep="\n",
)


['Organization Name', 'Organization Name URL', 'Founded Date', 'Founded Date Precision', 'Industries', 'Headquarters Location', 'Description', 'Total Patents', 'Full Description', 'Number of Employees']
['uuid_x', 'cb_url', 'uuid_y', 'investment_type', 'announced_on', 'raised_amount_usd', 'raised_amount', 'raised_amount_currency_code', 'post_money_valuation_usd', 'post_money_valuation', 'post_money_valuation_currency_code', 'investor_count', 'org_uuid', 'org_name', 'lead_investor_uuids']


In [9]:
def _normalize_url_column(series):
    """Return ``series`` stripped and lowercased, with missing values kept missing.

    A plain ``.astype(str)`` turns NaN into the literal string ``"nan"``. That
    defeats a later ``dropna`` on the column and lets rows with missing URLs
    join to each other on ``"nan"``. The trailing ``.where`` restores NaN
    wherever the input was missing.

    Args:
        series: pandas Series of URL-like values (may contain NaN).

    Returns:
        Series of the same length and index; non-missing entries are
        normalised strings, missing entries are NaN.
    """
    normalized = series.astype(str).str.strip().str.lower()
    return normalized.where(series.notna())


# Put both join keys in the same canonical form (trimmed, lowercase).
funding_df['cb_url'] = _normalize_url_column(funding_df['cb_url'])

company_df['Organization Name URL'] = _normalize_url_column(
    company_df['Organization Name URL']
)


In [10]:
# Discard rows with no cb_url, then rename the column to the key used by
# company_df so the two frames can be merged on a shared column name.
# Note: dropna removes only genuinely missing values (NaN/None); a placeholder
# string such as "nan" is an ordinary value and is kept.
funding_df = (
    funding_df
    .dropna(subset=['cb_url'])
    .rename(columns={'cb_url': 'Organization Name URL'})
)


In [11]:
# Enrich each company row with its funding rounds. The left join keeps every
# company; a company with several matching funding rows appears once per match,
# so the result can have more rows than company_df.
df_merged = pd.merge(
    company_df,
    funding_df,
    how='left',
    on='Organization Name URL',
)

print(df_merged.shape)

(216, 24)


In [13]:
# Row counts before and after the join, to see how many rows the merge added.
merged_row_count = len(df_merged)
print(f"Merged count: {merged_row_count}")
print("company_df rows:", len(company_df))
print("Merged rows:", merged_row_count)

# Remove columns that are not wanted in the exported sheet.
df_merged = df_merged.drop(
    columns=['uuid_x', 'org_name', 'lead_investor_uuids'],
)

Merged count: 216
company_df rows: 208
Merged rows: 216


In [14]:
# Write the enriched table next to the input workbook; index=False keeps the
# DataFrame index out of the sheet.
save_path = DATA_PATH + "/PSE_Crunchbase_enriched.xlsx"
df_merged.to_excel(
    save_path,
    index=False,
)