In [None]:
import os 
from dotenv import load_dotenv
import pandas as pd


import matplotlib.pyplot as plt




In [None]:
# Pull variables from a local .env file (if present) into the environment,
# then resolve the data directory that every later cell builds paths from.
load_dotenv()
DATA_PATH = os.getenv("DATA_DIR")
if not DATA_PATH:
    # Fail here with a clear message instead of a later
    # "unsupported operand type(s) for +: 'NoneType' and 'str'" in the load cells.
    raise RuntimeError(
        "DATA_DIR is not set; define it in the environment or in a .env file."
    )
print(f"DATA_PATH: {DATA_PATH}")


In [None]:
# Resolve both input locations first, then read them.
glassai_csv_path = DATA_PATH + "/glassAI_data.csv"
crunchbase_csv_path = DATA_PATH + "/processed/final_dataset.csv"

glassAI_df = pd.read_csv(glassai_csv_path)
crunchbase_df = pd.read_csv(crunchbase_csv_path)

# Quick size check: (rows, cols) of the Crunchbase frame, then of the glassAI frame.
print(crunchbase_df.shape, glassAI_df.shape)

In [None]:
# Only these four columns are read from the raw companies export, all as
# pandas nullable strings.
company_columns = ['org_ID', 'cb_url', 'short_description', 'description']

companies_df = pd.read_csv(  # type: ignore
    DATA_PATH + "/raw/cb_net0_companies_concat.csv",
    usecols=company_columns,
    dtype=dict.fromkeys(company_columns, 'string'),
    index_col=False,
)

print(companies_df.shape)

In [None]:
# Attach each organisation's Crunchbase URL (looked up by org_ID) so the
# frame can later be joined to glassAI on `cb_url`.
#
# Any `cb_url` column already present is dropped first. Without this, running
# the cell a second time makes merge() emit `cb_url_x` / `cb_url_y` instead of
# `cb_url`, and the later `on='cb_url'` merge fails.
# NOTE(review): if org_ID is not unique in companies_df this left join will
# duplicate crunchbase rows — compare the shape printed in the next cell.
crunchbase_df = (
    crunchbase_df
    .drop(columns='cb_url', errors='ignore')
    .merge(
        companies_df[['org_ID', 'cb_url']],
        on='org_ID',
        how='left',
    )
)

In [None]:
# Dimensions of crunchbase_df after the cb_url left merge; a row count higher
# than before the merge would indicate duplicated org_ID keys in companies_df.
print(crunchbase_df.shape)

In [None]:
def clean_str(s):
    """Normalise a Series of free-text names for comparison.

    Steps, in order: replace missing values with '', cast to str, lower-case,
    remove every character that is neither a word character nor whitespace,
    collapse whitespace runs to a single space, and trim both ends.

    Parameters
    ----------
    s : pandas.Series
        Values to normalise; non-string values are converted with ``str``.

    Returns
    -------
    pandas.Series
        Normalised strings, same index as ``s``; missing inputs become ''.
    """
    return (
        # Fill BEFORE casting: astype(str) would turn NaN/None into the
        # literal text 'nan', which a later fillna('') can no longer catch.
        s.fillna('')
         .astype(str)
         .str.lower()
         .str.replace(r'[^\w\s]', '', regex=True)
         .str.replace(r'\s+', ' ', regex=True)
         # Trim last: removing punctuation can expose new leading/trailing
         # spaces (e.g. "acme !" -> "acme ").
         .str.strip()
    )

# glassAI: align the name column with Crunchbase's, normalise it, then keep
# the first row for each id_organization.
glassAI_df = (
    glassAI_df
    .rename(columns={'name': 'organisation_name'})
    .assign(organisation_name=lambda frame: clean_str(frame['organisation_name']))
    .drop_duplicates(subset='id_organization')
)

# Crunchbase: same normalisation on its organisation_name column.
crunchbase_df = crunchbase_df.assign(
    organisation_name=clean_str(crunchbase_df['organisation_name'])
)





In [None]:
# Harmonise column names with the Crunchbase frame and keep only glassAI rows
# that carry a Crunchbase URL, since that URL is the join key used below.
glassAI_df = (
    glassAI_df
    .rename(columns={'name': 'organisation_name', 'crunchbase_url': 'cb_url'})
    .dropna(subset=['cb_url'])
)
print(glassAI_df.shape)

In [None]:

# Overlap of the two sources: rows whose Crunchbase URL appears in both.
merged_df = pd.merge(crunchbase_df, glassAI_df, on='cb_url', how='inner')
print(merged_df.shape)

# Distinct org_ID values in the overlap (a missing ID counts as one value).
print(merged_df['org_ID'].nunique(dropna=False))

In [None]:
# Re-check the dimensions of the cb_url inner join.
print(merged_df.shape)

In [None]:
# glassAI rows whose Crunchbase URL did not make it into the inner join.
matched_urls = merged_df['cb_url']
not_in_overlap = ~glassAI_df['cb_url'].isin(matched_urls)
unmatched_glassAI = glassAI_df.loc[not_in_overlap]
print(f"Unmatched glassAI entries: {unmatched_glassAI.shape[0]}")

# Enrich those rows with whatever the raw companies table holds for the same URL.
unmatched_glassAI = pd.merge(unmatched_glassAI, companies_df, on='cb_url', how='left')
print(unmatched_glassAI.shape)


In [None]:
# Write the current merged frame to Excel without the index column.
# NOTE(review): the file name says "outer", but merged_df is built with
# how='inner' above — confirm the intended name/contents.
save_path = DATA_PATH + "/processed/glassAI_crunchbase_outer.xlsx"
merged_df.to_excel(excel_writer=save_path, index=False)

In [None]:
# glassAI rows that can be found in the raw companies table by Crunchbase URL.
glassAI_companies_merge = pd.merge(
    glassAI_df,
    companies_df,
    on='cb_url',
    how='inner',
)
print(glassAI_companies_merge.shape)

In [None]:
# Bar chart: number of unmatched glassAI companies per digital sector.
sector_counts = unmatched_glassAI['digital_sector'].value_counts()

fig, ax = plt.subplots(figsize=(10, 11))
sector_counts.plot.bar(ax=ax)
ax.set_xlabel('Digital Sector')
ax.set_ylabel('Number of Companies')
ax.set_title('Distribution of Sectors in unmatched_glassAI')
fig.tight_layout()
plt.show()

In [None]:
# Bar chart: number of overlapping companies per `digital_sector` value.
# Run before the rename cell below, which repurposes the `digital_sector` name.
sector_counts_merged = merged_df['digital_sector'].value_counts()

fig, ax = plt.subplots(figsize=(10, 11))
sector_counts_merged.plot.bar(ax=ax)
ax.set_xlabel('Digital Sector')
ax.set_ylabel('Number of Companies')
ax.set_title('Distribution of Sectors in merged_df')
fig.tight_layout()
plt.show()

In [None]:
# Bar chart: number of Crunchbase companies per `Sector` value.
sector_counts_cb = crunchbase_df['Sector'].value_counts()

fig, ax = plt.subplots(figsize=(10, 11))
sector_counts_cb.plot.bar(ax=ax)
ax.set_xlabel('Sector')
ax.set_ylabel('Number of Companies')
ax.set_title('Distribution of Sectors in crunchbase_df')
fig.tight_layout()
plt.show()

In [None]:
# List the merged frame's columns, e.g. to see which overlapping names
# received the _x / _y merge suffixes handled in the next cell.
merged_df.columns

In [None]:
# Tidy the overlap frame:
#   * drop the glassAI copy of the name (`organisation_name_y`) and `keywords_lemma`;
#   * restore the un-suffixed name column;
#   * `Sector` becomes `digital_sector`, while the column previously called
#     `digital_sector` is kept as `digital_sector_glassAI`.
# Run once per fresh merged_df: the dropped columns no longer exist on a second
# run, so drop() raises KeyError.
merged_df = (
    merged_df
    .drop(columns=['organisation_name_y', 'keywords_lemma'])
    .rename(
        columns={
            'organisation_name_x': 'organisation_name',
            'Sector': 'digital_sector',
            'digital_sector': 'digital_sector_glassAI',
        }
    )
)


In [None]:
# Dimensions of merged_df after the column drop/rename.
merged_df.shape

In [None]:
# Export the tidied overlap frame to Excel, leaving the index out of the sheet.
save_path = DATA_PATH + "/processed/glassAI_crunchbase_overlap.xlsx"
merged_df.to_excel(excel_writer=save_path, index=False)
