In [9]:
import sys
sys.path.append("../../digitech_classify")

import pandas as pd
from digitech_classify.pipeline.config import PROCESSED_DATA_DIR, DATA_DIR, RAW_DATA_DIR




In [8]:
# Read the three inputs used throughout the notebook:
#   * the tagged-company CSV (company_tagged_all-MiniLM-L6-v2.csv),
#   * the keyword workbook (Keywords_Combined.xlsx, sheet "Sheet1"),
#   * the glassAI / Crunchbase overlap workbook.
df = pd.read_csv(
    PROCESSED_DATA_DIR / 'company_tagged_all-MiniLM-L6-v2.csv'
)
keyword_sector_mapping = pd.read_excel(
    DATA_DIR / 'keywords_combined_digital/Keywords_Combined.xlsx',
    sheet_name='Sheet1',
)
glass_AI = pd.read_excel(
    PROCESSED_DATA_DIR / 'glassAI_crunchbase_overlap.xlsx'
)


In [None]:
# Show the column names of the glassAI overlap table as a plain list.
glass_AI.columns.tolist()

In [None]:

# Keep only rows whose `similarity` is at least 0.6, then left-join the
# keyword workbook so each surviving row gains the columns of the keyword it
# matched (`keyword` on the left, `Keyword_lemmatized` on the right).
# Rows whose keyword has no counterpart keep NaN in the joined columns.
df_filtered_companies = pd.merge(
    df.loc[df['similarity'].ge(0.6)],
    keyword_sector_mapping,
    how='left',
    left_on='keyword',
    right_on='Keyword_lemmatized',
)



In [None]:
# Collapse to one row per (Sector, organisation_name) pair.
# NOTE(review): GroupBy.first() takes the first non-null value *per column*,
# so a resulting row may combine values from several source rows; groupby also
# drops rows whose `Sector` is NaN by default — confirm both are intended.
df_unique_companies_by_sector = (
    df_filtered_companies
    .groupby(['Sector', 'organisation_name'], as_index=False)
    .first()
)

# Number of distinct companies per sector, as a two-column frame (Sector, count).
df_filtered_companies_sector_group = (
    df_unique_companies_by_sector
    .groupby('Sector')
    .size()
    .rename('count')
    .reset_index()
)

print(df_unique_companies_by_sector.shape)

In [None]:
# The keyword columns were only needed for the join; remove them from the
# per-sector company table.
df_unique_companies_by_sector = df_unique_companies_by_sector.drop(
    ['keyword', 'Keyword_lemmatized'],
    axis='columns',
)

In [None]:
# Normalise the sector labels (lower-case, surrounding whitespace removed)
# BEFORE collecting the distinct values. `sector_list` is later compared
# against this same `Sector` column, so it must hold the normalised labels;
# capturing it from the raw labels would make every lookup for a label that
# contained uppercase characters or padding return an empty frame.
df_unique_companies_by_sector['Sector'] = (
    df_unique_companies_by_sector['Sector'].str.lower().str.strip()
)
sector_list = df_unique_companies_by_sector['Sector'].unique().tolist()



In [None]:
# One sub-frame per entry of `sector_list`, keyed by the sector label; each
# holds the rows whose `Sector` equals that label exactly.
sector_dfs = {
    label: df_unique_companies_by_sector.loc[
        df_unique_companies_by_sector['Sector'].eq(label)
    ]
    for label in sector_list
}

In [None]:
# List the sector labels that have a sub-frame (the frames themselves are not shown).
for sector in sector_dfs.keys():
    print(f"Sector: {sector}")

In [None]:
# Persist the per-sector company table to an Excel workbook under DATA_DIR,
# without writing the positional index.
df_unique_companies_by_sector.to_excel(
    DATA_DIR / 'sector_companies_enriched_agg.xlsx',
    index=False,
)


In [None]:
# Companies from the keyword-tagged table whose `org_ID` does not appear in
# the glassAI overlap table.
missing_companies = df_unique_companies_by_sector.loc[
    ~df_unique_companies_by_sector['org_ID'].isin(glass_AI['org_ID'])
]

# Append those companies below the glassAI rows; columns present in only one
# of the two frames are filled with NaN by concat, and the index is rebuilt.
glass_AI_enriched = pd.concat(
    [glass_AI, missing_companies],
    axis=0,
    ignore_index=True,
)

In [None]:
# (rows, columns) of the companies that were absent from the glassAI table.
missing_companies.shape 

In [None]:
# Build a single lower-case `sector` column: take `Sector` where it is set and
# fall back to `digital_sector` where `Sector` is null.
glass_AI_enriched['sector'] = glass_AI_enriched['Sector'].combine_first(
    glass_AI_enriched['digital_sector']
)

# Remove the two source columns that `sector` replaces, plus id/keyword
# columns that are not carried into the enriched table.
glass_AI_enriched = glass_AI_enriched.drop(
    columns=[
        'Sector',
        'digital_sector',
        'id_organization',
        'keywords_mentioned',
        'crawling_keywords_mentioned',
    ]
)

In [None]:
# Lower-case the unified sector labels and trim surrounding whitespace, then
# print the distinct values for inspection.
glass_AI_enriched['sector'] = (
    glass_AI_enriched['sector']
    .str.lower()
    .str.strip()
)
print(glass_AI_enriched['sector'].unique())



In [None]:
# Distinct sector labels before the glassAI fallback is applied.
print(glass_AI_enriched['sector'].unique())

# Translation from the labels found in `digital_sector_glassAI` to the
# lower-case labels used in `sector`.
sector_mapping = {
    'Cloud to Edge to IoT': 'cloud-edge-iot',
    'Data Analytics Technologies': 'data analytics',
    'Artificial Intelligence': 'artificial intelligence',
    'Blockchain Technologies': 'blockchain',
    'Photonics Technologies': 'photonics',
    'Quantum Technologies': 'quantum technologies',
    'Robotics Technologies': 'robotics',
    'Advanced Digital Communications and Connectivity': 'advanced digital communications and connectivity',
    'High Performance Computing': 'high performance computing',
    'Next Generation Internet and Extended Reality': 'next generation internet and extended reality',
    'Microelectronics, High Frequency Chips and Semiconductors': 'microelectronics, high frequency chips and semiconductors',
}

# Where `sector` is still null, use the translated glassAI label. Labels that
# are not keys of `sector_mapping` translate to NaN and leave the gap open.
glass_AI_enriched['sector'] = glass_AI_enriched['sector'].combine_first(
    glass_AI_enriched['digital_sector_glassAI'].map(sector_mapping)
)

# Distinct values after the fallback, next to the raw glassAI labels.
print(glass_AI_enriched['sector'].unique())
print(glass_AI_enriched['digital_sector_glassAI'].unique())

In [None]:
# Any `sector` still null takes the raw `digital_sector_glassAI` label, and
# the whole column is then lower-cased and trimmed again so the newly filled
# values follow the same convention.
glass_AI_enriched['sector'] = glass_AI_enriched['sector'].fillna(
    glass_AI_enriched['digital_sector_glassAI']
)
glass_AI_enriched['sector'] = (
    glass_AI_enriched['sector']
    .str.lower()
    .str.strip()
)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of how many rows carry each sector label. The Axes returned by
# pandas is labelled directly rather than through the pyplot state machine.
ax = glass_AI_enriched['sector'].value_counts().plot(kind='bar', figsize=(12,6))
ax.set_xlabel('Sector')
ax.set_ylabel('Count')
ax.set_title('Distribution of Companies by Sector')
ax.figure.tight_layout()
plt.show()

In [None]:
# Print the distinct sector labels currently present in the enriched table.
print(glass_AI_enriched['sector'].unique())


In [None]:
# Collapse three long-form sector labels onto their short equivalents; every
# other label is left as it is.
glass_AI_enriched['sector'] = glass_AI_enriched['sector'].replace(
    to_replace={
        'advanced and high performance computing': 'high performance computing',
        'quantum technologies': 'quantum',
        'blockchain, distributed ledger and digital identity technologies': 'blockchain',
    }
)

In [None]:
# Row count per `Sector` in the per-sector company table, as a Series indexed
# by sector label.
df_filtered_companies_sector_group = (
    df_unique_companies_by_sector
    .groupby(by='Sector')
    .size()
)

In [None]:

# Keep the first row of every (org_ID, sector) pair and renumber the index
# from zero, then report the resulting shape.
glass_AI_enriched_dedup = (
    glass_AI_enriched
    .drop_duplicates(subset=['org_ID', 'sector'], keep='first')
    .reset_index(drop=True)
)
print(glass_AI_enriched_dedup.shape)

In [None]:
# Write the de-duplicated enriched table to CSV under PROCESSED_DATA_DIR,
# omitting the positional index.
save_path = PROCESSED_DATA_DIR / 'glass_ai_enriched.csv'
glass_AI_enriched_dedup.to_csv(
    save_path,
    index=False,
)