In [1]:
import pandas as pd
import json
import concurrent.futures
from processing import process_url

## Load Public Company Data

In [2]:
def _parse_embedding(text):
    """Turn a bracketed, whitespace-separated string of numbers into a list of floats."""
    stripped = text.replace("[", "").replace("]", "")
    return [float(token) for token in stripped.split()]


# Public-company feature table; the 'Unnamed: 0' column is discarded right after loading.
df = pd.read_csv('public_company_features_final.csv').drop(columns=['Unnamed: 0'])

# 'value_proposition_embedding' is read from the CSV as text, so parse each cell into a list of floats.
df["value_proposition_embedding"] = df["value_proposition_embedding"].apply(_parse_embedding)

# Company type/status lookup, with 'company_id' renamed to 'id' so it shares the join key with df.
filtered_df = pd.read_csv('snp_public_companies_final.csv').rename(columns={'company_id': 'id'})

# Inner join: only ids present in both tables are kept.
df = df.merge(
    filtered_df[['id', 'companytype', 'companystatustype']],
    on=['id'],
    how='inner',
)
df

Unnamed: 0,id,name,ticker,exchange,hq,value_proposition,industry,vertical,target_audience,market,value_proposition_embedding,companytype,companystatustype
0,573678193,"09WOMEN Co., Ltd.",A366030,KOSDAQ,South Korea,"09WOMEN Co., Ltd. helps women feel confident a...",Consumer Goods,Ecommerce,consumers,B2C,"[0.156416118, -0.00892397761, 0.749229372, -0....",Public Company,Operating
1,25617678,1000mercis,ALMIL,Euronext Paris,France,Numberly helps businesses transform their digi...,Advertisements,MarTech,marketing teams,B2B,"[0.114381269, 0.332636148, 0.602183104, 0.2235...",Public Company,Operating
2,20297001,104 Corporation,3130,Taiwan Stock Exchange,Taiwan,104 Corporation helps job seekers and employer...,Advertisements,OTHER,hr teams,B2B,"[-0.0122938948, 0.414331198, 0.464051723, 0.16...",Public Company,Operating
3,706460426,10X Capital Venture Acquisition Corp. II,AAGR,Nasdaq Global Market,United States,10X Capital Venture Acquisition Corp. II helps...,Finance,FinTech,c-levels,B2B,"[-0.384676039, 0.181011468, 0.64754796, 0.1493...",Public Company,Acquired
4,706460601,10X Capital Venture Acquisition Corp. III,VCXB,NYSE MKT LLC,United States,10X Capital Venture Acquisition Corp. III help...,Finance,FinTech,investors,B2B,"[-0.41517812, 0.11140652, 0.632222295, 0.16715...",Public Company,Operating Subsidiary
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15716,865192,Zydus Lifesciences Limited,ZYDUSLIFE,National Stock Exchange of India,India,Zydus Lifesciences Limited helps patients and ...,Health Care,HealthTech,consumers,B2C,"[-0.0745946318, 0.219399974, 0.342252433, 0.25...",Public Company,Operating
15717,20334150,Zydus Wellness Limited,531335,Mumbai Stock Exchange,India,Zydus Wellness Limited helps consumers achieve...,Consumer Goods,OTHER,consumers,B2C,"[0.0383488126, 0.194696829, 0.544707298, 0.275...",Public Company,Operating Subsidiary
15718,43279463,Zymeworks Inc.,ZYME,Nasdaq Global Select,Canada,Zymeworks Inc. helps patients with difficult t...,Health Care,HealthTech,consumers,B2C,"[-0.0635716319, 0.282907724, 0.103864908, 0.07...",Public Company,Operating
15719,3103657,"Zynex, Inc.",ZYXI,Nasdaq Global Select,United States,"Zynex, Inc. helps patients suffering from pain...",Health Care,HealthTech,consumers,B2C,"[-0.401839107, 0.335190654, 0.282194048, -0.06...",Public Company,Operating


In [3]:
import json
# Read the region mapping from disk; the top-level keys are region names (e.g. 'USA'),
# each holding a dict keyed by stock-exchange name.
with open('region_mapping.json', 'r') as mapping_file:
    region_mapping = json.loads(mapping_file.read())

# Sanity check: show the per-exchange values stored for the 'USA' region.
print(region_mapping['USA'])

{'NYSE MKT LLC': 1.0, 'Nasdaq Global Market': 1.0, 'Nasdaq Global Select': 1.0, 'Nasdaq Capital Market': 1.0, 'The Toronto Stock Exchange': 0.75, 'London Stock Exchange': 0.75, 'London Stock Exchange AIM Market': 0.5, 'SIX Swiss Exchange': 0.5, 'XETRA Trading Platform': 0.5, 'National Stock Exchange of India': 0, 'Mumbai Stock Exchange': 0, 'The Tokyo Stock Exchange': 0, 'Euronext Paris': 0.5, 'Euronext Brussels': 0.5, 'Euronext Amsterdam': 0.5, 'Australian Securities Exchange': 0.5, 'Bolsa de Madrid': 0.5, 'Bolsa de Valores de Sao Paulo': 0.5, 'The Stock Exchange of Hong Kong Ltd.': 0, 'Singapore Exchange': 0, 'OMX Nordic Exchange Copenhagen': 0.5, 'OMX Nordic Exchange Helsinki': 0.5, 'OMX Nordic Exchange Stockholm': 0.5, 'Warsaw Stock Exchange': 0.5, 'Indonesia Stock Exchange': 0, 'The Tel-Aviv Stock Exchange': 0}


## Prepare Private Company Data

In [4]:
def process_row(row):
    """Forward one private-company record to ``process_url``.

    Args:
        row: Mapping with the keys 'URL', 'Company' and 'Country'.

    Returns:
        Whatever ``process_url`` returns for this record.

    Raises:
        KeyError: If any of the three keys is missing from ``row``.
    """
    # Private companies have no identifier, description, ticker or exchange
    # in this dataset, so the same placeholder is passed for all four.
    placeholder = 'Not Applicable'
    return process_url(
        row['URL'],       # url
        placeholder,      # id
        row["Company"],   # name
        placeholder,      # description
        placeholder,      # ticker
        placeholder,      # exchange
        row['Country'],   # hq
    )


In [5]:
pdf = pd.read_csv('private_companies.csv')

# Collects the per-company outcomes returned by process_row.
results = []

# Run process_row over every CSV record on a thread pool. executor.map hands
# back the return values themselves (not Future objects), in input order.
with concurrent.futures.ThreadPoolExecutor() as executor:
    scraped_rows = executor.map(process_row, pdf.to_dict(orient='records'))

    # Drop falsy outcomes (e.g. None) and keep the rest.
    results.extend(scraped for scraped in scraped_rows if scraped)


2024-04-16 23:50:07,476 - INFO - Scraping URL: https://www.voyantis.ai/
2024-04-16 23:50:07,477 - INFO - Scraping URL: https://www.rapyd.net/
2024-04-16 23:50:07,478 - INFO - Scraping URL: https://utopiamusic.com/
2024-04-16 23:50:07,479 - INFO - Scraping URL: https://www.pragmaticplay.com/en/
2024-04-16 23:50:07,481 - INFO - Scraping URL: https://khealth.com/
2024-04-16 23:50:07,483 - INFO - Scraping URL: https://fire-arc.com/
2024-04-16 23:50:07,484 - INFO - Scraping URL: https://www.511tactical.com/
2024-04-16 23:50:07,485 - INFO - Scraping URL: https://www.ahava.com/
2024-04-16 23:50:07,487 - INFO - Scraping URL: https://www.amagi.com/
2024-04-16 23:50:07,488 - INFO - Scraping URL: https://www.miya-water.com/en/
2024-04-16 23:50:07,489 - INFO - Scraping URL: https://www.monicavinader.com/
2024-04-16 23:50:07,491 - INFO - Scraping URL: https://www.sumup.com/en-gb/
2024-04-16 23:50:07,492 - INFO - Scraping URL: https://hireup.com.au/
2024-04-16 23:50:07,493 - INFO - Scraping URL: htt

In [6]:
from tqdm import tqdm
import os
from generate_features import get_company_features_guidance
from models import init_model

# On-disk cache of the feature-enriched results.
FEATURES_CACHE_PATH = 'private_companies_feature_engineering.json'
# Keys written for every processed company; an entry missing any of them
# is treated as not yet processed.
FEATURE_KEYS = ('value_proposition', 'industry', 'vertical', 'target_audience', 'market')
# Number of processed companies between intermediate checkpoint writes.
CHECKPOINT_EVERY = 5


def _save_results(records, path=FEATURES_CACHE_PATH):
    """Persist ``records`` to ``path`` as UTF-8 JSON.

    The data is written to a sibling temporary file first and then moved over
    ``path``, so an interrupted write cannot leave a truncated cache behind.
    """
    tmp_path = path + '.tmp'
    # Explicit UTF-8: ensure_ascii=False emits raw non-ASCII characters, which
    # the platform default encoding may not be able to represent.
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False)
    os.replace(tmp_path, path)


# Initialise the model exactly once. Kept unconditional so that `llm` is
# defined for later cells regardless of the cache state.
llm = init_model()

# Reuse previously computed features when a cache file is present.
if os.path.exists(FEATURES_CACHE_PATH):
    with open(FEATURES_CACHE_PATH, 'r', encoding='utf-8') as f:
        results = json.load(f)

# A cache written by an interrupted run can be partial, so work out which
# entries still lack features instead of trusting the file's mere existence.
pending = [
    idx for idx, result in enumerate(results)
    if any(key not in result for key in FEATURE_KEYS)
]

if pending:
    print("Fetching company features")
    for done, idx in enumerate(tqdm(pending), start=1):
        result = results[idx]

        company_features = get_company_features_guidance(
            llm,
            comp_name=result['name'],
            comp_ticker=result['ticker'],
            comp_hq=result['hq'],
            meta_title=result.get('title', None),
            meta_description=result.get('description', None),
            website_body=result.get('body', None),
        )

        # Fall back to 'Not Found' for any feature the model did not return.
        for key in FEATURE_KEYS:
            result[key] = company_features.get(key, 'Not Found')

        if done % CHECKPOINT_EVERY == 0:
            _save_results(results)

    # Final write so the entries processed after the last checkpoint are kept.
    _save_results(results)

Fetching company features
Fetching company features


 88%|████████▊ | 23/26 [00:11<00:01,  2.46it/s]

In [None]:
# Output column name -> key to read from each entry of `results`.
INFERENCE_COLUMNS = {
    'Company Name': 'name',
    'Company Headquarter': 'hq',
    'Company Value Proposition': 'value_proposition',
    'Company Industry': 'industry',
    'Company Vertical': 'vertical',
    'Company Target Audience': 'target_audience',
    'Company Market': 'market',
}

# Build all rows first and construct the frame in a single call rather than
# concatenating one single-row DataFrame per company. A missing key still
# raises KeyError, so incomplete feature data is not silently accepted.
records = [
    {column: result[key] for column, key in INFERENCE_COLUMNS.items()}
    for result in results
]

# Passing `columns` keeps the expected schema even when `results` is empty
# (pd.concat on an empty list would raise ValueError instead).
inference_df = pd.DataFrame(records, columns=list(INFERENCE_COLUMNS))

# Display the DataFrame
display(inference_df)


Unnamed: 0,Company Name,Company Headquarter,Company Value Proposition,Company Industry,Company Vertical,Company Target Audience,Company Market
0,Voyantis,Israel,Voyantis helps marketing teams increase ad con...,Software,SaaS,marketing teams,B2B
1,Rapyd Financial Network (2016) Ltd.,Israel,Rapyd Financial Network helps businesses strea...,Finance,FinTech,c-levels,B2B
2,Utopia Music AG,Switzerland,"Utopia Music helps artists, labels, and music ...",Entertainment,SaaS,c-levels,B2B
3,Pragmatic Play Ltd.,Malta,Pragmatic Play Ltd. helps online gaming platfo...,Entertainment,OTHER,consumers,B2B
4,K Health Inc.,USA,K Health helps individuals access high-quality...,Health Care,HealthTech,consumers,B2C
5,"FireArc, Ltd.",Israel,FireArc helps digital properties enhance user ...,Advertisements,OTHER,marketing teams,B2B
6,"5.11, Inc.",USA,"5.11, Inc. helps professionals and outdoor ent...",Consumer Goods,Ecommerce,consumers,B2C
7,Ahava-Dead Sea Laboratories Ltd.,Israel,Ahava-Dead Sea Laboratories Ltd. helps consume...,Consumer Goods,Ecommerce,consumers,B2C
8,Amagi Media Labs Private Limited,India,Amagi Media Labs Private Limited helps broadca...,Media & Communications,SaaS,c-levels,B2B
9,Miya Water Spain SL,Spain,Miya Water Spain SL helps governments and busi...,Other,OTHER,c-levels,B2B


In [None]:
from fetch_comparables import find_similar_companies


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Columns of inference_df that describe the company being matched.
COMPANY_COLUMNS = [
    'Company Name',
    'Company Headquarter',
    'Company Value Proposition',
    'Company Industry',
    'Company Vertical',
    'Company Target Audience',
    'Company Market',
]

companies = inference_df[COMPANY_COLUMNS]

# Gather the per-company result frames and concatenate once at the end;
# concatenating inside the loop re-copies all accumulated rows on every pass.
competitor_frames = []

for position in range(len(companies)):
    # Single-row frame for the current company, re-indexed to 0. Positional
    # slicing is used so this does not depend on inference_df's index labels.
    company_df = companies.iloc[[position]].reset_index(drop=True)

    # Look up comparable public companies for this private company.
    # NOTE(review): the cell's log output shows the SentenceTransformer being
    # loaded once per call; if that load happens inside find_similar_companies,
    # caching the model there would likely speed this loop up considerably.
    similar_companies_df = find_similar_companies(df, company_df, region_mapping,
                                                  vertical_filter=True,
                                                  industry_filter=False,
                                                  region_filter=True)
    competitor_frames.append(similar_companies_df)

# all_competitors_df holds the results for all companies; fall back to an
# empty frame when there was nothing to process (pd.concat rejects an empty list).
if competitor_frames:
    all_competitors_df = pd.concat(competitor_frames, ignore_index=True)
else:
    all_competitors_df = pd.DataFrame()


2024-04-16 23:47:52,378 - INFO - Load pretrained SentenceTransformer: bert-base-uncased
2024-04-16 23:47:54,246 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.53it/s]
100%|██████████| 1/1 [00:00<00:00,  2.49it/s]
2024-04-16 23:47:56,663 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


SaaS
Israel


2024-04-16 23:47:58,423 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 14.95it/s]
100%|██████████| 1/1 [00:00<00:00, 13.74it/s]
2024-04-16 23:48:00,524 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


FinTech
Israel


2024-04-16 23:48:01,442 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 12.54it/s]
100%|██████████| 1/1 [00:00<00:00, 11.61it/s]
2024-04-16 23:48:03,442 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


SaaS
Switzerland


2024-04-16 23:48:04,335 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 15.06it/s]
100%|██████████| 1/1 [00:00<00:00, 14.08it/s]
2024-04-16 23:48:06,504 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


OTHER
Malta


2024-04-16 23:48:07,425 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 15.87it/s]
100%|██████████| 1/1 [00:00<00:00, 14.94it/s]
2024-04-16 23:48:09,589 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


HealthTech
USA


2024-04-16 23:48:10,601 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 13.24it/s]
100%|██████████| 1/1 [00:00<00:00, 12.24it/s]
2024-04-16 23:48:12,738 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


OTHER
Israel


2024-04-16 23:48:13,771 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 15.52it/s]
100%|██████████| 1/1 [00:00<00:00, 14.65it/s]
2024-04-16 23:48:15,968 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


Ecommerce
USA


2024-04-16 23:48:17,050 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 14.96it/s]
100%|██████████| 1/1 [00:00<00:00, 14.03it/s]
2024-04-16 23:48:19,093 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


Ecommerce
Israel


2024-04-16 23:48:20,257 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 16.47it/s]
100%|██████████| 1/1 [00:00<00:00, 15.24it/s]
2024-04-16 23:48:22,376 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


SaaS
India


2024-04-16 23:48:23,554 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 14.49it/s]
100%|██████████| 1/1 [00:00<00:00, 13.57it/s]
2024-04-16 23:48:25,713 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


OTHER
Spain


2024-04-16 23:48:26,674 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 15.23it/s]
100%|██████████| 1/1 [00:00<00:00, 14.23it/s]
2024-04-16 23:48:28,615 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


Ecommerce
United Kingdom


2024-04-16 23:48:29,712 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 14.67it/s]
100%|██████████| 1/1 [00:00<00:00, 13.76it/s]
2024-04-16 23:48:31,683 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


FinTech
United Kingdom


2024-04-16 23:48:32,613 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 14.84it/s]
100%|██████████| 1/1 [00:00<00:00, 13.76it/s]
2024-04-16 23:48:34,583 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


HealthTech
Australia


2024-04-16 23:48:35,479 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 14.65it/s]
100%|██████████| 1/1 [00:00<00:00, 13.65it/s]
2024-04-16 23:48:37,583 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


Ecommerce
USA


2024-04-16 23:48:39,072 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 16.22it/s]
100%|██████████| 1/1 [00:00<00:00, 15.30it/s]
2024-04-16 23:48:41,002 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


OTHER
Israel


2024-04-16 23:48:42,035 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 18.62it/s]
100%|██████████| 1/1 [00:00<00:00, 17.23it/s]
2024-04-16 23:48:44,010 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


Ecommerce
USA


2024-04-16 23:48:45,223 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 18.30it/s]
100%|██████████| 1/1 [00:00<00:00, 17.06it/s]
2024-04-16 23:48:47,315 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


Ecommerce
USA


2024-04-16 23:48:48,282 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 17.77it/s]
100%|██████████| 1/1 [00:00<00:00, 16.62it/s]
2024-04-16 23:48:50,306 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


SaaS
Denmark


2024-04-16 23:48:51,351 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 13.41it/s]
100%|██████████| 1/1 [00:00<00:00, 12.54it/s]
2024-04-16 23:48:53,412 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


FinTech
Australia


2024-04-16 23:48:54,525 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 16.02it/s]
100%|██████████| 1/1 [00:00<00:00, 14.75it/s]
2024-04-16 23:48:56,539 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


TMT
Israel


2024-04-16 23:48:58,127 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 14.65it/s]
100%|██████████| 1/1 [00:00<00:00, 13.59it/s]
2024-04-16 23:49:00,266 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


Ecommerce
India


2024-04-16 23:49:01,300 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 14.69it/s]
100%|██████████| 1/1 [00:00<00:00, 13.75it/s]
2024-04-16 23:49:03,267 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


Ecommerce
Spain


2024-04-16 23:49:04,368 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 15.25it/s]
100%|██████████| 1/1 [00:00<00:00, 14.30it/s]
2024-04-16 23:49:06,274 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


OTHER
Israel


2024-04-16 23:49:07,238 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 16.27it/s]
100%|██████████| 1/1 [00:00<00:00, 15.25it/s]
2024-04-16 23:49:09,139 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


FinTech
USA


2024-04-16 23:49:10,401 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.46it/s]
100%|██████████| 1/1 [00:00<00:00,  8.74it/s]
2024-04-16 23:49:12,345 - INFO - Load pretrained SentenceTransformer: bert-base-uncased


OTHER
Israel


2024-04-16 23:49:13,280 - INFO - Use pytorch device_name: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00, 14.55it/s]
100%|██████████| 1/1 [00:00<00:00, 13.62it/s]


DeepTech
Israel


In [None]:
# Persist the combined comparables table. The DataFrame index is written too
# (to_csv default), so it comes back as an unnamed first column when re-read.
all_competitors_df.to_csv(
    path_or_buf='competitors_valueprop_vertical_v2.csv',
)