In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import re
import datetime
from sklearn.model_selection import train_test_split
import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras import layers

In [4]:
deals_df = pd.read_csv('PE_Dataset.csv')
deals_df.head()

Unnamed: 0,Name,Unnamed: 1,Fiscal Year,Pitchbook Link,Company Name,Year Founded,Employee Growth,Current Employees,Keywords,Verticals,All Industries,Lead/Sole Investors,# Follow-on Investors,Financing Status,Business Status,Deal Date,Total Invested Equity,Raised to Date,Deal Class,Deal Type,VC Round,Deal Type 2,Deal Size,Series,Deal No.,VC Round Up/Down/Flat,Current Financing Status,Financing Status Note,Current Business Status,Description,Investors,Deal Synopsis,Deal Date.1
0,Kroo,,,https://pitchbook.com/profiles/company/221945-05,Kroo,2016.0,"2021: 50, 2020: 47",80.0,"cash payment software, e-money account, finan...","FinTech, Mobile, Mobile Commerce","Application Software, Financial Software*, Oth...",,,Venture Capital-Backed,Generating Revenue,31/12/2023,,,Venture Capital,Later Stage VC,6,Series C,,Series C,6,,Venture Capital-Backed,The company is reportedly seeking Series C ve...,Generating Revenue,Developer of a payment mobile application des...,,"The company raised GBP 184,000 of seed fundin...",
1,Swoop (Financial Software),,,https://pitchbook.com/profiles/company/231457-06,Swoop (Financial Software),2018.0,"2021: 68, 2020: 58",80.0,"business financing, business funding, business...","FinTech, Mobile, SaaS, TMT","Financial Software*, Specialized Finance",,2.0,Venture Capital-Backed,Generating Revenue,13/06/2022,6.74,8.78,Venture Capital,Later Stage VC,4,Series A,6.74,Series A,8,,Venture Capital-Backed,The company raised GBP 5.4 million of Series ...,Generating Revenue,Developer of one-stop money shop application ...,"Arab Bank (AMM: ARBK), Enterprise Ireland, IAG...","The company raised GBP 97,848 of angel fundin...",
2,Fiat Republic,,,https://pitchbook.com/profiles/company/484063-30,Fiat Republic,2021.0,84.2125,25.0,"banking api tool, crypto platform, crypto sof...","Cryptocurrency/Blockchain, FinTech, SaaS","Financial Software*, Other Financial Services",Credo Ventures,1.0,Venture Capital-Backed,Generating Revenue,08/06/2022,6.0,6.0,Venture Capital,Seed Round,1,Seed Round,6.0,,2,,Venture Capital-Backed,The company raised $6 million of seed funding...,Generating Revenue,"Developer of crypto-focused, e-money-regulate...","Breega, Connect Ventures, Credo Ventures, Emer...","The company raised GBP 79,410 of angel fundin...",
3,Backd,,,https://pitchbook.com/profiles/company/498267-82,Backd,,,,"crypto exchange, defi system, defi wallets, li...","Cryptocurrency/Blockchain, FinTech","Financial Software*, Other Financial Services",Advanced Blockchain (DUS: BWQ) (Simon Telian),,Venture Capital-Backed,Startup,07/06/2022,3.5,3.5,Venture Capital,Early Stage VC,1,Series A,3.5,Series A,1,,Venture Capital-Backed,The company raised $3.5 million of Series A v...,Startup,Developer of a crypto-based consumer lending ...,"Advanced Blockchain (DUS: BWQ) (Simon Telian),...",The company raised GBP 5.56 million of equity...,
4,Bud,,,https://pitchbook.com/profiles/company/169783-84,Bud,2015.0,"2021: 166, 2020: 76, 2019: 70, 2018: 50, 2017: 23",167.0,"financial software, open banking, open bankin...","FinTech, Mobile","Application Software, Financial Software*, Oth...",TDR Capital (Gary Lindsay),1.0,Venture Capital-Backed,Generating Revenue,07/06/2022,80.0,112.88,Venture Capital,Later Stage VC,6,Series B,80.0,Series B,8,,Venture Capital-Backed,The company raised $80 million of Series B ve...,Generating Revenue,Developer of an open banking application desi...,"Outward Venture Capital, SEI Investments (NAS:...","The company raised GBP 77,057 of angel fundin...",


## 1. Data Preprocessing

### 1.1 Define functions required for Data Preprocessing and Cleaning

In [5]:
# Assign single (mean) value for employee growth across years
def update_emp_growth(df):
    mean_emp_growth = []
    for idx, value in enumerate(df["Employee Growth"]):
        if type(value) is str:
            value = value.split(',')
            if value[0].find(":") != -1:
                value = [float(val[val.find(':') + 2:]) for val in value]
            else:
                value[0] = float(value[0])
            mean_emp_growth.append(round(sum(value)/len(value)))
        else:
            mean_emp_growth.append(value)
    return mean_emp_growth

# Change data type from str to float excluding nan values
def change_type(df, column):
    new_val = []
    for idx, val in enumerate(df[column]):
        if type(val) is str:
            val = val.replace(',', '')
            val = float(val)
        new_val.append(val)   
    return new_val

In [6]:
# If deal date is not specified, get deal date from synopsis
def update_deal_date(df):
    new_date = []
    months = {"jan" :1, "january":1, "feb":2, "february":2, "mar":3, "march":3, "apr":4, "april":4, "may":5, "jun":6, "june":6, 
            "jul":7, "july":7, "aug":8, "august":8, "sept":9, "september":9, "oct":10, "october":10, 
            "nov":11, "november":11, "dec":12, "december":12}
    for idx, val in enumerate(df["Deal Date"]):
        if type(val) is not pd._libs.tslibs.timestamps.Timestamp:
            synopsis = df.iloc[idx]["Deal Synopsis"].split()
            day = []
            if len(list(filter(lambda a: (a.lower() in  months.keys()), synopsis))) == 0:
                new_date.append(val)
                continue
            day = 1
            month = ""
            year = df.iloc[idx]["Fiscal Year"] if not pd.isnull(df.iloc[idx]["Fiscal Year"]) else 2022
            for index, word in enumerate(synopsis):
                word=word.replace(",", "")
                if word.lower() in months.keys():
                    month = word
                    year_candidates = synopsis[max(0, index-2): index+3]
                    for word in year_candidates:
                        word = word.replace(",", "")
                        word = word.replace(".", "")
                        if len(word) > 4:
                            continue
                        try:
                            int_word = int(word)
                            if int_word > 31 or int_word < 1:
                                if len(word) == 2:
                                    year = int("20"+word)
                                elif len(word) == 4:
                                    year = int(word)
                                else:
                                    pass
                                continue
                            else:
                                day = int_word 
                        except ValueError:
                            continue
            new_date.append(datetime.date(year, months[month.lower()], day))
        else:
            new_date.append(val)
    return new_date

### 1.2 Data Preprocesing and Cleaning

In [7]:
def preprocess(df):
    df = df.copy()
    # Drop irrelevant, duplicate, high cardinality, single value columns 
    df = df.drop(['Unnamed: 1', "Pitchbook Link", "Deal Date.1", "Year Founded", 'Financing Status', 'Company Name', 'Investors', 'Series'], axis = 1)
    df.loc[pd.isnull(df['Deal Synopsis']), 'Deal Synopsis'] = "Unavailable"
    df["Deal Date"] = pd.to_datetime(df["Deal Date"], errors='coerce', infer_datetime_format=True)
    df["Deal Date"] = update_deal_date(df)
    df.rename(columns={'Employee Growth ': 'Employee Growth'}, inplace=True)

    df = df.loc[~df['Deal Type'].isin(['Product Crowdfunding', 'Equity Crowdfunding'])]

    df.loc[pd.isnull(df['Deal Date']), 'Deal Date'] = df.mode()['Deal Date'][0]  # For deal dates that are undisclosed in the synopsis
    #df.loc[pd.isnull(df['Year Founded']), 'Year Founded'] = df.mode()['Deal Date'][0]
    df.loc[pd.isnull(df['Fiscal Year']), 'Fiscal Year'] = df['Deal Date'].dt.year
    df['Deal Date']  = df["Deal Date"].dt.date

    df['Employee Growth'] = update_emp_growth(df)
    df.loc[pd.isnull(df['Employee Growth']), 'Employee Growth']  = round(df[~df['Employee Growth'].isnull()]['Employee Growth'].mean(), 2)
    df['Current Employees'] = change_type(df, "Current Employees")
    df.loc[pd.isnull(df['Current Employees']), 'Current Employees'] = round(df[~df['Current Employees'].isnull()]['Current Employees'].mean(), 0)
    df['Total Invested Equity'] = change_type(df, "Total Invested Equity")
    df.loc[pd.isnull(df['Total Invested Equity']), 'Total Invested Equity'] = round(df[~df['Total Invested Equity'].isnull()]['Total Invested Equity'].mean(), 2)
    df['Deal Size'] = change_type(df, 'Deal Size')
    df.loc[pd.isnull(df['Deal Size']), 'Deal Size'] = round(df[~df['Deal Size'].isnull()]['Deal Size'].mean(), 2)
    df['Raised to Date'] = change_type(df, 'Raised to Date')
    df.loc[pd.isnull(df['Raised to Date']), 'Raised to Date'] = round(df[~df['Raised to Date'].isnull()]['Total Invested Equity'].mean(), 2)

    df.loc[pd.isnull(df['# Follow-on Investors']), '# Follow-on Investors'] = 0
    df.loc[pd.isnull(df['VC Round Up/Down/Flat']), 'VC Round Up/Down/Flat'] = 0
    df.loc[pd.isnull(df['VC Round']), 'VC Round'] = -1
    df.loc[pd.isnull(df['Lead/Sole Investors']), 'Lead/Sole Investors'] = "None"
    df.loc[pd.isnull(df['Deal Type 2']), 'Deal Type 2'] = "NA"

    df["VC Round Up/Down/Flat"] = df["VC Round Up/Down/Flat"].replace({"Up Round": 1, "Down Round": -1, "Flat Round": 2})
    df["VC Round"] = df["VC Round"].replace({"Angel": 0})
    df["VC Round"] = df["VC Round"].astype(int)

    return df 

In [8]:
clean_df = preprocess(deals_df)

In [9]:
clean_df.head()

Unnamed: 0,Name,Fiscal Year,Employee Growth,Current Employees,Keywords,Verticals,All Industries,Lead/Sole Investors,# Follow-on Investors,Business Status,Deal Date,Total Invested Equity,Raised to Date,Deal Class,Deal Type,VC Round,Deal Type 2,Deal Size,Deal No.,VC Round Up/Down/Flat,Current Financing Status,Financing Status Note,Current Business Status,Description,Deal Synopsis
0,Kroo,2023.0,48.0,80.0,"cash payment software, e-money account, finan...","FinTech, Mobile, Mobile Commerce","Application Software, Financial Software*, Oth...",,0.0,Generating Revenue,2023-12-31,13.68,13.68,Venture Capital,Later Stage VC,6,Series C,15.34,6,0,Venture Capital-Backed,The company is reportedly seeking Series C ve...,Generating Revenue,Developer of a payment mobile application des...,"The company raised GBP 184,000 of seed fundin..."
1,Swoop (Financial Software),2022.0,63.0,80.0,"business financing, business funding, business...","FinTech, Mobile, SaaS, TMT","Financial Software*, Specialized Finance",,2.0,Generating Revenue,2022-06-13,6.74,8.78,Venture Capital,Later Stage VC,4,Series A,6.74,8,0,Venture Capital-Backed,The company raised GBP 5.4 million of Series ...,Generating Revenue,Developer of one-stop money shop application ...,"The company raised GBP 97,848 of angel fundin..."
2,Fiat Republic,2022.0,84.0,25.0,"banking api tool, crypto platform, crypto sof...","Cryptocurrency/Blockchain, FinTech, SaaS","Financial Software*, Other Financial Services",Credo Ventures,1.0,Generating Revenue,2022-06-08,6.0,6.0,Venture Capital,Seed Round,1,Seed Round,6.0,2,0,Venture Capital-Backed,The company raised $6 million of seed funding...,Generating Revenue,"Developer of crypto-focused, e-money-regulate...","The company raised GBP 79,410 of angel fundin..."
3,Backd,2022.0,78.29,127.0,"crypto exchange, defi system, defi wallets, li...","Cryptocurrency/Blockchain, FinTech","Financial Software*, Other Financial Services",Advanced Blockchain (DUS: BWQ) (Simon Telian),0.0,Startup,2022-06-07,3.5,3.5,Venture Capital,Early Stage VC,1,Series A,3.5,1,0,Venture Capital-Backed,The company raised $3.5 million of Series A v...,Startup,Developer of a crypto-based consumer lending ...,The company raised GBP 5.56 million of equity...
4,Bud,2022.0,77.0,167.0,"financial software, open banking, open bankin...","FinTech, Mobile","Application Software, Financial Software*, Oth...",TDR Capital (Gary Lindsay),1.0,Generating Revenue,2022-06-07,80.0,112.88,Venture Capital,Later Stage VC,6,Series B,80.0,8,0,Venture Capital-Backed,The company raised $80 million of Series B ve...,Generating Revenue,Developer of an open banking application desi...,"The company raised GBP 77,057 of angel fundin..."


In [10]:
clean_df.info()  # No null values

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2440 entries, 0 to 2546
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Name                      2440 non-null   object 
 1   Fiscal Year               2440 non-null   float64
 2   Employee Growth           2440 non-null   float64
 3   Current Employees         2440 non-null   float64
 4   Keywords                  2440 non-null   object 
 5   Verticals                 2440 non-null   object 
 6   All Industries            2440 non-null   object 
 7   Lead/Sole Investors       2440 non-null   object 
 8   # Follow-on Investors     2440 non-null   float64
 9   Business Status           2440 non-null   object 
 10  Deal Date                 2440 non-null   object 
 11  Total Invested Equity     2440 non-null   float64
 12  Raised to Date            2440 non-null   float64
 13  Deal Class                2440 non-null   object 
 14  Deal Typ

### 1.3 Flatten Data by Company (To be used for generating time series and text embeddings)

In [25]:
clean_df = clean_df.sort_values(by=['Name', 'Fiscal Year'], ascending=False)

In [26]:
company = {}
for name in clean_df['Name']:
    if name in company.keys():
        company[name] += 1
    else:
        company[name] = 1

In [27]:
emp_growth, comp_name, curr_emp, keywords, verticals, industries = [], [], [], [], [], []
bstatus, fstatus, curr_bstatus, FSNote, desc, raised, last_deal_no = [], [], [], [], [], [], []
lead_investor, fo_investor = [], []
dd_t3, dd_t2, dd_t1, dd_next, target, synopsis_t3, synopsis_t2, synopsis_t1 = [], [], [], [], [], [], [], []
prev_equity, avg_equity = [], []
class_t1, class_t2, class_t3 = [], [], []
type_t1, type_t2, type_t3 = [], [], []
type2_t1, type2_t2, type2_t3 = [], [], []
vcround_avg, size_avg, prev_vc = [], [], []

for name, num_deals in company.items():
    df = clean_df[clean_df.Name == name]
    df = df.reset_index(drop=True)
    comp_name.append(name)
    emp_growth.append(df['Employee Growth'][0])
    curr_emp.append(df['Current Employees'][0])
    keywords.append(df['Keywords'][0])
    verticals.append(df['Verticals'][0])
    industries.append(df['All Industries'][0])
    bstatus.append(df['Business Status'][0])
    fstatus.append(df['Current Financing Status'][0])
    desc.append(df['Description'][0])
    FSNote.append(df['Financing Status Note'][0])
    raised.append(df['Raised to Date'][0])
    curr_bstatus.append(['Current Bussiness Status'][0])
    dd_next.append(df['Deal Date'].iloc[0])
    if df['Deal Date'].iloc[0].year == 2022 and num_deals > 1:
        target.append(1)
        df = df.iloc[1:]
        df = df.reset_index(drop=True)
    else:
        target.append(0)  
        
    last_deal_no.append(df['Deal No.'][0])
    lead_investor.append(', '.join([investor for investor in df['Lead/Sole Investors']]))
    fo_investor.append(df['# Follow-on Investors'].mean())
    vcround_avg.append(round(df['VC Round'].mean(), 2))
    size_avg.append(round(df['Deal Size'].mean(), 2))
    avg_equity.append(round(df['Total Invested Equity'].mean(), 2))
    if num_deals == 1:
        prev_vc.append(0)
        prev_equity.append(0)
        class_t2.append("None") 
        class_t3.append("None") 
        type_t2.append("None") 
        type_t3.append("None") 
        type2_t2.append("None")
        type2_t3.append("None") 
        synopsis_t2.append("None")
        synopsis_t3.append("None")
        dd_t2.append("None")
        dd_t3.append("None")
    else:
        prev_vc.append(df["VC Round Up/Down/Flat"][0])
        prev_equity.append(df["Total Invested Equity"][0])
    if num_deals == 2 or (num_deals == 3 and df.shape[0] == 2):
        class_t3.append("None")
        type_t3.append("None")
        type2_t3.append("None")
        synopsis_t3.append("None")
        dd_t3.append("None")
        if df.shape[0] == 1:
            class_t2.append("None")
            type_t2.append("None")
            type2_t2.append("None")
            synopsis_t2.append("None")
            dd_t2.append("None")
    
    for idx, row in df.iterrows():
        if idx > 2:
            break
        elif idx == 0:
            class_t1.append(row["Deal Class"])
            type_t1.append(row["Deal Type"])
            type2_t1.append(row["Deal Type 2"])
            synopsis_t1.append(row["Deal Synopsis"])
            dd_t1.append(row["DateVector"])
        elif idx == 1:
            class_t2.append(row["Deal Class"])
            type_t2.append(row["Deal Type"])
            type2_t2.append(row["Deal Type 2"])
            synopsis_t2.append(row["Deal Synopsis"])
            dd_t2.append(row["DateVector"])
        else:
            class_t3.append(row["Deal Class"])
            type_t3.append(row["Deal Type"])
            type2_t3.append(row["Deal Type 2"])
            synopsis_t3.append(row["Deal Synopsis"])
            dd_t3.append(row["DateVector"])

In [28]:
flat_dict = {"Company": comp_name, "EmployeeGrowth": emp_growth, "CurrentEmployees": curr_emp,
             "Keywords": keywords, "Verticals": verticals, "Industries": industries, 
             "BusinessStatus": bstatus, "CurrentBusinessStatus": curr_bstatus, "CurrentFinancingStatus": fstatus,
             "TotalRaised": raised, "LastDealNo": last_deal_no, "PrevVC(Up/Down)": prev_vc, "AvgDealSize": size_avg,
             "AvgEquity": avg_equity, "PrevEquity": prev_equity,
             "AvgVCRound": vcround_avg, "LeadInvestor": lead_investor, "NumFollowOnInvestors": fo_investor,
             "DealDate_t3": dd_t3, "Class_t3": class_t3, "Type_t3": type_t3, "Type2_t3": type2_t3, "Synopsis_t3": synopsis_t3,
             "DealDate_t2": dd_t2, "Class_t2": class_t2, "Type_t2": type_t2, "Type2_t2": type2_t2, "Synopsis_t2": synopsis_t2,
             "DealDate_t1": dd_t1, "Class_t1": class_t1, "Type_t1": type_t1, "Type2_t1": type2_t1, "Synopsis_t1": synopsis_t1,
             "Description": desc, "FinStatusNote": FSNote, "NextDealDate": dd_next, "isDeal": target}

In [29]:
flat_df = pd.DataFrame(flat_dict)
flat_df.head(20)

Unnamed: 0,Company,EmployeeGrowth,CurrentEmployees,Keywords,Verticals,Industries,BusinessStatus,CurrentBusinessStatus,CurrentFinancingStatus,TotalRaised,LastDealNo,PrevVC(Up/Down),AvgDealSize,AvgEquity,PrevEquity,AvgVCRound,LeadInvestor,NumFollowOnInvestors,DealDate_t3,Class_t3,Type_t3,Type2_t3,Synopsis_t3,DealDate_t2,Class_t2,Type_t2,Type2_t2,Synopsis_t2,DealDate_t1,Class_t1,Type_t1,Type2_t1,Synopsis_t1,Description,FinStatusNote,NextDealDate,isDeal
0,vabble (Financial Software),78.29,3.0,"financial platform, funding options, instituti...",FinTech,"Financial Software*, Other Capital Markets/Ins...",Generating Revenue,Current Bussiness Status,Venture Capital-Backed,13.68,1,0,15.34,13.68,0.0,1.0,Fuel Ventures,0.0,,,,,,,,,,,"[0.0, 0.0, 0.0, 0.09221074, 0.011161167, 0.002...",Venture Capital,Seed Round,Seed Round,"The company raised GBP 947,791 of venture fun...",Developer of a financial platform intended to...,The company raised seed funding from Fuel Vent...,2021-01-01,0
1,trustshare,84.0,16.0,"financial services, financial services app, f...","B2B Payments, FinTech, SaaS","Business/Productivity Software, Financial Soft...",Generating Revenue,Current Bussiness Status,Venture Capital-Backed,4.16,2,1,3.2,3.2,3.2,1.0,Nauta Capital (Carles Roqueta),0.0,,,,,,,,,,,"[0.0, 0.038540773, 0.09711099, 0.22219688, 0.0...",Venture Capital,Seed Round,Seed Round,The company raised venture funding from Loyal ...,Developer of an online platform intended for ...,The company raised an undisclosed amount of v...,2022-01-04,1
2,swIDch,9.0,9.0,"authentication technology, digital payment ca...","FinTech, SaaS","Business/Productivity Software, Network Manage...",Generating Revenue,Current Bussiness Status,Venture Capital-Backed,13.68,2,0,15.34,13.68,0.0,1.0,BKT Capital,0.0,,,,,,,,,,,"[0.13642836, 0.100364886, 0.0, 0.04559175, 0.0...",Venture Capital,Early Stage VC,,The company raised EUR 7.5 million of Series ...,Developer of authentication technology intend...,The company raised venture funding from BKT Ca...,2021-01-01,0
3,sqft.capital,78.29,11.0,"data insights marketplace, development financ...",FinTech,"Application Software, Business/Productivity So...",Generating Revenue,Current Bussiness Status,Venture Capital-Backed,0.93,2,0,0.76,0.76,0.0,1.0,RO Capital Partners (Edward Rowlandson),1.0,,,,,,,,,,,"[0.0, 0.5521339, 0.0, 0.0, 0.089344405, 0.0, 0...",Venture Capital,Later Stage VC,,The company raised GBP 8.39 million of ventur...,Developer of a funding and data insights mark...,"The company raised GBP 600,000 of venture fun...",2022-10-05,0
4,so-sure,12.0,21.0,"contents insurance, insurance application, ins...","FinTech, InsurTech, Mobile, SaaS, TMT","Financial Software, Multi-line Insurance*",Generating Revenue,Current Bussiness Status,Venture Capital-Backed,4.53,4,1,8.8,7.97,3.7,-0.25,"Breega (Benoit Marrel), AFL Incubator (George ...",0.0,"[0.0, 0.24082406, 0.0, 0.0, 0.027381714, 0.044...",Other,Accelerator/Incubator,,"The company raised GBP 949,701 of venture fun...","[0.08290465, 0.0404652, 0.0, 0.05820749, 0.076...",Other,Accelerator/Incubator,,"The company raised GBP 350,000 of seed funding...","[0.0, 0.12277673, 0.0, 0.41839737, 0.09081799,...",Venture Capital,Seed Round,Seed Round,The company raised GBP 1.81 million of seed f...,Developer of an online insurance platform des...,The company raised GBP 2.83 million of seed f...,2018-04-10,0
5,pirkx,14.0,12.0,"employee benefit, employee engagement, employ...","FinTech, SaaS","Other Commercial Services, Other Financial Ser...",Generating Revenue,Current Bussiness Status,Venture Capital-Backed,2.29,1,0,1.61,1.61,1.61,1.0,,0.0,,,,,,,,,,,"[0.0, 0.03222477, 0.2686807, 0.013945032, 0.0,...",Venture Capital,Seed Round,Seed Round,The company raised GBP 10.7 million of Series...,Developer of a subscription-based platform de...,"The company raised GBP 523,918 of venture fun...",2022-05-04,1
6,pawaPay,84.0,41.0,"business payment software, business payment s...","FinTech, Mobile","Financial Software*, Other Financial Services",Generating Revenue,Current Bussiness Status,Venture Capital-Backed,9.0,2,0,12.17,11.34,9.0,1.5,"88mph (Kresten Buch), MSA Capital, Kepple Afri...",0.5,,,,,,"[0.0, 0.20842521, 0.0, 0.0, 0.1917238, 0.0, 0....",Venture Capital,Early Stage VC,,"The company raised GBP 300,000 of venture fun...","[0.0, 0.45299888, 0.0, 0.0, 0.06377394, 0.0, 0...",Venture Capital,Seed Round,Seed Round,The company raised GBP 29.85 million of ventu...,Developer of an online payment platform inten...,The company raised $9 million of seed funding...,2021-08-26,0
7,mnAI,84.0,12.0,"business insights, data collation, data collec...","Big Data, FinTech, Marketing Tech","Business/Productivity Software, Financial Soft...",Generating Revenue,Current Bussiness Status,Venture Capital-Backed,13.68,1,0,15.34,13.68,0.0,1.0,Ayanda Capital,0.0,,,,,,,,,,,"[0.10927907, 0.035472877, 0.0, 0.06200006, 0.0...",Venture Capital,Early Stage VC,,The company joined Y Combinator as part of Wi...,Developer of a market insight platform design...,The company raised venture funding from Ayanda...,2021-01-01,0
8,loyalBe,4.0,5.0,"api, daas, data and intelligent platform, dat...","FinTech, Mobile, Mobile Commerce","Business/Productivity Software*, Financial Sof...",Product In Beta Test,Current Bussiness Status,Venture Capital-Backed,0.65,5,1,0.32,0.14,0.15,1.5,"Techstart Ventures, Techstart Ventures (Ryan M...",0.5,,,,,,"[0.17092048, 0.059997894, 0.0, 0.16716602, 0.0...",Venture Capital,Seed Round,Seed Round,"The company raised GBP 121,808 of venture fun...","[0.0, 0.055572875, 0.03404636, 0.10929588, 0.0...",Venture Capital,Seed Round,Seed Round,"The company raised GBP 88,482 of seed funding...",Operator of a digital loyalty platform design...,"The company raised $500,000 of seed funding t...",2021-03-31,0
9,iwoca,250.0,312.0,"business credit, business lending service, cr...","FinTech, TMT","Financial Software, Other Financial Services, ...",Profitable,Current Bussiness Status,Venture Capital-Backed,478.13,10,0,44.78,15.87,13.68,4.0,"None, None, None, Prime Ventures (Sake Bosch),...",1.571429,"[0.15930897, 0.010772042, 0.0, 0.15698272, 0.1...",Venture Capital,Later Stage VC,,The company raised an undisclosed amount of se...,"[0.3164297, 1.3873713, 0.069218874, 0.52956444...",Venture Capital,Later Stage VC,Series D,The company joined Techstars Toronto Accelera...,"[0.17803128, 0.2664707, 0.0, 0.08360991, 0.0, ...",Venture Capital,Later Stage VC,,The company raised GBP 4.83 million of seed f...,Operator of a credit finance platform designe...,The company raised GBP 10 million of venture ...,2020-04-08,0


## 2. Generate Feature Embeddings

### Time Series Features

In [21]:
ts_features = ['Deal Type', 'Deal Type 2', 'Deal Class', 'Deal Date']   
ts_df = clean_df[ts_features] 
ts_df

Unnamed: 0,Deal Type,Deal Type 2,Deal Class,Deal Date
0,Later Stage VC,Series C,Venture Capital,2023-12-31
1,Later Stage VC,Series A,Venture Capital,2022-06-13
2,Seed Round,Seed Round,Venture Capital,2022-06-08
3,Early Stage VC,Series A,Venture Capital,2022-06-07
4,Later Stage VC,Series B,Venture Capital,2022-06-07
...,...,...,...,...
2542,Seed Round,Seed Round,Venture Capital,2022-04-21
2543,Early Stage VC,,Venture Capital,2021-01-01
2544,Early Stage VC,,Venture Capital,2018-03-29
2545,Early Stage VC,,Venture Capital,2018-11-26


In [22]:
ts_df.set_index('Deal Date', inplace=True)
ts_df

Unnamed: 0_level_0,Deal Type,Deal Type 2,Deal Class
Deal Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-12-31,Later Stage VC,Series C,Venture Capital
2022-06-13,Later Stage VC,Series A,Venture Capital
2022-06-08,Seed Round,Seed Round,Venture Capital
2022-06-07,Early Stage VC,Series A,Venture Capital
2022-06-07,Later Stage VC,Series B,Venture Capital
...,...,...,...
2022-04-21,Seed Round,Seed Round,Venture Capital
2021-01-01,Early Stage VC,,Venture Capital
2018-03-29,Early Stage VC,,Venture Capital
2018-11-26,Early Stage VC,,Venture Capital


In [15]:
def onehot_encode_pd(df, col_name):
    dummies = pd.get_dummies(df[col_name], prefix=col_name)
    return pd.concat([df, dummies], axis=1).drop(columns=[col_name])

ts_df = onehot_encode_pd(ts_df, 'Deal Class')
ts_df = onehot_encode_pd(ts_df, 'Deal Type')
ts_df = onehot_encode_pd(ts_df, 'Deal Type 2')
ts_df

Unnamed: 0_level_0,Deal Class_Corporate,Deal Class_Hedge Fund,Deal Class_Individual,Deal Class_Other,Deal Class_Venture Capital,Deal Type_Accelerator/Incubator,Deal Type_Angel (individual),Deal Type_Early Stage VC,Deal Type_Later Stage VC,Deal Type_Seed Round,Deal Type 2_Angel (individual),Deal Type 2_NA,Deal Type 2_Seed Round,Deal Type 2_Series 1,Deal Type 2_Series 2,Deal Type 2_Series 3,Deal Type 2_Series A,Deal Type 2_Series A1,Deal Type 2_Series A2,Deal Type 2_Series A3,Deal Type 2_Series B,Deal Type 2_Series B1,Deal Type 2_Series B2,Deal Type 2_Series C,Deal Type 2_Series C1,Deal Type 2_Series C2,Deal Type 2_Series C3,Deal Type 2_Series D,Deal Type 2_Series E,Deal Type 2_Series F,Deal Type 2_Series G,Deal Type 2_Series H
Deal Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
2023-12-31,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2022-06-13,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2022-08-06,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2022-07-06,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2022-07-06,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-01-01,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2021-01-01,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2021-01-01,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2021-01-01,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
training_mean = ts_df.mean()
training_std = ts_df.std()
df_training_value = (ts_df - training_mean) / training_std
len(df_training_value)

2440

In [17]:
TIME_STEPS = 2440

# Generate training sequences for use in the model.
def create_sequences(values, time_steps=TIME_STEPS):
    output = []
    for i in range(len(values) - time_steps + 1):
        output.append(values[i : (i + time_steps)])
    return np.stack(output)


x_train = create_sequences(df_training_value.values)
print("Training input shape: ", x_train.shape)

Training input shape:  (1, 2440, 32)


In [20]:
model = keras.Sequential(
    [
        layers.Input(shape=(x_train.shape[1], x_train.shape[2])),
        layers.Conv1D(
            filters=32, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        layers.Dropout(rate=0.2),
        layers.Conv1D(
            filters=16, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        layers.Conv1DTranspose(
            filters=16, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        layers.Dropout(rate=0.2),
        layers.Conv1DTranspose(
            filters=32, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        layers.Conv1DTranspose(filters=1, kernel_size=7, padding="same"),
    ]
)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss="mse")
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_2 (Conv1D)           (None, 1220, 32)          7200      
                                                                 
 dropout_2 (Dropout)         (None, 1220, 32)          0         
                                                                 
 conv1d_3 (Conv1D)           (None, 610, 16)           3600      
                                                                 
 conv1d_transpose_3 (Conv1DT  (None, 1220, 16)         1808      
 ranspose)                                                       
                                                                 
 dropout_3 (Dropout)         (None, 1220, 16)          0         
                                                                 
 conv1d_transpose_4 (Conv1DT  (None, 2440, 32)         3616      
 ranspose)                                            

In [21]:
model.pop()   # remove output layer (None, 2440, 1) to get the latent representation
latent_rep = model.predict(x_train)[0]  

In [23]:
latent_vec = []
for i in range(ts_df.shape[0]):
    latent_vec.append(latent_rep[i])

latent_vec
clean_df['DateVector'] = latent_vec

In [30]:
timeseries_df = flat_df[["DealDate_t1", "DealDate_t2", "DealDate_t3"]]
timeseries_df

Unnamed: 0,DealDate_t1,DealDate_t2,DealDate_t3
0,"[0.0, 0.0, 0.0, 0.09221074, 0.011161167, 0.002...",,
1,"[0.0, 0.038540773, 0.09711099, 0.22219688, 0.0...",,
2,"[0.13642836, 0.100364886, 0.0, 0.04559175, 0.0...",,
3,"[0.0, 0.5521339, 0.0, 0.0, 0.089344405, 0.0, 0...",,
4,"[0.0, 0.12277673, 0.0, 0.41839737, 0.09081799,...","[0.08290465, 0.0404652, 0.0, 0.05820749, 0.076...","[0.0, 0.24082406, 0.0, 0.0, 0.027381714, 0.044..."
...,...,...,...
988,"[0.102184944, 0.18636757, 0.0, 0.0, 0.02633296...","[0.1419558, 0.06810707, 0.0, 0.12760532, 0.064...","[0.0, 0.036394794, 0.20835912, 0.16356935, 0.0..."
989,"[0.09169653, 0.0, 0.042879775, 0.08549973, 0.1...",,
990,"[0.12692794, 0.16879615, 0.0, 0.0, 0.05778403,...","[0.0, 0.0, 0.23588568, 0.28649843, 0.001226592...",
991,"[0.28570512, 0.66700345, 0.0, 0.08349978, 0.00...",,


In [31]:
ts_embeddings = []
for idx, row in timeseries_df.iterrows():
    temp = []
    if type(row['DealDate_t1']) is not str:
        temp.append(row['DealDate_t1'])
    if type(row['DealDate_t2']) is not str:
        temp.append(row['DealDate_t2'])
    if type(row['DealDate_t3']) is not str:
        temp.append(row['DealDate_t3'])
    ts_embeddings.append(np.mean(temp, axis=0)) 

In [32]:
ts_embeddings = np.stack(ts_embeddings, axis=0)
ts_embeddings

### 2.2 Generate Text Embeddings using DistilBERT

In [44]:
from transformers import BertTokenizer, TFBertModel, BertConfig, TFDistilBertModel, DistilBertTokenizer, DistilBertConfig

In [42]:
# Text preprocessing
def preprocess_sentence(line):
    line = line.replace("$", " dollars ")
    line = line.replace(" GBP ", " pounds ")
    line = re.sub(r"[,']+", "", line)
    line = re.sub(r"[^a-zA-Z0-9. ]+", " ", line)
    line = re.sub(r'[" "]+', " ", line)
    return line

In [43]:
text_features = ['Synopsis_t1', "Synopsis_t2", "Synopsis_t3", "Description", 'FinStatusNote', "Keywords", "Verticals", "Industries", 'BusinessStatus', 'CurrentBusinessStatus',
       'CurrentFinancingStatus', 'LeadInvestor']
for column in text_features:
    flat_df[column] = flat_df[column].map(preprocess_sentence)

In [45]:
dbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_basic_tokenize=True)
dbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_projector', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [46]:
def create_embeddings(column):
    emb = []
    sentences = flat_df[column]
    for sent in sentences:
        sent = dbert_tokenizer.basic_tokenizer.tokenize(sent)
        dbert_inps=dbert_tokenizer.encode_plus(sent, add_special_tokens = True,max_length = 250, pad_to_max_length = False, return_attention_mask = True,truncation=True)
        out=dbert_model(np.asarray(dbert_inps['input_ids']).reshape(1,-1),np.asarray(dbert_inps['attention_mask']).reshape(1,-1))
        emb.append(out[0][:,0,:].numpy())
    return emb

In [None]:
for column in text_features:
    flat_df[column] = create_embeddings(column)

In [48]:
text_df = flat_df[text_features].to_numpy()
text_df

In [49]:
text_embeddings = np.mean(text_df, axis = 1)
text_embeddings = [emb.flatten() for emb in text_embeddings]
text_embeddings = np.stack(text_embeddings, axis = 0)
text_embeddings

### 2.3 Generate Numerical Features

In [50]:
from sklearn.preprocessing import MinMaxScaler

In [51]:
scaler = MinMaxScaler()

In [52]:
num_features = ["EmployeeGrowth", "CurrentEmployees", "TotalRaised", "LastDealNo", 'PrevVC(Up/Down)', 'AvgDealSize', 'AvgEquity', 'PrevEquity',
                'AvgVCRound', 'NumFollowOnInvestors']

In [53]:
flat_df[num_features] = scaler.fit_transform(flat_df[num_features])

In [54]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

x_train = flat_df[num_features].to_numpy()
y_train = flat_df.isDeal.to_numpy().astype('int32')

model = Sequential()
model.add(Dense(64, input_dim=10, activation='relu'))
model.add(Dropout(0.5)) 
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5)) 
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',  
              metrics=['accuracy'])

model.fit(x_train, y_train,
          epochs=10,
          batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fb02bf80520>

In [55]:
model2 = Sequential()
model2.add(Dense(64, input_dim = 10, weights=model.layers[0].get_weights()))
model2.add(Activation('tanh'))

In [56]:
num_embeddings = model2.predict(flat_df[num_features].to_numpy())

### 2.4 Concatenate Latent Feature Vectors

In [105]:
predict_features = np.concatenate((ts_embeddings, text_embeddings, num_embeddings), axis = 1)

In [106]:
ts_embeddings.shape, num_embeddings.shape, text_embeddings.shape, predict_features.shape

((993, 32), (993, 64), (993, 768), (993, 864))

In [107]:
target = flat_df.isDeal.to_numpy()

In [108]:
df_vectors = pd.DataFrame(predict_features)
df_vectors["Company"] = flat_df["Company"]
df_vectors["isDeal"] = flat_df["isDeal"]
df_vectors = df_vectors.sample(frac=1)
company = df_vectors["Company"]
df_vectors.drop("Company", axis = 1, inplace = True)

## 3. MLP model

In [112]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

In [171]:
df_pred_features = df_vectors.loc[:, df_vectors.columns != 'isDeal']

In [173]:
train_x = df_pred_features[0:744]
train_y = df_vectors["isDeal"][0:744]

test_x = df_pred_features[744:]
test_y = df_vectors["isDeal"][744:]

In [138]:
train_x.shape, test_y.shape, train_y.shape, test_x.shape

((744, 864), (249,), (744,), (249, 864))

In [139]:
train_x = scaler.fit_transform(train_x)
test_x = scaler.fit_transform(test_x)

### Model 1 : sklearn Perceptron (generic)

In [140]:
Classifier = Perceptron(random_state = 40)

In [141]:
Classifier.fit(train_x, train_y)

Perceptron(random_state=40)

In [142]:
pred = Classifier.predict(test_x)
pred

In [143]:
score = np.round(accuracy_score(test_y, pred), 4)
score

0.8353

In [145]:
confusion_matrix(test_y, pred) 

## Model 2: MLP (as per paper)
(trained with Relu activation; sigmoid applied to feature representation (prediction layer))

In [184]:
model = Sequential()
model.add(Dense(64, input_dim=864, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',   # alt: rmsprop
              metrics=['accuracy'])

model.fit(train_x, train_y,
          epochs=75,
          batch_size=128)

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75
Epoch 47/75
Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75
Epoch 57/75
Epoch 58/75
Epoch 59/75
Epoch 60/75
Epoch 61/75
Epoch 62/75
Epoch 63/75
Epoch 64/75
Epoch 65/75
Epoch 66/75
Epoch 67/75
Epoch 68/75
Epoch 69/75
Epoch 70/75
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75


<keras.callbacks.History at 0x7fb0289d3880>

In [185]:
pred_prob = model.predict(test_x)
pred_prob

In [149]:
# Find threshold corresponding to optimal accuracy-recall tradeoff 
for prob in [0.3, 0.25, 0.2, 0.15, 0.1]:
    keras_pred = [1 if val >= prob else 0 for val in pred_prob]
    tn, fp, fn, tp = confusion_matrix(test_y, keras_pred).ravel()
    #tn, fp, fn, tp
    print(tn, fp, fn, tp, round(tp/(tp+fn),4), (tp/(tp + 0.5*(fp+fn))))

210 0 39 0 0.0 0.0
210 0 39 0 0.0 0.0
207 3 37 2 0.0513 0.09090909090909091
179 31 24 15 0.3846 0.35294117647058826
132 78 13 26 0.6667 0.36363636363636365


In [188]:
# Startup is eligible for funding if probability of success is greater than 0.3
keras_pred = [1 if val > 0.2 else 0 for val in pred_prob]
confusion_matrix(test_y, keras_pred)

In [157]:
list(company)[744:]

['Yordex',
 'Landbay',
 'DeFinity (Financial Software)',
 'MoneyExpert',
 'HANetf',
 'FinGo',
 'Voscuris',
 'Payfriendz',
 'Connectd',
 'Kyshi',
 'Cashfac',
 'Plum(Financial Software)',
 'Donr',
 'Pave (Financial Software)',
 'Float (Edinburgh)',
 'Identidot',
 'Kwikpay',
 'Bumper',
 'Blockchain.com',
 'Amero',
 'Osu',
 'Trakti',
 'swIDch',
 'Envelop Risk',
 'Transactive',
 'Arkera',
 'Copper',
 'CIRCA5000',
 'DMA LINK',
 'XPO (Financial Software)',
 'Abaka',
 'Shares',
 'RepOptim',
 'GoodBox (B2C Electronics)',
 'Kibo Finance',
 'DeepView',
 'Argent',
 'NovaFori',
 'iov42',
 'BCB Group',
 'Bite Investments',
 'Unizest',
 'ProsperUs (Financial Software)',
 'Alpima',
 'BOXD',
 'Koodoo (Financial Software)',
 'Zilch',
 'CrowdJustice',
 'Tomato Pay',
 'Weavr',
 'Wagonex',
 'Lunabets',
 'Heyguevara.com',
 'Rossum',
 'Offgrid.finance',
 'Apperio',
 'Web3Games',
 'Ophelos',
 'Twig',
 'Adsum (Business/Productivity Software)',
 'Parfin',
 'Iclima Earth',
 'Apexx',
 'Tulipshare',
 'Uqudo',
 'Ti

In [190]:
predictions = {"CompanyName": list(company)[744:], "PredictedProbability": pred_prob.tolist(), "isFunding":test_y.to_list()}
predictions_df = pd.DataFrame(predictions)

In [205]:
predictions_df = predictions_df.sort_values(by=['PredictedProbability'], ascending=False)
predictions_df.to_csv("Predictions.csv", header=True)