In [136]:
import pandas as pd
import numpy as np

## import and process deals.csv

In [158]:
# Load and clean the deal records.
# The path uses a forward slash: the original 'investmentsUK\deals.csv' put a
# backslash inside a normal string literal, which is an invalid escape sequence
# and only resolves as a path separator on Windows.
deals = (
    pd.read_csv('investmentsUK/deals.csv')
    # drop the URL columns and the unverified amount column
    .drop(columns=['Beauhurst company URL', 'Beauhurst deal URL', 'Amount raised (converted to GBP)'])
    # shorter name for the verified amount
    .rename(columns={'Verified investment amount (converted to GBP)': 'Investment amount'})
    # discard every row that has any missing value
    .dropna()
    # parse the ISO-formatted deal dates into datetimes
    .assign(**{'Deal date': lambda frame: pd.to_datetime(frame['Deal date'], format='%Y-%m-%d')})
    # order alphabetically by company
    .sort_values(by=['Company name'])
)
# Author's note from an earlier run: print(deals) -> [7800 rows x 3 columns]

## import and process liquidity.csv

In [160]:
# Load and tidy the exit/death records.
# The path uses a forward slash: the original 'investmentsUK\liquidity.csv' put
# a backslash inside a normal string literal, which is an invalid escape
# sequence and only resolves as a path separator on Windows.
liquidity = (
    pd.read_csv('investmentsUK/liquidity.csv')
    # drop the URL and registration-number columns
    .drop(columns=['Beauhurst URL', 'Companies House Number'])
    # shorter column names
    .rename(columns={"Date of the company's exit or death": "Exit date",
                     "Current Stage of Evolution": "Current stage"})
    # order alphabetically by company
    .sort_values(by=['Company name'])
)
# 'Exit date' is deliberately left unparsed in this cell (no datetime
# conversion is applied here).
print(liquidity)
liquidity.info()

          Company name Incorporation date Current stage   Exit date  \
641   10 Digital Media                NaN          Dead  27/11/2018   
1020   10 Minutes With         19/03/2013          Dead  29/06/2017   
3197      1066 Airsoft         13/05/2014          Dead  27/02/2019   
1076      14M Genomics         09/01/2014          Dead  21/03/2016   
3358              1855         27/02/2013          Dead  20/02/2020   
...                ...                ...           ...         ...   
1944             wayve         01/10/2013          Dead  16/08/2017   
2025      whatleads.to         10/12/2013          Dead  16/03/2017   
320           wnDirect         14/07/2011        Exited  04/01/2017   
2855        yWasteFood         27/11/2015          Dead  20/11/2018   
1663             âcasă         19/07/2013          Dead  18/03/2020   

      Exit price  
641          0.0  
1020         0.0  
3197         0.0  
1076         0.0  
3358         0.0  
...          ...  
1944         0

## inner join to merge deals.csv and liquidity.csv

In [166]:
# Combine each deal with its company's exit information.
# pd.merge defaults to an INNER join (the original comment said "left join");
# the join type is spelled out so the code and its description agree.
startups = pd.merge(deals, liquidity, on='Company name', how='inner')

# A dead company returns nothing to investors, so its exit price is set to 0.
startups.loc[startups['Current stage'] == "Dead", 'Exit price'] = 0

startups = (
    startups
    # rows still lacking an exit price cannot be used downstream
    .dropna(subset=['Exit price'])
    # exit dates are parsed with the day/month/year format; one conversion is
    # enough (the original re-parsed the already-converted column, a no-op)
    .assign(**{'Exit date': lambda frame: pd.to_datetime(frame['Exit date'], format='%d/%m/%Y')})
)
# 'Incorporation date' is not converted here and keeps its loaded values.
print(startups)
startups.info()

          Company name  Deal date  Investment amount Incorporation date  \
0     10 Digital Media 2015-11-16          3398454.0                NaN   
1         1066 Airsoft 2016-05-31            23668.0         13/05/2014   
2         14M Genomics 2014-12-16          5000000.0         09/01/2014   
3                 1855 2017-06-30            20000.0         27/02/2013   
4       1st Impression 2013-11-22            75005.0         26/08/2009   
...                ...        ...                ...                ...   
7452        yWasteFood 2016-09-09            50000.0         27/11/2015   
7453             âcasă 2014-07-21           157791.0         19/07/2013   
7454             âcasă 2018-06-20          3286339.0         19/07/2013   
7455             âcasă 2015-10-28           799989.0         19/07/2013   
7456             âcasă 2017-10-31           367361.0         19/07/2013   

     Current stage  Exit date  Exit price  
0             Dead 2018-11-27         0.0  
1          

In [170]:
# Keep only the companies that appear in more than one deal row,
# i.e. those that raised more than one funding round.
count_value = startups['Company name'].value_counts()    # deal rows per company
print(count_value)
# Length: 2833
count_value1 = count_value.loc[count_value.gt(1)]    # companies with 2+ rounds
print(count_value1)
# Length: 1405
rounds_company = count_value1.index.tolist()
startups = (
    startups
    .loc[startups['Company name'].isin(rounds_company)]
    .sort_values(by=['Company name', 'Deal date'])    # chronological within each company
)
print(startups)
# [4154 rows x 7 columns]

Houseology            13
NetThings             12
Microtest Matrices    11
Streetlife            11
TenTel                 9
                      ..
WorkinFashion          2
Volaro                 2
Gnodal                 2
FindsYou               2
Each Life              2
Name: Company name, Length: 1405, dtype: int64
Houseology            13
NetThings             12
Microtest Matrices    11
Streetlife            11
TenTel                 9
                      ..
WorkinFashion          2
Volaro                 2
Gnodal                 2
FindsYou               2
Each Life              2
Name: Company name, Length: 1405, dtype: int64
        Company name  Deal date  Investment amount Incorporation date  \
7       201 Software 2016-10-31            29999.0         04/09/2015   
6       201 Software 2017-03-15            29999.0         04/09/2015   
9     2BE Technology 2014-02-14            10000.0         18/12/2013   
8     2BE Technology 2015-04-02            80000.0         18/12

In [171]:
# Write the multi-round startups table to startups.csv in the working directory.
startups.to_csv(path_or_buf='startups.csv')

## import and process shares.csv

In [172]:
# Load and clean the shareholder filings, then derive the `investors` table.
# Changes versus the original cell:
#   * forward-slash path ('investmentsUK\shares.csv' put a backslash inside a
#     normal string literal: invalid escape sequence, Windows-only path);
#   * one method chain, so no columns are assigned onto a filtered frame
#     (a common source of SettingWithCopyWarning);
#   * the filing date is parsed once (the second conversion was a no-op).
shares = (
    pd.read_csv('investmentsUK/shares.csv')
    # keep only the columns used in the analysis
    .loc[:, ['Company name', 'filing_date', 'first_name', 'last_name',
             'number_of_shares', 'percent_class', 'percent_total']]
    # rows without a share count or a last name are unusable
    .dropna(subset=['number_of_shares', 'last_name'])
    # remove holdings of zero shares
    .loc[lambda frame: ~frame['number_of_shares'].isin([0])]
    # "<first>_<last>"; a missing first name becomes '' so the result
    # starts with '_' (e.g. '_SPORTLOBSTER SA')
    .assign(**{'Investor name': lambda frame: frame['first_name'].fillna('')
                                              .str.cat(frame['last_name'], sep='_')})
    .drop(columns=['first_name', 'last_name'])
    # remove rows that are identical in every column
    .drop_duplicates()
    # filing dates are parsed with the day/month/year format
    .assign(filing_date=lambda frame: pd.to_datetime(frame['filing_date'], format='%d/%m/%Y'))
    # order by company, then filing date, investor and share count
    .sort_values(by=['Company name', 'filing_date', 'Investor name', 'number_of_shares'])
)

# For each (company, investor, share count) keep only the earliest filing, so
# later filings that repeat an unchanged holding are dropped.
# Author's note from an earlier run: [179719 rows x 6 columns] --> [87229 rows x 6 columns]
investors = shares.drop_duplicates(
    subset=['Company name', 'Investor name', 'number_of_shares'],
    keep='first',
)
print(investors)

           Company name filing_date  number_of_shares  percent_class  \
33100  10 Digital Media  2015-06-18             100.0        100.000   
33103  10 Digital Media  2018-06-18             100.0        100.000   
52887   10 Minutes With  2013-03-19         1000000.0        100.000   
52888   10 Minutes With  2014-03-19           96167.0          3.730   
52899   10 Minutes With  2014-03-19           30000.0          1.164   
...                 ...         ...               ...            ...   
85914             âcasă  2019-07-19          141309.0          5.620   
85855             âcasă  2019-07-19          282619.0         11.239   
85917             âcasă  2019-07-19           15701.0          0.624   
85921             âcasă  2020-07-19           31402.0          1.249   
85920             âcasă  2020-07-19           64938.0          2.582   

       percent_total                             Investor name  
33100        100.000                          _SPORTLOBSTER SA  
33103

In [173]:
# Write the de-duplicated investor holdings to investors.csv in the working directory.
investors.to_csv(path_or_buf='investors.csv')

## Inner join to merge startups.csv and investors.csv 

In [186]:
# Pair every deal of a company with every investor filing of the same company.
df = (
    startups.merge(investors, how='inner', on='Company name')
    # fix the column order used by the rest of the notebook
    .loc[:, ['Company name', 'Deal date', 'Investment amount', 'filing_date',
             'Investor name', 'number_of_shares', 'percent_class', 'percent_total',
             'Current stage', 'Incorporation date', 'Exit date', 'Exit price']]
    .sort_values(by=['Company name', 'Investor name', 'Deal date', 'filing_date'])
)
# Keep only pairs where the filing comes strictly after the deal.
# Author's note from an earlier run: [151030 rows x 12 columns] --> [117438 rows x 12 columns]
df = df.loc[df['filing_date'].gt(df['Deal date'])]
print(df)

        Company name  Deal date  Investment amount filing_date  \
1       201 Software 2016-10-31            29999.0  2017-09-03   
14      201 Software 2017-03-15            29999.0  2017-09-03   
2       201 Software 2016-10-31            29999.0  2017-09-03   
15      201 Software 2017-03-15            29999.0  2017-09-03   
3       201 Software 2016-10-31            29999.0  2017-09-03   
...              ...        ...                ...         ...   
151029         âcasă 2018-06-20          3286339.0  2020-07-19   
150741         âcasă 2014-07-21           157791.0  2018-07-19   
150832         âcasă 2015-10-28           799989.0  2018-07-19   
150923         âcasă 2017-10-31           367361.0  2018-07-19   
151014         âcasă 2018-06-20          3286339.0  2018-07-19   

                 Investor name  number_of_shares  percent_class  \
1                ANDREW_MORSON             406.0          2.056   
14               ANDREW_MORSON             406.0          2.056   
2     

In [187]:
# For each (company, investor, filing date) keep only the last row in the
# current row order -- the author's intent is to match every filing with its
# closest preceding deal.
# Author's note from an earlier run: [117438 rows x 12 columns] --> [38108 rows x 12 columns]
df = df.drop_duplicates(
    subset=['Company name', 'Investor name', 'filing_date'],
    keep='last',
)
print(df)

        Company name  Deal date  Investment amount filing_date  \
14      201 Software 2017-03-15            29999.0  2017-09-03   
15      201 Software 2017-03-15            29999.0  2017-09-03   
16      201 Software 2017-03-15            29999.0  2017-09-03   
17      201 Software 2017-03-15            29999.0  2017-09-03   
18      201 Software 2017-03-15            29999.0  2017-09-03   
...              ...        ...                ...         ...   
151028         âcasă 2018-06-20          3286339.0  2020-07-19   
151013         âcasă 2018-06-20          3286339.0  2018-07-19   
150789         âcasă 2015-10-28           799989.0  2017-07-19   
151029         âcasă 2018-06-20          3286339.0  2020-07-19   
151014         âcasă 2018-06-20          3286339.0  2018-07-19   

                 Investor name  number_of_shares  percent_class  \
14               ANDREW_MORSON             406.0          2.056   
15      CHRISTOPHER JOHN_LARGE            1524.0          7.716   
16    

In [188]:
# Write the matched deal/filing table to df.csv in the working directory.
df.to_csv(path_or_buf='df.csv')

## Split Always-Follow & Never-Follow Investors

In [196]:
# Never-Follow: (company, investor) pairs that occur exactly once in df.
set_n = (
    df[~df.duplicated(subset=['Company name', 'Investor name'], keep=False)]
    .sort_values(by=['Company name', 'Investor name', 'Deal date', 'filing_date'])
)
print(set_n)
# [25614 rows x 12 columns]

# Always-Follow: (company, investor) pairs that occur more than once in df.
set_a = (
    df[df.duplicated(subset=['Company name', 'Investor name'], keep=False)]
    .sort_values(by=['Company name', 'Investor name', 'Deal date', 'filing_date'])
)
print(set_a)
# [12494 rows x 12 columns]

        Company name  Deal date  Investment amount filing_date  \
14      201 Software 2017-03-15            29999.0  2017-09-03   
15      201 Software 2017-03-15            29999.0  2017-09-03   
16      201 Software 2017-03-15            29999.0  2017-09-03   
17      201 Software 2017-03-15            29999.0  2017-09-03   
18      201 Software 2017-03-15            29999.0  2017-09-03   
...              ...        ...                ...         ...   
151028         âcasă 2018-06-20          3286339.0  2020-07-19   
151013         âcasă 2018-06-20          3286339.0  2018-07-19   
150789         âcasă 2015-10-28           799989.0  2017-07-19   
151029         âcasă 2018-06-20          3286339.0  2020-07-19   
151014         âcasă 2018-06-20          3286339.0  2018-07-19   

                 Investor name  number_of_shares  percent_class  \
14               ANDREW_MORSON             406.0          2.056   
15      CHRISTOPHER JOHN_LARGE            1524.0          7.716   
16    

In [197]:
# Write both investor groups to the working directory.
set_n.to_csv(path_or_buf='set_n.csv')    # Never-Follow rows  -> set_n.csv
set_a.to_csv(path_or_buf='set_a.csv')    # Always-Follow rows -> set_a.csv

## Calculate TVPI of Never-Follow Investors / set_n.csv

In [198]:
# TVPI per row: (percent_total * Exit price) / (percent_class * Investment amount).
set_n['tvpi'] = (
    set_n['percent_total'].mul(set_n['Exit price'])
    .div(set_n['percent_class'].mul(set_n['Investment amount']))
)
# Display only: the rounded copy is shown as the cell output and is NOT stored,
# so set_n itself keeps the unrounded values.
set_n.round({'tvpi': 2})

Unnamed: 0,Company name,Deal date,Investment amount,filing_date,Investor name,number_of_shares,percent_class,percent_total,Current stage,Incorporation date,Exit date,Exit price,tvpi
14,201 Software,2017-03-15,29999.0,2017-09-03,ANDREW_MORSON,406.0,2.056,2.056,Dead,04/09/2015,2019-07-23,0.0,0.0
15,201 Software,2017-03-15,29999.0,2017-09-03,CHRISTOPHER JOHN_LARGE,1524.0,7.716,7.716,Dead,04/09/2015,2019-07-23,0.0,0.0
16,201 Software,2017-03-15,29999.0,2017-09-03,DARREN_FRIEND,406.0,2.056,2.056,Dead,04/09/2015,2019-07-23,0.0,0.0
17,201 Software,2017-03-15,29999.0,2017-09-03,DAVID_EDWARDS,482.0,2.440,2.440,Dead,04/09/2015,2019-07-23,0.0,0.0
18,201 Software,2017-03-15,29999.0,2017-09-03,JAKE_WORRALL,1498.0,7.584,7.584,Dead,04/09/2015,2019-07-23,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
151028,âcasă,2018-06-20,3286339.0,2020-07-19,_PUBAL CONSULT LLP,31402.0,1.249,1.209,Dead,19/07/2013,2020-03-18,0.0,0.0
151013,âcasă,2018-06-20,3286339.0,2018-07-19,_PUBALCONSULT LLP,31402.0,1.349,1.302,Dead,19/07/2013,2020-03-18,0.0,0.0
150789,âcasă,2015-10-28,799989.0,2017-07-19,_SEEDCAMP,54685.0,3.538,3.357,Dead,19/07/2013,2020-03-18,0.0,0.0
151029,âcasă,2018-06-20,3286339.0,2020-07-19,_SEEDCAMP III L.P.,64938.0,2.582,2.500,Dead,19/07/2013,2020-03-18,0.0,0.0


In [199]:
# Write set_n (as it stands at this point) to set_n.csv.
set_n.to_csv(path_or_buf='set_n.csv')

## Calculate TVPI of Always-Follow Investors / set_a.csv