In [1]:
import pandas as pd
import numpy as np

## import and process deals.csv

In [2]:
# Load the deals export and shape it into a company-indexed table.
# The path uses a forward slash: the previous 'investmentsUK\deals.csv' only
# worked because '\d' is an unrecognised escape sequence (which recent Python
# versions warn about) and it resolved on Windows only. pandas accepts '/' on
# every platform.
deals = (
    pd.read_csv('investmentsUK/deals.csv')
    # The two URL columns and the 'Amount raised' figure are treated as not needed here.
    .drop(columns=['Beauhurst company URL', 'Beauhurst deal URL', 'Amount raised (converted to GBP)'])
    # Shorten the verified-amount column name for later use.
    .rename(columns={'Verified investment amount (converted to GBP)': 'Investment amount'})
    # Discard every row that has a missing value in any remaining column.
    .dropna()
    # Index by company name and order the rows by that index.
    .set_index('Company name')
    .sort_index()
)
print(deals)

                   Deal date  Investment amount
Company name                                   
10 Digital Media  2015-11-16          3398454.0
1066 Airsoft      2016-05-31            23668.0
14M Genomics      2014-12-16          5000000.0
1855              2017-06-30            20000.0
1st Impression    2013-11-22            75005.0
...                      ...                ...
yWasteFood        2016-09-09            50000.0
âcasă             2014-07-21           157791.0
âcasă             2018-06-20          3286339.0
âcasă             2015-10-28           799989.0
âcasă             2017-10-31           367361.0

[7800 rows x 2 columns]


## import and process liquidity.csv

In [3]:
# Load the liquidity export and shape it into a company-indexed table.
# The path uses a forward slash: the previous 'investmentsUK\liquidity.csv'
# depended on '\l' being an unrecognised escape sequence (which recent Python
# versions warn about) and resolved on Windows only.
liquidity = (
    pd.read_csv('investmentsUK/liquidity.csv')
    # The URL and the Companies House number are treated as not needed here.
    .drop(columns=['Beauhurst URL', 'Companies House Number'])
    # Shorter column names for the exit date and the lifecycle stage.
    .rename(columns={
        "Date of the company's exit or death": "Exit date",
        "Current Stage of Evolution": "Current stage",
    })
    # Index by company name and order the rows by that index.
    # Rows with missing values are kept at this stage.
    .set_index('Company name')
    .sort_index()
)
print(liquidity)

                 Incorporation date Current stage   Exit date  Exit price
Company name                                                             
10 Digital Media                NaN          Dead  27/11/2018         0.0
10 Minutes With          19/03/2013          Dead  29/06/2017         0.0
1066 Airsoft             13/05/2014          Dead  27/02/2019         0.0
14M Genomics             09/01/2014          Dead  21/03/2016         0.0
1855                     27/02/2013          Dead  20/02/2020         0.0
...                             ...           ...         ...         ...
wayve                    01/10/2013          Dead  16/08/2017         0.0
whatleads.to             10/12/2013          Dead  16/03/2017         0.0
wnDirect                 14/07/2011        Exited  04/01/2017         NaN
yWasteFood               27/11/2015          Dead  20/11/2018         0.0
âcasă                    19/07/2013          Dead  18/03/2020         0.0

[4218 rows x 4 columns]


## left join deals.csv with liquidity.csv

In [9]:
# Left join the processed deals onto the processed liquidity data, matching on
# the shared 'Company name' index level. `how='left'` is spelled out because
# pd.merge defaults to an inner join, which contradicted the stated intent.
# Deals without a liquidity record come through with missing liquidity fields
# and are removed by the dropna below.
startups = pd.merge(deals, liquidity, how='left', on='Company name')

# A dead startup is recorded with an exit price of zero, overriding whatever
# value (including a missing one) the liquidity data held.
is_dead = startups['Current stage'] == "Dead"
startups.loc[is_dead, 'Exit price'] = 0

# Keep only rows that now have an exit price.
startups = startups.dropna(subset=['Exit price'])
print(startups)

                   Deal date  Investment amount Incorporation date  \
Company name                                                         
10 Digital Media  2015-11-16          3398454.0                NaN   
1066 Airsoft      2016-05-31            23668.0         13/05/2014   
14M Genomics      2014-12-16          5000000.0         09/01/2014   
1855              2017-06-30            20000.0         27/02/2013   
1st Impression    2013-11-22            75005.0         26/08/2009   
...                      ...                ...                ...   
yWasteFood        2016-09-09            50000.0         27/11/2015   
âcasă             2014-07-21           157791.0         19/07/2013   
âcasă             2018-06-20          3286339.0         19/07/2013   
âcasă             2015-10-28           799989.0         19/07/2013   
âcasă             2017-10-31           367361.0         19/07/2013   

                 Current stage   Exit date  Exit price  
Company name                    

In [10]:
# Write the merged startups table to 'startups.csv' in the working directory.
# to_csv keeps its defaults, so the index is written as well.
startups.to_csv(path_or_buf='startups.csv')

## import and process shares.csv

In [11]:
# Load the shareholdings export and reduce it to company, share count and a
# combined investor name.
# The path uses a forward slash: the previous 'investmentsUK\shares.csv'
# depended on '\s' being an unrecognised escape sequence (which recent Python
# versions warn about) and resolved on Windows only.
shares = (
    pd.read_csv('investmentsUK/shares.csv')
    # Keep only the columns used below, in this order.
    .loc[:, ['Company name', 'first_name', 'last_name', 'number_of_shares']]
    # A row needs both a share count and a last name to be usable.
    .dropna(subset=['number_of_shares', 'last_name'])
    # A missing first name becomes an empty string so the concatenation below
    # does not yield NaN. Such investors end up with a leading '_' in their name.
    .assign(first_name=lambda df: df['first_name'].fillna(''))
    # Join first and last name with '_' into a single 'Investor name' column.
    .assign(**{'Investor name': lambda df: df['first_name'].str.cat(df['last_name'], sep='_')})
    .drop(columns=['first_name', 'last_name'])
    # Remove rows that are identical in every remaining column (first one kept).
    .drop_duplicates()
)

# Index by investor name, then order by investor name and, within an investor,
# by number of shares.
investors = shares.set_index('Investor name').sort_values(by=['Investor name', 'number_of_shares'])
print(investors)

                                         Company name  number_of_shares
Investor name                                                          
A DAVID_MCMEEKIN                Pulmagen Therapeutics           50000.0
A DAVID_MCMEEKIN                Pulmagen Therapeutics           73233.0
A DONALD_MACDONALD                           Critiqom           15178.0
A E_BODY                                   UKRD Group           43644.0
A F_MORRISON                                     Volo          678033.0
...                                               ...               ...
_ZYNGA GAMES INTERNATIONAL LTD          NaturalMotion         8314837.0
_ZYNGA GAMES INTERNATIONAL LTD          NaturalMotion        11510310.0
_ZYNGA GAMES INTERNATIONAL LTD          NaturalMotion        21120925.0
_ZYNGA GAMES INTERNATIONAL LTD          NaturalMotion        44646072.0
_ZYNGA GAMES INTERNATIONAL LTD          NaturalMotion        47508454.0

[87273 rows x 2 columns]


In [99]:
# Write the investor table to 'investors.csv' in the working directory.
# to_csv keeps its defaults, so the index is written as well.
investors.to_csv(r'investors.csv')    #export investors.csv