In [1]:
#Import basic packages
import os
import numpy as np
import pandas as pd
import csv
import regex as re
import datetime as dt 

In [2]:
org_ori_df = pd.read_csv(R"d:/msc-project/data/pre-processed/organisations_preprocessed.csv",encoding='utf-8',
                         index_col=False,
                         parse_dates= ['founded_on','went_public_on','acquired_on','closed_on'])

In [3]:
org_ori_df.columns

Index(['org_uuid', 'name', 'legal_name', 'homepage_url', 'country_code',
       'state_code', 'region', 'city', 'address', 'postal_code', 'status',
       'short_description', 'category_list', 'category_groups_list',
       'num_funding_rounds', 'total_funding_usd', 'total_funding',
       'total_funding_currency_code', 'founded_on', 'last_funding_on',
       'closed_on', 'employee_count', 'email', 'phone', 'facebook_url',
       'linkedin_url', 'twitter_url', 'logo_url', 'primary_role', 'num_exits',
       'description', 'ipo_uuid', 'stock_exchange_symbol', 'stock_symbol',
       'went_public_on', 'share_price_usd', 'share_price',
       'share_price_currency_code', 'valuation_price_usd', 'valuation_price',
       'valuation_price_currency_code', 'money_raised_usd', 'money_raised',
       'money_raised_currency_code', 'acquisition_uuid', 'acquirer_uuid',
       'acquirer_name', 'acquirer_country_code', 'acquirer_state_code',
       'acquirer_region', 'acquirer_city', 'acquisition_type

In [4]:
funding_rounds_df = pd.read_csv(R"d:/msc-project/data/pre-processed/funding_rounds_preprocessed.csv",encoding='utf-8', 
            index_col=False,
           parse_dates=['announced_on'])

In [5]:
#create processed organisation
org_processing_df = org_ori_df[['org_uuid','name','status','founded_on']]

In [6]:
#create groupbyobject for investment_type
investment_type = funding_rounds_df.groupby('investment_type')

In [7]:
#filter to company with seed round 
org_processing_df = pd.merge(org_processing_df,
                             investment_type.get_group('seed')[['org_uuid','announced_on']],
                            on='org_uuid',
                            how = 'left')
#org_processing_df.dropna(subset=['announced_on'],inplace=True)
org_processing_df.rename(columns={'announced_on':'seed_date'},inplace=True)

In [8]:
len(org_processing_df)

176253

In [9]:
#filter to company with series_a
org_processing_df = pd.merge(org_processing_df,
                             investment_type.get_group('series_a')[['org_uuid','announced_on']],
                            on='org_uuid',
                            how = 'left')
#org_processing_df.dropna(subset=['announced_on'],inplace=True)
org_processing_df.rename(columns={'announced_on':'series_a_date'},inplace=True)
len(org_processing_df)

181779

In [10]:
#filter to company with series_b
org_processing_df = pd.merge(org_processing_df,
                             investment_type.get_group('series_b')[['org_uuid','announced_on']],
                            on='org_uuid',
                            how = 'left')
#org_processing_df.dropna(subset=['announced_on'],inplace=True)
org_processing_df.rename(columns={'announced_on':'series_b_date'},inplace=True)
len(org_processing_df)

184898

In [11]:
#combine with series_c information but do not drop companies without series_c
org_processing_df = pd.merge(org_processing_df,
                             investment_type.get_group('series_c')[['org_uuid','announced_on']],
                            on='org_uuid',
                            how = 'left')
org_processing_df.rename(columns={'announced_on':'series_c_date'},inplace=True)
len(org_processing_df)

186337

In [12]:
#drop duplicate company
org_processing_df.sort_values(by=['org_uuid','seed_date','series_a_date','series_b_date','series_c_date'],axis=0,
                              inplace=True,
                             ignore_index=True)
org_processing_df.drop_duplicates(subset=['org_uuid'],inplace=True)
len(org_processing_df)

153048

In [13]:
org_processing_df = pd.merge(org_processing_df,
                    org_ori_df[['org_uuid','num_funding_rounds']],
                    on='org_uuid',
                    how='left')
org_processing_df.head(3)

Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date,series_b_date,series_c_date,num_funding_rounds
0,000014da-0c46-b9cb-0941-3a93c027b119,Resilio,operating,2016-11-01,NaT,NaT,NaT,NaT,7.0
1,00002470-bff7-6226-5800-0ca1b3787b6f,Codementor,operating,2013-01-01,2013-08-05,NaT,NaT,NaT,5.0
2,000095de-8e2b-82f1-32a7-c222ba3d5682,Ultraprise Loan Technologies,operating,2001-01-01,NaT,NaT,NaT,NaT,1.0


In [14]:
org_processing_df.isnull().sum(axis=0)

org_uuid                   0
name                       0
status                     0
founded_on                 0
seed_date              80601
series_a_date         120138
series_b_date         137685
series_c_date         146305
num_funding_rounds         0
dtype: int64

In [15]:
#calculate duration from founded to acquired for company with acquired status
status = org_ori_df.groupby('status')
org_acquired_df = status.get_group('acquired')
org_acquired_df.dropna(subset = ['acquired_on'],inplace=True)
org_acquired_df['duration_to_acquisition_days'] = org_acquired_df['acquired_on'] - org_acquired_df['founded_on']

import datetime as dt 
org_acquired_df['duration_to_acquisition_years'] = (org_acquired_df['duration_to_acquisition_days'].dt.days/365).astype(float).round(decimals=1)

#filter inconsistent data
mask = org_acquired_df.duration_to_acquisition_years > 0
org_acquired_df_v1 = org_acquired_df[mask]

#merge with main data frame
org_processing_df = pd.merge(org_processing_df,
                            org_acquired_df_v1[['org_uuid','duration_to_acquisition_years']],
                            on='org_uuid',
                            how='left')
len(org_processing_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


153048

In [16]:
org_processing_df.head(3)

Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date,series_b_date,series_c_date,num_funding_rounds,duration_to_acquisition_years
0,000014da-0c46-b9cb-0941-3a93c027b119,Resilio,operating,2016-11-01,NaT,NaT,NaT,NaT,7.0,
1,00002470-bff7-6226-5800-0ca1b3787b6f,Codementor,operating,2013-01-01,2013-08-05,NaT,NaT,NaT,5.0,
2,000095de-8e2b-82f1-32a7-c222ba3d5682,Ultraprise Loan Technologies,operating,2001-01-01,NaT,NaT,NaT,NaT,1.0,


In [17]:
#calculate duration from founded to ipo for company with ipo status
status = org_ori_df.groupby('status')
org_ipo_df = status.get_group('ipo')
org_ipo_df.dropna(subset = ['went_public_on'],inplace=True)
org_ipo_df['duration_to_ipo_days'] = org_ipo_df['went_public_on'] - org_ipo_df['founded_on']

org_ipo_df['duration_to_ipo_years'] = (org_ipo_df['duration_to_ipo_days'].dt.days/365).astype(float).round(decimals=1)

mask = org_ipo_df['duration_to_ipo_years'] > 0
org_ipo_df_v1 = org_ipo_df[mask]

#merge with main data frame
org_processing_df = pd.merge(org_processing_df,
                            org_ipo_df_v1[['org_uuid','duration_to_ipo_years']],
                            on='org_uuid',
                            how='left')
len(org_processing_df)

#filter IPO duration for success definition 
#mask1 = org_ipo_df_v1['duration_to_ipo_years'] < 8
#mask2 = org_ipo_df_v1['duration_to_ipo_years'] > 4
#org_ipo_df_v1[mask1 & mask2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


153048

In [18]:
org_processing_df.head(3)

Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date,series_b_date,series_c_date,num_funding_rounds,duration_to_acquisition_years,duration_to_ipo_years
0,000014da-0c46-b9cb-0941-3a93c027b119,Resilio,operating,2016-11-01,NaT,NaT,NaT,NaT,7.0,,
1,00002470-bff7-6226-5800-0ca1b3787b6f,Codementor,operating,2013-01-01,2013-08-05,NaT,NaT,NaT,5.0,,
2,000095de-8e2b-82f1-32a7-c222ba3d5682,Ultraprise Loan Technologies,operating,2001-01-01,NaT,NaT,NaT,NaT,1.0,,


In [19]:
#calculate duration from founded to closed for company with closed status
org_closed_df = status.get_group('closed')
org_closed_df.dropna(subset = ['closed_on'],inplace=True)
org_closed_df['duration_to_closed_days'] = org_closed_df['closed_on'] - org_closed_df['founded_on']

org_closed_df['duration_to_closed_years'] = (org_closed_df['duration_to_closed_days'].dt.days/365).astype(float).round(decimals=1)

mask = org_closed_df['duration_to_closed_years'] > 0
org_closed_df_v1 = org_closed_df[mask]

#merge with main data frame
org_processing_df = pd.merge(org_processing_df,
                            org_closed_df_v1[['org_uuid','duration_to_closed_years']],
                            on='org_uuid',
                            how='left')
len(org_processing_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


153048

In [20]:
org_processing_df.head(3)

Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date,series_b_date,series_c_date,num_funding_rounds,duration_to_acquisition_years,duration_to_ipo_years,duration_to_closed_years
0,000014da-0c46-b9cb-0941-3a93c027b119,Resilio,operating,2016-11-01,NaT,NaT,NaT,NaT,7.0,,,
1,00002470-bff7-6226-5800-0ca1b3787b6f,Codementor,operating,2013-01-01,2013-08-05,NaT,NaT,NaT,5.0,,,
2,000095de-8e2b-82f1-32a7-c222ba3d5682,Ultraprise Loan Technologies,operating,2001-01-01,NaT,NaT,NaT,NaT,1.0,,,


In [21]:
#calculate duration from founded to 31-12-2020 for company with operating status
org_operating_df = status.get_group('operating')
org_operating_df['duration_to_current_days'] = pd.Timestamp('2020-12-31') - org_operating_df['founded_on']

org_operating_df['duration_to_current_years'] = (org_operating_df['duration_to_current_days'].dt.days/365).astype(float).round(decimals=1)

#merge with main data frame
org_processing_df = pd.merge(org_processing_df,
                            org_operating_df[['org_uuid','duration_to_current_years']],
                            on='org_uuid',
                            how='left')
len(org_processing_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


153048

In [22]:
org_processing_df.head(3)

Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date,series_b_date,series_c_date,num_funding_rounds,duration_to_acquisition_years,duration_to_ipo_years,duration_to_closed_years,duration_to_current_years
0,000014da-0c46-b9cb-0941-3a93c027b119,Resilio,operating,2016-11-01,NaT,NaT,NaT,NaT,7.0,,,,4.2
1,00002470-bff7-6226-5800-0ca1b3787b6f,Codementor,operating,2013-01-01,2013-08-05,NaT,NaT,NaT,5.0,,,,8.0
2,000095de-8e2b-82f1-32a7-c222ba3d5682,Ultraprise Loan Technologies,operating,2001-01-01,NaT,NaT,NaT,NaT,1.0,,,,20.0


In [23]:
#drop companies with incomplete information on duration
org_processing_df.dropna (subset=['duration_to_acquisition_years','duration_to_ipo_years',
                                'duration_to_closed_years','duration_to_current_years'],
                            how='all',
                            inplace=True)
len(org_processing_df)

147840

In [24]:
org_processing_df.head(5)

Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date,series_b_date,series_c_date,num_funding_rounds,duration_to_acquisition_years,duration_to_ipo_years,duration_to_closed_years,duration_to_current_years
0,000014da-0c46-b9cb-0941-3a93c027b119,Resilio,operating,2016-11-01,NaT,NaT,NaT,NaT,7.0,,,,4.2
1,00002470-bff7-6226-5800-0ca1b3787b6f,Codementor,operating,2013-01-01,2013-08-05,NaT,NaT,NaT,5.0,,,,8.0
2,000095de-8e2b-82f1-32a7-c222ba3d5682,Ultraprise Loan Technologies,operating,2001-01-01,NaT,NaT,NaT,NaT,1.0,,,,20.0
3,0000d497-c93a-eea3-eeb0-a943dfb4f71e,AutoOffer,operating,2013-01-01,2015-11-17,NaT,NaT,NaT,2.0,,,,8.0
4,0001a8cc-0cdc-4a30-b4d3-da1b425069e1,SnapClarity,acquired,2016-01-01,NaT,NaT,NaT,NaT,3.0,4.5,,,


In [25]:
#Calculate duration from founded to funding rounds
funding_rounds = ['seed','series_a','series_b','series_c']

for invest_round in funding_rounds:
    new_column = str('duration_to_' + invest_round + '_years')
    exist_column = str (invest_round+'_date')
    org_processing_df[new_column] = ((org_processing_df[exist_column] - org_processing_df ['founded_on']).dt.days/365).astype(float).round(2)

org_processing_df.head(3)

Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date,series_b_date,series_c_date,num_funding_rounds,duration_to_acquisition_years,duration_to_ipo_years,duration_to_closed_years,duration_to_current_years,duration_to_seed_years,duration_to_series_a_years,duration_to_series_b_years,duration_to_series_c_years
0,000014da-0c46-b9cb-0941-3a93c027b119,Resilio,operating,2016-11-01,NaT,NaT,NaT,NaT,7.0,,,,4.2,,,,
1,00002470-bff7-6226-5800-0ca1b3787b6f,Codementor,operating,2013-01-01,2013-08-05,NaT,NaT,NaT,5.0,,,,8.0,0.59,,,
2,000095de-8e2b-82f1-32a7-c222ba3d5682,Ultraprise Loan Technologies,operating,2001-01-01,NaT,NaT,NaT,NaT,1.0,,,,20.0,,,,


In [26]:
org_processing_df.columns

Index(['org_uuid', 'name', 'status', 'founded_on', 'seed_date',
       'series_a_date', 'series_b_date', 'series_c_date', 'num_funding_rounds',
       'duration_to_acquisition_years', 'duration_to_ipo_years',
       'duration_to_closed_years', 'duration_to_current_years',
       'duration_to_seed_years', 'duration_to_series_a_years',
       'duration_to_series_b_years', 'duration_to_series_c_years'],
      dtype='object')

In [27]:
# delete companies with series C funding during warm-up window
mask1 = org_processing_df['duration_to_series_c_years'] > 4
mask2 = org_processing_df['duration_to_series_c_years'].isna()
org_processing_df = org_processing_df[mask1 | mask2]
len(org_processing_df)

146062

In [28]:
# delete companies acquired during warm-up window
mask1 = org_processing_df['duration_to_acquisition_years'] > 4
mask2 = org_processing_df['duration_to_acquisition_years'].isna()
org_processing_df = org_processing_df[mask1 | mask2]
len(org_processing_df)

143297

In [29]:
# delete companies have IPO during warm-up window
mask1 = org_processing_df['duration_to_ipo_years'] > 4
mask2 = org_processing_df['duration_to_ipo_years'].isna()
org_processing_df = org_processing_df[mask1 | mask2]
len(org_processing_df)

142245

In [30]:
#assign dependent variable, 1 for success and 0 for non-success
org_processing_df.reset_index(drop=True,inplace=True)
end = len(org_processing_df)
outcome = [0]*end

for i in range (0,end):
    company = org_processing_df.loc[i]
    #set success condition
    #company IPOed in simulation window
    cond1 = company['duration_to_ipo_years'] <= 7
    #company acquired in simulation window
    cond2 = company['duration_to_acquisition_years'] <= 7  
    #company got seed funding in simulation window
    cond3 = company['duration_to_seed_years'] <= 7 and company['duration_to_seed_years'] > 4 
    #company got funding series_a in simulation
    cond4 = company['duration_to_series_a_years'] <= 7 and company['duration_to_series_a_years'] > 4
    #company got funding series_b in simulation window
    cond5 = company['duration_to_series_b_years'] <= 7 and company['duration_to_series_b_years'] > 4
    #company got funding series_c in simulation window
    cond6 = company['duration_to_series_c_years'] <= 7
    
    #for company with closed status
    if company['status'] == 'closed':
        #check if company been acquired or IPOed in simulation window. Assign 1 if yes, maintain 0 if not.
        if cond1 or cond2:
            outcome[i] = 1
    
    #for company with ipo, acquired and, operating status        
    else:
        #assign 1 if company meet any of success criteria above and maintain 0 if not. 
        if cond1 or cond2 or cond3 or cond4 or cond5 or cond6:
            outcome[i] = 1
            
org_processing_df['outcome'] = outcome

In [31]:
org_processing_df['outcome'].value_counts()

0    121666
1     20579
Name: outcome, dtype: int64

In [32]:
success = (org_processing_df[org_processing_df.outcome == 1]).groupby('status').size().astype(int)
failure = (org_processing_df[org_processing_df.outcome == 0].groupby('status').size()).astype(int)

pd.concat ([success,failure], axis=1,keys = ['success','failure'])

Unnamed: 0_level_0,success,failure
status,Unnamed: 1_level_1,Unnamed: 2_level_1
acquired,5104.0,3202
ipo,1070.0,995
operating,14405.0,111852
closed,,5617


In [33]:
#save processed dataframe with outcome
org_processing_df.to_csv(R"d:/msc-project/data/final/outcome_final.csv",encoding='utf-8',index=False)