In [1]:
#Import basic packages
import os
import numpy as np
import pandas as pd
import csv
import regex as re
import datetime as dt 

In [2]:
org_ori_df = pd.read_csv(R"d:/msc-project/data/pre-processed/organisations_preprocessed.csv",encoding='utf-8',
                         index_col='org_uuid',
                         parse_dates= ['founded_on','went_public_on','acquired_on','closed_on'])

In [3]:
org_ori_df.columns

Index(['name', 'legal_name', 'homepage_url', 'country_code', 'state_code',
       'region', 'city', 'address', 'postal_code', 'status',
       'short_description', 'category_list', 'category_groups_list',
       'num_funding_rounds', 'total_funding_usd', 'total_funding',
       'total_funding_currency_code', 'founded_on', 'last_funding_on',
       'closed_on', 'employee_count', 'email', 'phone', 'facebook_url',
       'linkedin_url', 'twitter_url', 'logo_url', 'primary_role', 'num_exits',
       'description', 'ipo_uuid', 'stock_exchange_symbol', 'stock_symbol',
       'went_public_on', 'share_price_usd', 'share_price',
       'share_price_currency_code', 'valuation_price_usd', 'valuation_price',
       'valuation_price_currency_code', 'money_raised_usd', 'money_raised',
       'money_raised_currency_code', 'acquisition_uuid', 'acquirer_uuid',
       'acquirer_name', 'acquirer_country_code', 'acquirer_state_code',
       'acquirer_region', 'acquirer_city', 'acquisition_type', 'acquired

In [4]:
funding_rounds_df = pd.read_csv(R"d:/msc-project/data/pre-processed/funding_rounds_preprocessed.csv",encoding='utf-8', 
            index_col='funding_round_uuid',
           parse_dates=['announced_on'])

In [5]:
funding_rounds_df.columns

Index(['name', 'country_code', 'state_code', 'region', 'city',
       'investment_type', 'announced_on', 'raised_amount_usd', 'raised_amount',
       'raised_amount_currency_code', 'post_money_valuation_usd',
       'post_money_valuation', 'post_money_valuation_currency_code',
       'investor_count', 'org_uuid', 'org_name', 'lead_investor_uuids'],
      dtype='object')

In [6]:
len(funding_rounds_df)

364336

In [7]:
funding_rounds_df.investment_type.value_counts()

seed                     102870
series_unknown            61708
series_a                  42105
series_b                  21084
grant                     20373
angel                     18847
pre_seed                  17224
debt_financing            14249
private_equity            11158
series_c                   9801
convertible_note           7386
post_ipo_equity            6416
equity_crowdfunding        6315
non_equity_assistance      5381
undisclosed                5296
series_d                   4126
corporate_round            3335
series_e                   1715
post_ipo_debt              1420
product_crowdfunding       1025
secondary_market            784
initial_coin_offering       688
series_f                    611
series_g                    210
series_h                     89
post_ipo_secondary           89
series_i                     19
series_j                     12
Name: investment_type, dtype: int64

In [8]:
#create processed organisation
org_processing_df = org_ori_df[['name','status','founded_on']]

In [9]:
#create groupbyobject for investment_type
investment_type = funding_rounds_df.groupby('investment_type')

In [10]:
#create df of company with seed funding
seed_company_df = investment_type.get_group('seed')[['org_uuid','announced_on']]
seed_company_df.dropna(subset=['announced_on'],inplace=True)
seed_company_df.rename(columns={'announced_on':'seed_date'},inplace=True)
len(seed_company_df)

102870

In [11]:
#drop duplicate of companies with more than one seed date. keep only the first seed date.
seed_company_df.sort_values(by=['seed_date'],inplace=True)
seed_company_df_v1 = seed_company_df.drop_duplicates(subset=['org_uuid'],keep='first')

In [12]:
len(seed_company_df_v1)

78970

In [13]:
seed_company_df_v1.head(3)

Unnamed: 0_level_0,org_uuid,seed_date
funding_round_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1
29c62cb5-bdd3-407a-a60f-a235fe819e90,65d0628c-d1d8-4699-9e61-6ef3200839ab,1950-09-20
24093d19-1f1d-ae85-64fe-557d2d08b368,1e4f199c-363b-451b-a164-f94571075ee5,1968-07-31
60fa2775-2c45-44d2-a649-bba7bf89338e,2e2cbc7a-bb1a-bd6d-3c6d-92a6def15d0a,1969-01-01


In [14]:
#create df of company with series_a funding
series_a_company_df = investment_type.get_group('series_a')[['org_uuid','announced_on']]
series_a_company_df.dropna(subset=['announced_on'],inplace=True)
series_a_company_df.rename(columns={'announced_on':'series_a_date'},inplace=True)
len(series_a_company_df)

42105

In [15]:
#drop companies with more than one series a funding. keep only the first series a date
series_a_company_df.sort_values(by=['series_a_date'],inplace=True)
series_a_company_df_v1 = series_a_company_df.drop_duplicates(subset=['org_uuid'],keep='first')
len(series_a_company_df_v1)

37375

In [16]:
series_a_company_df_v1.head(3)

Unnamed: 0_level_0,org_uuid,series_a_date
funding_round_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1
9184700e-351d-4677-b240-ad2cc3b59d43,79ff0cd7-b392-41e5-80df-870bb2b40d96,1980-09-04
dba36c7a-4142-c372-d426-7784a6de74ee,27bb6fe7-0a5d-01c6-30af-55bd8e94c8e6,1981-09-01
46c353a8-2491-70cc-4b6a-b89a522fefdc,322eee3f-a036-651f-5754-0f8759374699,1982-06-01


In [17]:
#create df of company with series_uknown funding. Possibility of series to be series a
series_unknown_company_df = investment_type.get_group('series_unknown')[['org_uuid','announced_on']]
series_unknown_company_df.dropna(subset=['announced_on'],inplace=True)
series_unknown_company_df.rename(columns={'announced_on':'series_unknown_date'},inplace=True)
len(series_unknown_company_df)

61708

In [18]:
#drop companies with more than one series_unknown funding. keep only the first funding
series_unknown_company_df.sort_values(by=['series_unknown_date'],inplace=True)
series_unknown_company_df_v1 = series_unknown_company_df.drop_duplicates(subset=['org_uuid'],keep='first')
len(series_unknown_company_df_v1)

45102

In [21]:
#merge series a with series unknown
first_funding_df = pd.merge(series_a_company_df_v1,
                           series_unknown_company_df_v1,
                           on='org_uuid',
                           how='left')

In [22]:
#fill NaN series_a date using series_unknown date. Assume series_unknown as series_a
first_funding_df['series_a_date'] = first_funding_df['series_a_date'].fillna(first_funding_df['series_unknown_date'])

In [23]:
#drop series_uknown date
first_funding_df.drop(columns=['series_unknown_date'],inplace=True)

In [24]:
first_funding_df.head(3)

Unnamed: 0,org_uuid,series_a_date
0,79ff0cd7-b392-41e5-80df-870bb2b40d96,1980-09-04
1,27bb6fe7-0a5d-01c6-30af-55bd8e94c8e6,1981-09-01
2,322eee3f-a036-651f-5754-0f8759374699,1982-06-01


In [25]:
#merge with seed fund companies
first_funding_df_v1 = pd.merge (seed_company_df_v1,
                                first_funding_df,
                                on='org_uuid',
                                how='outer')
len(first_funding_df_v1)

104464

In [26]:
first_funding_df_v1.head(3)

Unnamed: 0,org_uuid,seed_date,series_a_date
0,65d0628c-d1d8-4699-9e61-6ef3200839ab,1950-09-20,NaT
1,1e4f199c-363b-451b-a164-f94571075ee5,1968-07-31,NaT
2,2e2cbc7a-bb1a-bd6d-3c6d-92a6def15d0a,1969-01-01,NaT


In [27]:
#drop any company with no first funding
first_funding_df_v1.dropna(subset=['seed_date','series_a_date',],
                           how='all',
                           inplace=True)
len(first_funding_df_v1)

104464

In [28]:
#drop any duplicated round, if any.
first_funding_df_v1.sort_values(by=['seed_date','series_a_date'],inplace=True)
first_funding_df_v1.drop_duplicates(subset=['org_uuid'],keep='first',inplace=True)
len(first_funding_df_v1)

104464

In [29]:
first_funding_df_v1.isnull().sum(axis=0)

org_uuid             0
seed_date        25494
series_a_date    67089
dtype: int64

In [30]:
#merge with main processing df
org_processing_df_v1 = pd.merge(org_processing_df,
                                first_funding_df_v1,
                                on='org_uuid',
                                how='left')
len(org_processing_df_v1)

153048

In [31]:
org_processing_df_v1.head(3)

Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date
0,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,acquired,2005-06-01,NaT,2005-10-01
1,5f2b40b8-d1b3-d323-d81a-b7a8e89553d0,Digg,acquired,2004-10-11,NaT,2005-10-28
2,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,ipo,2004-02-04,NaT,2005-05-01


In [33]:
#drop companies with no funding round information
org_processing_df_v1.dropna(subset=['seed_date','series_a_date'],how='all',inplace=True)
len(org_processing_df_v1)

93782

In [35]:
org_processing_df_v1['first_fund_date'] = org_processing_df_v1[['seed_date','series_a_date']].min(axis=1)

In [36]:
org_processing_df_v1.head(3)

Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date,first_fund_date
0,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,acquired,2005-06-01,NaT,2005-10-01,2005-10-01
1,5f2b40b8-d1b3-d323-d81a-b7a8e89553d0,Digg,acquired,2004-10-11,NaT,2005-10-28,2005-10-28
2,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,ipo,2004-02-04,NaT,2005-05-01,2005-05-01


In [37]:
org_processing_df_v1.isnull().sum(axis=0)

org_uuid               0
name                   0
status                 0
founded_on             0
seed_date          21335
series_a_date      60872
first_fund_date        0
dtype: int64

In [38]:
#add series_b date
org_processing_df_v2 = pd.merge(org_processing_df_v1,
                             investment_type.get_group('series_b')[['org_uuid','announced_on']],
                            on='org_uuid',
                            how = 'left')
#org_processing_df.dropna(subset=['announced_on'],inplace=True)
org_processing_df_v2.rename(columns={'announced_on':'series_b_date'},inplace=True)
len(org_processing_df_v2)

95755

In [39]:
org_processing_df_v2.head(3)

Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date,first_fund_date,series_b_date
0,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,acquired,2005-06-01,NaT,2005-10-01,2005-10-01,2007-01-01
1,5f2b40b8-d1b3-d323-d81a-b7a8e89553d0,Digg,acquired,2004-10-11,NaT,2005-10-28,2005-10-28,2006-12-01
2,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,ipo,2004-02-04,NaT,2005-05-01,2005-05-01,2006-04-01


In [40]:
#add series c date
org_processing_df_v3 = pd.merge(org_processing_df_v2,
                                investment_type.get_group('series_c')[['org_uuid','announced_on']],
                                on='org_uuid',
                                how = 'left')
org_processing_df_v3.rename(columns={'announced_on':'series_c_date'},inplace=True)
len(org_processing_df_v3)

96722

In [41]:
org_processing_df_v3.head(3)

Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date,first_fund_date,series_b_date,series_c_date
0,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,acquired,2005-06-01,NaT,2005-10-01,2005-10-01,2007-01-01,2008-05-19
1,5f2b40b8-d1b3-d323-d81a-b7a8e89553d0,Digg,acquired,2004-10-11,NaT,2005-10-28,2005-10-28,2006-12-01,2008-09-24
2,5f2b40b8-d1b3-d323-d81a-b7a8e89553d0,Digg,acquired,2004-10-11,NaT,2005-10-28,2005-10-28,2006-12-01,2016-09-13


In [42]:
#drop duplicate company
org_processing_df_v3.sort_values(by=['org_uuid','seed_date','series_a_date','series_b_date','series_c_date'],axis=0,
                                 inplace=True,
                                 ignore_index=True)
org_processing_df_v3.drop_duplicates(subset=['org_uuid'],inplace=True)
len(org_processing_df_v3)

93782

In [43]:
org_processing_df_v4 = pd.merge(org_processing_df_v3,
                                org_ori_df[['num_funding_rounds']],
                                on='org_uuid',
                                how='left')
org_processing_df_v4.head(3)

Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date,first_fund_date,series_b_date,series_c_date,num_funding_rounds
0,00002470-bff7-6226-5800-0ca1b3787b6f,Codementor,operating,2013-01-01,2013-08-05,NaT,2013-08-05,NaT,NaT,5.0
1,0000d497-c93a-eea3-eeb0-a943dfb4f71e,AutoOffer,operating,2013-01-01,2015-11-17,NaT,2015-11-17,NaT,NaT,2.0
2,0002aa63-f21b-4c54-9495-895a1d09e0d4,86 Repairs,operating,2018-01-01,2019-05-14,NaT,2019-05-14,NaT,NaT,1.0


In [44]:
org_processing_df_v4.isnull().sum(axis=0)

org_uuid                  0
name                      0
status                    0
founded_on                0
seed_date             21335
series_a_date         60872
first_fund_date           0
series_b_date         80627
series_c_date         88166
num_funding_rounds        0
dtype: int64

In [61]:
#create dataframe for acquired companies
org_acquired_df = org_processing_df_v4[org_processing_df_v4['status'] == 'acquired']
len(org_acquired_df)

8932

In [52]:
org_acquired_df.head(3)

Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date,first_fund_date,series_b_date,series_c_date,num_funding_rounds
7,000607fc-cea0-535c-6324-e83ba07c8cc7,BioVigilant Systems,acquired,2005-01-01,NaT,2006-05-23,2006-05-23,NaT,NaT,1.0
14,000cff2d-58df-0a9a-97ea-6b73ed9ec601,Termaxia,acquired,2015-01-01,2016-06-22,NaT,2016-06-22,NaT,NaT,2.0
52,002d06bc-916e-b5ae-68a4-13b8fc40055b,Smart Device Media,acquired,2010-01-01,2011-03-08,NaT,2011-03-08,NaT,NaT,1.0


In [63]:
#merge with acquired_on date from original data frame
org_acquired_df_v1 = pd.merge(org_acquired_df,
                             org_ori_df['acquired_on'],
                             on='org_uuid',
                             how='left')
len(org_acquired_df)

8932

In [64]:
#drop company with no acquisition date
org_acquired_df_v1.dropna(subset=['acquired_on'],inplace=True)
len(org_acquired_df_v1)

7717

In [66]:
#calculate the duration from founded to acquisition and drop inconsistent data
org_acquired_df_v1['duration_to_acquisition_days'] = org_acquired_df_v1['acquired_on'] - org_acquired_df_v1['founded_on']

import datetime as dt 
org_acquired_df_v1['duration_to_acquisition_years'] = (org_acquired_df_v1['duration_to_acquisition_days'].dt.days/365).astype(float).round(decimals=1)

#drop company with acquire date before founded date
mask = org_acquired_df_v1.duration_to_acquisition_years > 0
org_acquired_df_v2 = org_acquired_df_v1[mask]
len(org_acquired_df_v2)

7690

In [82]:
#merge with main processing df
org_processing_df_v5 = pd.concat([org_acquired_df_v2[list(org_processing_df_v4.columns)],
                                  org_processing_df_v4[org_processing_df_v4['status'] != 'acquired']],
                                 axis ='index',
                                 ignore_index=True)
len(org_processing_df_v5)

92540

In [83]:
org_processing_df_v5.head(3)

Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date,first_fund_date,series_b_date,series_c_date,num_funding_rounds
0,000607fc-cea0-535c-6324-e83ba07c8cc7,BioVigilant Systems,acquired,2005-01-01,NaT,2006-05-23,2006-05-23,NaT,NaT,1.0
1,002d06bc-916e-b5ae-68a4-13b8fc40055b,Smart Device Media,acquired,2010-01-01,2011-03-08,NaT,2011-03-08,NaT,NaT,1.0
2,002e7bde-3d8c-8607-11f8-84181e531b27,Blurble,acquired,2011-01-01,2013-01-01,NaT,2013-01-01,NaT,NaT,1.0


In [84]:
#create df for ipoed companies
org_ipo_df = org_processing_df_v5[org_processing_df_v5['status'] == 'ipo']
len(org_ipo_df)

1265

In [86]:
#merge with announced date from original data frame and drop companies with no went_public_on date
org_ipo_df_v1 = pd.merge(org_ipo_df,
                         org_ori_df['went_public_on'],
                         on='org_uuid',
                         how='left')
org_ipo_df_v1.dropna(subset=['went_public_on'],inplace=True)
len(org_ipo_df_v1)

1265

In [87]:
org_ipo_df_v1.head(3)

Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date,first_fund_date,series_b_date,series_c_date,num_funding_rounds,went_public_on
0,002cb90c-69c4-d9fd-4316-4a1f95286e21,Sino-Global Shipping America,ipo,2001-01-01,2019-07-29,NaT,2019-07-29,NaT,NaT,2.0,2008-05-21
1,0055e28d-cb67-4413-8fab-be7b886b6b60,Mechanist Games,ipo,2011-04-07,NaT,2015-01-21,2015-01-21,NaT,NaT,2.0,2016-08-16
2,007ee3e7-8511-44e6-a1c8-0326ede9b1fc,51liucheng.com,ipo,2011-05-26,NaT,2014-10-31,2014-10-31,NaT,NaT,7.0,2016-02-22


In [88]:
#calculate duration from founded to IPO date and drop companies with inconsistent data
org_ipo_df_v1['duration_to_ipo_days'] = org_ipo_df_v1['went_public_on'] - org_ipo_df_v1['founded_on']
org_ipo_df_v1['duration_to_ipo_years'] = (org_ipo_df_v1['duration_to_ipo_days'].dt.days/365).astype(float).round(decimals=1)

mask = org_ipo_df_v1['duration_to_ipo_years'] > 0
org_ipo_df_v2 = org_ipo_df_v1[mask]
len(org_ipo_df_v2)

1243

In [91]:
#concatenate with main data frame
org_processing_df_v6 = pd.concat([org_ipo_df_v2[list(org_processing_df_v5.columns)],
                                 org_processing_df_v5[org_processing_df_v5['status']!='ipo']],
                                axis=0,
                                ignore_index=True)
len(org_processing_df_v6)

92518

In [92]:
org_processing_df_v6.head(3)

Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date,first_fund_date,series_b_date,series_c_date,num_funding_rounds
0,002cb90c-69c4-d9fd-4316-4a1f95286e21,Sino-Global Shipping America,ipo,2001-01-01,2019-07-29,NaT,2019-07-29,NaT,NaT,2.0
1,0055e28d-cb67-4413-8fab-be7b886b6b60,Mechanist Games,ipo,2011-04-07,NaT,2015-01-21,2015-01-21,NaT,NaT,2.0
2,007ee3e7-8511-44e6-a1c8-0326ede9b1fc,51liucheng.com,ipo,2011-05-26,NaT,2014-10-31,2014-10-31,NaT,NaT,7.0


In [93]:
#create data frame for closed companies
org_closed_df = org_processing_df_v6[org_processing_df_v6['status'] == 'closed']
len(org_closed_df)

6127

In [109]:
org_closed_df_v1 = pd.merge(org_closed_df,
                           org_ori_df[['closed_on','acquired_on']],
                           on='org_uuid',
                           how='left')
#org_closed_df_v1.dropna(subset=['closed_on'],inplace=True)
len(org_closed_df_v1)

6127

In [115]:
#get acquired companies from companies label as close. The companies are closed after acquired
org_acquired_df_v3 = org_closed_df_v1[org_closed_df_v1['acquired_on'].notna()]
len(org_acquired_df_v3)

431

In [116]:
#change the status to acquired
org_acquired_df_v3['status'] = 'acquired'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [118]:
org_acquired_df_v3.head(3)

Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date,first_fund_date,series_b_date,series_c_date,num_funding_rounds,closed_on,acquired_on
15,00771e9e-79d3-9afe-df5a-1d5f26dd30c8,Advanced BioHealing,acquired,2003-01-01,NaT,2004-10-05,2004-10-05,2005-11-21,2007-02-27,3.0,NaT,2011-05-17
30,0109360f-babd-5c36-8047-cf82bd8ce575,ChangeCoin,acquired,2013-01-01,2014-05-05,NaT,2014-05-05,NaT,NaT,2.0,2016-11-18,2016-04-13
56,01eda286-ae1b-f88c-46e1-1f6895445118,GroundLink,acquired,2004-07-10,NaT,2008-04-01,2008-04-01,NaT,NaT,2.0,2020-08-31,2017-01-27


In [119]:
#calculate the duration from founded to acquisition and drop inconsistent data
org_acquired_df_v3['duration_to_acquisition_days'] = org_acquired_df_v3['acquired_on'] - org_acquired_df_v3['founded_on']

import datetime as dt 
org_acquired_df_v3['duration_to_acquisition_years'] = (org_acquired_df_v3['duration_to_acquisition_days'].dt.days/365).astype(float).round(decimals=1)

#drop company with acquire date before founded date
mask = org_acquired_df_v3.duration_to_acquisition_years > 0
org_acquired_df_v4 = org_acquired_df_v3[mask]
len(org_acquired_df_v3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


431

In [122]:
#merge with main processing df
org_processing_df_v7 = pd.concat([org_acquired_df_v4[list(org_processing_df_v6.columns)],
                                  org_processing_df_v6[org_processing_df_v6['status'] != 'closed']],
                                 axis ='index',
                                 ignore_index=True)
len(org_processing_df_v7)

86821

In [125]:
org_processing_df_v7.head(3)

Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date,first_fund_date,series_b_date,series_c_date,num_funding_rounds
0,00771e9e-79d3-9afe-df5a-1d5f26dd30c8,Advanced BioHealing,acquired,2003-01-01,NaT,2004-10-05,2004-10-05,2005-11-21,2007-02-27,3.0
1,0109360f-babd-5c36-8047-cf82bd8ce575,ChangeCoin,acquired,2013-01-01,2014-05-05,NaT,2014-05-05,NaT,NaT,2.0
2,01eda286-ae1b-f88c-46e1-1f6895445118,GroundLink,acquired,2004-07-10,NaT,2008-04-01,2008-04-01,NaT,NaT,2.0


In [126]:
#get companies which are closed and not acquired
org_closed_df_v2 = org_closed_df_v1[org_closed_df_v1['acquired_on'].isna()]
len(org_closed_df_v2)

5696

In [127]:
org_closed_df_v2.dropna(subset=['closed_on'],inplace=True)
len(org_closed_df_v2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


3737

In [129]:
#calculate duration from founded to closed and drop companies with inconsistent data
org_closed_df_v2['duration_to_closed_days'] = org_closed_df_v2['closed_on'] - org_closed_df_v2['founded_on']
org_closed_df_v2['duration_to_closed_years'] = (org_closed_df_v2['duration_to_closed_days'].dt.days/365).astype(float).round(decimals=1)

mask = org_closed_df_v2['duration_to_closed_years'] > 0
org_closed_df_v3 = org_closed_df_v2[mask]
len(org_closed_df_v3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


3724

In [131]:
#merge closed companies with main data frame
org_processing_df_v8 = pd.concat([org_closed_df_v3[list(org_processing_df_v7)],
                                  org_processing_df_v7],
                                  axis=0,
                                  ignore_index=True)
len(org_processing_df_v8)

90545

In [132]:
org_processing_df_v8.head(3)

Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date,first_fund_date,series_b_date,series_c_date,num_funding_rounds
0,00107fd1-b65b-85cd-7d60-c00c4b2ae2fb,CloudAptitude,closed,2012-01-28,2012-01-29,NaT,2012-01-29,NaT,NaT,1.0
1,0013719b-13cb-9419-c604-597100dd642f,Win Win Slots,closed,2012-06-06,2013-01-01,NaT,2013-01-01,NaT,NaT,1.0
2,001c3ed9-6eb1-63b6-eeba-dcdb97f9444f,PetaData Labs SocialDNA,closed,2013-01-10,2014-05-12,NaT,2014-05-12,NaT,NaT,1.0


In [134]:
#compute duration from founded to first fund date
org_processing_df_v8['duration_founded_to_first_fund'] = ((org_processing_df_v8['first_fund_date'] - org_processing_df_v8['founded_on']).dt.days/365).astype(float).round(2)
org_processing_df_v8.head(3)

Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date,first_fund_date,series_b_date,series_c_date,num_funding_rounds,duration_founded_to_first_fund
0,00107fd1-b65b-85cd-7d60-c00c4b2ae2fb,CloudAptitude,closed,2012-01-28,2012-01-29,NaT,2012-01-29,NaT,NaT,1.0,0.0
1,0013719b-13cb-9419-c604-597100dd642f,Win Win Slots,closed,2012-06-06,2013-01-01,NaT,2013-01-01,NaT,NaT,1.0,0.57
2,001c3ed9-6eb1-63b6-eeba-dcdb97f9444f,PetaData Labs SocialDNA,closed,2013-01-10,2014-05-12,NaT,2014-05-12,NaT,NaT,1.0,1.33


In [135]:
#drop companies with first fund before founded
org_processing_df_v9 = org_processing_df_v8[org_processing_df_v8['duration_founded_to_first_fund'] >=0]
len(org_processing_df_v9)

89741

In [136]:
#drop companies received first fund after 4 years
org_processing_df_v10 = org_processing_df_v9[org_processing_df_v9['duration_founded_to_first_fund'] <=4]
len(org_processing_df_v10)

76806

In [137]:
org_processing_df_v10.head(3)

Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date,first_fund_date,series_b_date,series_c_date,num_funding_rounds,duration_founded_to_first_fund
0,00107fd1-b65b-85cd-7d60-c00c4b2ae2fb,CloudAptitude,closed,2012-01-28,2012-01-29,NaT,2012-01-29,NaT,NaT,1.0,0.0
1,0013719b-13cb-9419-c604-597100dd642f,Win Win Slots,closed,2012-06-06,2013-01-01,NaT,2013-01-01,NaT,NaT,1.0,0.57
2,001c3ed9-6eb1-63b6-eeba-dcdb97f9444f,PetaData Labs SocialDNA,closed,2013-01-10,2014-05-12,NaT,2014-05-12,NaT,NaT,1.0,1.33


In [139]:
#Calculate duration from first fund to next funding round
funding_rounds = ['series_a','series_b','series_c']

for invest_round in funding_rounds:
    new_column = str('duration_first_fund_to_' + invest_round)
    exist_column = str (invest_round+'_date')
    org_processing_df_v10[new_column] = ((org_processing_df_v10[exist_column] - org_processing_df_v10['first_fund_date']).dt.days/365).astype(float).round(2)

org_processing_df_v10.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,org_uuid,name,status,founded_on,seed_date,series_a_date,first_fund_date,series_b_date,series_c_date,num_funding_rounds,duration_founded_to_first_fund,duration_first_fund_to_series_a,duration_first_fund_to_series_b,duration_first_fund_to_series_c
0,00107fd1-b65b-85cd-7d60-c00c4b2ae2fb,CloudAptitude,closed,2012-01-28,2012-01-29,NaT,2012-01-29,NaT,NaT,1.0,0.0,,,
1,0013719b-13cb-9419-c604-597100dd642f,Win Win Slots,closed,2012-06-06,2013-01-01,NaT,2013-01-01,NaT,NaT,1.0,0.57,,,
2,001c3ed9-6eb1-63b6-eeba-dcdb97f9444f,PetaData Labs SocialDNA,closed,2013-01-10,2014-05-12,NaT,2014-05-12,NaT,NaT,1.0,1.33,,,


In [142]:
#assign dependent variable, 1 for success and 0 for non-success
org_processing_df_v10.reset_index(drop=True,inplace=True)
outcome = list()
end = len(org_processing_df_v10)

for i in range(0,end):
    company = org_processing_df_v10.loc[i]
    #set success condition for operating company
    #company got funding series_a in within 3 years from first funding, given first funding is seed fund
    cond1 = company['duration_first_fund_to_series_a'] <= 3 and company['duration_first_fund_to_series_a'] > 0
    #company got funding series_b 3 years from first funding
    cond2 = company['duration_first_fund_to_series_b'] <= 3 and company['duration_first_fund_to_series_b'] > 0 
    #company got funding series_c within 3 years from first funding
    cond3 = company['duration_first_fund_to_series_c'] <= 3 and company['duration_first_fund_to_series_c'] > 0
    
    #for company with closed status
    if company['status'] == 'closed':
        outcome.append(0)
    elif company ['status'] == 'acquired' or company['status'] == 'ipo':
        outcome.append (1)
    elif company ['status'] == 'operating':
        if cond1 or cond2 or cond3:
            outcome.append(1)
        else:
            outcome.append(0)
            
org_processing_df_v10['outcome'] = outcome

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [143]:
org_processing_df_v10['outcome'].value_counts()

0    58022
1    18784
Name: outcome, dtype: int64

In [145]:
success = (org_processing_df_v10[org_processing_df_v10.outcome == 1]).groupby('status').size().astype(int)
failure = (org_processing_df_v10[org_processing_df_v10.outcome == 0].groupby('status').size()).astype(int)

pd.concat ([success,failure], axis=1,keys = ['success','failure'])

Unnamed: 0_level_0,success,failure
status,Unnamed: 1_level_1,Unnamed: 2_level_1
acquired,7121.0,
ipo,871.0,
operating,10792.0,54486.0
closed,,3536.0


In [146]:
org_processing_df_v10.groupby('status').size()

status
acquired      7121
closed        3536
ipo            871
operating    65278
dtype: int64

In [147]:
len(org_processing_df_v10)

76806

In [148]:
#save processed dataframe with outcome
org_processing_df_v10.to_csv(R"d:/msc-project/data/final/outcome_final.csv",encoding='utf-8',index=False)