In [1]:
#Import basic packages
import os
import numpy as np
import pandas as pd
import csv
import regex as re
import datetime as dt 

In [2]:
#import outcome file with org_uuid
main_df = pd.read_csv(R"d:/msc-project/data/final/outcome_final_v1.csv",encoding='utf-8',
                         index_col=False,
                         parse_dates= ['founded_on','seed_date','series_a_date','series_b_date','series_c_date'])
main_processing_df = main_df[['org_uuid']]

In [3]:
main_processing_df.head(3)

Unnamed: 0,org_uuid
0,9dc17185-743d-5fbc-f03b-ad0c81c8795d
1,9c8adac6-5c8e-9344-b763-6beab966c63c
2,6749cc07-efed-ff09-4efe-43faf6f580de


In [4]:
len(main_processing_df)

28727

In [5]:
#import jobs file
jobs_df = pd.read_csv(R"d:/msc-project/data/pre-processed/jobs_preprocessed.csv",encoding='utf-8',
                         index_col=False,
                         parse_dates= ['started_on','ended_on'])
jobs_df.head(3)

Unnamed: 0,job_uuid,job_name,people_uuid,person_name,org_uuid,org_name,started_on,ended_on,is_current,title,job_type,is_founder
0,697b6934-fc1f-9d63-cfb2-1a10759b378e,Ben Elowitz Co-Founder/CEO/Board of Directors ...,ed13cd36-fe2b-3707-197b-0c2d56e37a71,Ben Elowitz,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,,,False,Co-Founder/CEO/Board of Directors,executive,1
1,b1de3765-442e-b556-9304-551c2a055901,Kevin Flaherty VP Marketing @ Wetpaint,5ceca97b-493c-1446-6249-5aaa33464763,Kevin Flaherty,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,,,False,VP Marketing,executive,0
2,1319cd30-f5e8-c700-0af6-64029c6f7124,Raju Vegesna Chief Evangelist @ Zoho,9f99a98a-aa97-b30b-0d36-db67c1d277e0,Raju Vegesna,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,2000-11-01,,True,Chief Evangelist,employee,0


# Assign Number of founder for each organisation

In [6]:
num_founder = pd.DataFrame(jobs_df.groupby('org_uuid')['is_founder'].sum())
main_processing_df_v1 = pd.merge(main_processing_df,
                         num_founder,
                         on='org_uuid',
                         how='left'
                         )
main_processing_df_v1.rename(columns = {'is_founder':'num_founder'},inplace=True)
main_processing_df_v1.head(3)

Unnamed: 0,org_uuid,num_founder
0,9dc17185-743d-5fbc-f03b-ad0c81c8795d,1.0
1,9c8adac6-5c8e-9344-b763-6beab966c63c,
2,6749cc07-efed-ff09-4efe-43faf6f580de,1.0


In [7]:
#assign 0 for companies with no information of founder
main_processing_df_v2 = main_processing_df_v1.fillna(0)

In [8]:
len(main_processing_df_v2)

28727

In [9]:
main_processing_df_v2.head(5)

Unnamed: 0,org_uuid,num_founder
0,9dc17185-743d-5fbc-f03b-ad0c81c8795d,1.0
1,9c8adac6-5c8e-9344-b763-6beab966c63c,0.0
2,6749cc07-efed-ff09-4efe-43faf6f580de,1.0
3,bbc29f4a-a798-4f74-f0a0-b1d8267692ac,0.0
4,9bbaf540-5994-029a-3346-501ad2d1fb89,2.0


# Assign Gender Diversity Features

In [10]:
founder_df = jobs_df[jobs_df['is_founder'] == 1]
founder_gb = founder_df.groupby('org_uuid')

In [11]:
people_df = pd.read_csv(R"d:/msc-project/data/pre-processed/people_preprocessed.csv",encoding='utf-8',
                         index_col=False)

In [12]:
people_df.columns

Index(['people_uuid', 'people_name', 'first_name', 'last_name', 'gender',
       'country_code', 'state_code', 'region', 'city', 'org_uuid',
       'featured_job_organization_name', 'featured_job_title', 'facebook_url',
       'linkedin_url', 'twitter_url', 'logo_url', 'description'],
      dtype='object')

In [13]:
founder_gender_df = pd.merge(founder_df[['people_uuid','org_uuid']],
                             people_df[['people_uuid','gender']],
                             on='people_uuid',
                             how='left')

In [14]:
founder_gender_df.head(3)

Unnamed: 0,people_uuid,org_uuid,gender
0,ed13cd36-fe2b-3707-197b-0c2d56e37a71,e1393508-30ea-8a36-3f96-dd3226033abd,male
1,a01b8d46-d311-3333-7c34-aa3ae9c03f22,df662812-7f97-0b43-9d3e-12f64f504fbb,male
2,084aaa07-0795-1fe8-9c46-98bbeb02cd64,df662812-7f97-0b43-9d3e-12f64f504fbb,male


In [15]:
len(founder_df)

332569

In [16]:
len(founder_gender_df)

332569

In [17]:
founder_gender_gb = founder_gender_df.groupby('org_uuid')

In [18]:
is_founder_gender_diversity = list()

for org_uuid in list(main_processing_df_v2['org_uuid']):
    founders_gender = list(founder_gender_df[founder_gender_df['org_uuid'] == org_uuid]['gender']) #create list of founder's gender for one organisation
    if 'male' in founders_gender and 'female' in founders_gender: #assign 1 if male and female in gender list
        is_founder_gender_diversity.append(1)
    else:
        is_founder_gender_diversity.append(0)

main_processing_df_v2['is_founder_gender_diversity'] = is_founder_gender_diversity

In [19]:
len(is_founder_gender_diversity)

28727

In [20]:
is_founder_gender_diversity.count(1)

1733

In [21]:
len(main_processing_df_v2)

28727

In [22]:
main_processing_df_v2.head(3)

Unnamed: 0,org_uuid,num_founder,is_founder_gender_diversity
0,9dc17185-743d-5fbc-f03b-ad0c81c8795d,1.0,0
1,9c8adac6-5c8e-9344-b763-6beab966c63c,0.0,0
2,6749cc07-efed-ff09-4efe-43faf6f580de,1.0,0


# Assigning number of founder with degree level, ivy league, and top100 to organisation.

In [23]:
#import processed degree file
deg_df = pd.read_csv(R"d:/msc-project/data/pre-processed/degrees_preprocessed.csv",encoding='utf-8',
                         index_col=False)
deg_df.head(3)

Unnamed: 0,deg_uuid,deg_name,people_uuid,person_name,institution_uuid,institution_name,degree_type,subject,started_on,completed_on,is_completed,is_bachelor,is_master,is_phd,is_mba,is_stem,is_ivy_league,is_top_100
0,205fdfd1-ecac-aa43-262f-219f11755f67,MS Mass Communication @ Boston University,4897dba9-3141-ecc0-2c4b-c9d844e6440f,John Green,1eab62d2-15d9-0db7-930f-2aa77d4688e1,Boston University,MS,Mass Communication,,1992-01-01,True,0,1,0,0,1,0,1
1,1a2ac288-eb99-3318-fde5-1517bc168f51,"BA English, French @ Washington University in...",4897dba9-3141-ecc0-2c4b-c9d844e6440f,John Green,6ae9957a-8fb4-0ab1-73fa-dd547c4d3da4,Washington University in St. Louis,BA,"English, French",,1990-01-01,True,1,0,0,0,0,0,0
2,b978d338-7ccc-7469-5ce7-ef98c34155ad,MS Internet Technology @ University of Greenwich,7d187b77-94f7-e6cc-6981-d7468db5968f,Sridhar Gundaiah,b5ea73f6-12a3-576d-ae9b-f4169147f974,University of Greenwich,MS,Internet Technology,,2006-01-01,True,0,1,0,0,1,0,0


In [24]:
#merge founder df with main processing to compress size of founder df to only companies in sample
founder_df_v1 = pd.merge(main_processing_df_v2['org_uuid'],
                         founder_df,
                         on='org_uuid',
                         how='left')
len(founder_df_v1)

47496

In [25]:
#merge degree with founder_df
deg_type_df = deg_df[['people_uuid','is_bachelor','is_master','is_phd','is_mba','is_stem','is_ivy_league','is_top_100']]
founder_degree_df = pd.merge(founder_df_v1[['org_uuid','people_uuid']],
                             deg_type_df,
                             on='people_uuid',
                             how='left')
founder_degree_df.fillna(value=0,inplace=True)
len(founder_degree_df)

51787

In [26]:
founder_degree_df.head(3)

Unnamed: 0,org_uuid,people_uuid,is_bachelor,is_master,is_phd,is_mba,is_stem,is_ivy_league,is_top_100
0,9dc17185-743d-5fbc-f03b-ad0c81c8795d,31dabae0-f52d-7a27-4f76-51f4d4dfa04c,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9c8adac6-5c8e-9344-b763-6beab966c63c,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6749cc07-efed-ff09-4efe-43faf6f580de,d5e595c1-9ccd-4f89-8c9b-c8cc3c17076d,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [27]:
#sum the number of founder with degree classification using groupby method
founder_degree_df_v1 = founder_degree_df[['org_uuid','is_bachelor','is_master','is_phd','is_mba','is_stem',
                                          'is_ivy_league','is_top_100']].groupby('org_uuid').sum()
len(founder_degree_df_v1)

28727

In [28]:
founder_degree_df_v1.head(3)

Unnamed: 0_level_0,is_bachelor,is_master,is_phd,is_mba,is_stem,is_ivy_league,is_top_100
org_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
00002470-bff7-6226-5800-0ca1b3787b6f,2.0,1.0,0.0,0.0,3.0,0.0,3.0
0000d497-c93a-eea3-eeb0-a943dfb4f71e,1.0,0.0,0.0,0.0,1.0,0.0,0.0
00040dc9-f822-267c-9cf8-1c9b92ca0588,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [29]:
#rename folder to reflect features
new_column_names = list ()

for column_name in founder_degree_df_v1.columns: 
    new_column_names.append('num' + column_name[2:])
    
new_column_dict = dict(zip(founder_degree_df_v1.columns,new_column_names))
founder_degree_df_v2 = founder_degree_df_v1.rename(columns=new_column_dict)

In [30]:
founder_degree_df_v2.describe()

Unnamed: 0,num_bachelor,num_master,num_phd,num_mba,num_stem,num_ivy_league,num_top_100
count,28727.0,28727.0,28727.0,28727.0,28727.0,28727.0,28727.0
mean,0.263759,0.174226,0.046507,0.066244,0.206948,0.038292,0.142061
std,0.617813,0.532028,0.250173,0.290784,0.596197,0.243689,0.517104
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,11.0,22.0,5.0,7.0,16.0,8.0,9.0


In [31]:
founder_degree_df_v2.reset_index(inplace=True)

In [32]:
founder_degree_df_v2.head(3)

Unnamed: 0,org_uuid,num_bachelor,num_master,num_phd,num_mba,num_stem,num_ivy_league,num_top_100
0,00002470-bff7-6226-5800-0ca1b3787b6f,2.0,1.0,0.0,0.0,3.0,0.0,3.0
1,0000d497-c93a-eea3-eeb0-a943dfb4f71e,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,00040dc9-f822-267c-9cf8-1c9b92ca0588,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [33]:
main_processing_df_v3 = pd.merge(main_processing_df_v2,
                                 founder_degree_df_v2,
                                 on='org_uuid',
                                 how = 'left')

In [34]:
main_processing_df_v3.head(3)

Unnamed: 0,org_uuid,num_founder,is_founder_gender_diversity,num_bachelor,num_master,num_phd,num_mba,num_stem,num_ivy_league,num_top_100
0,9dc17185-743d-5fbc-f03b-ad0c81c8795d,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9c8adac6-5c8e-9344-b763-6beab966c63c,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6749cc07-efed-ff09-4efe-43faf6f580de,1.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [35]:
#assign 0 to founder with no degree information
main_processing_df_v3.isnull().sum(axis=0)
main_processing_df_v3.fillna(0,inplace=True)

In [36]:
main_processing_df_v3.head(10)

Unnamed: 0,org_uuid,num_founder,is_founder_gender_diversity,num_bachelor,num_master,num_phd,num_mba,num_stem,num_ivy_league,num_top_100
0,9dc17185-743d-5fbc-f03b-ad0c81c8795d,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9c8adac6-5c8e-9344-b763-6beab966c63c,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6749cc07-efed-ff09-4efe-43faf6f580de,1.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,bbc29f4a-a798-4f74-f0a0-b1d8267692ac,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9bbaf540-5994-029a-3346-501ad2d1fb89,2.0,0,2.0,0.0,0.0,0.0,2.0,0.0,0.0
5,45165225-a03d-42be-6af1-376341b7b815,1.0,0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
6,ade298e4-0c78-7ce8-4693-b03f82336417,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,5e96c2a8-d04e-0f06-84da-f4bd4943f74d,2.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,db46f01e-b991-9b33-6de6-5a349b5bfdaa,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,540f9a7a-18d7-659a-a6d3-9abe42123df3,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Assign number of advisor

In [37]:
jobs_df['job_type'].value_counts()

executive         967719
employee          395731
board_member      154586
advisor            57657
board_observer      5948
Name: job_type, dtype: int64

In [38]:
jobs_advisor_df = jobs_df[jobs_df['job_type'] == 'advisor']

In [39]:
jobs_advisor_df.head(3)

Unnamed: 0,job_uuid,job_name,people_uuid,person_name,org_uuid,org_name,started_on,ended_on,is_current,title,job_type,is_founder
247,b8902477-3f1d-4147-5bca-f053b3491b75,Brad Gerstner Investor / Advisor / Board Membe...,46daadf8-9ab9-1f94-24e2-f83ad6d4ad7f,Brad Gerstner,5c6f4d9e-dc72-e018-ee21-71cd23f698ad,Farecast,2006-01-01,2008-01-01,False,Investor / Advisor / Board Member,advisor,0
353,8986d170-77a9-a918-fe22-b8de983d1e42,Scott Rafer Advisor @ Dogster,eaadb11b-7423-0d5d-0a9d-7bf93eb64579,Scott Rafer,b683af1f-8dcf-8f82-37d9-88fbadd63e1b,Dogster,,,True,Advisor,advisor,0
444,70e011ab-472a-9f50-77fe-718af9ae9fe6,George Martin Advisor @ iLike,651ba47c-9aef-f811-a39d-d917677aded4,George Martin,3091d3e8-7934-1e90-7c82-7f1c3caf53cb,iLike,,,False,Advisor,advisor,0


In [40]:
advisor_gb = jobs_advisor_df.groupby('org_uuid')

In [41]:
num_advisor = advisor_gb.size()
num_advisor = pd.DataFrame(num_advisor,
                          columns=['num_advisor'])

In [42]:
main_processing_df_v4 = pd.merge(main_processing_df_v3,
                                 num_advisor,
                                 on='org_uuid',
                                 how='left')

In [43]:
main_processing_df_v4.head(3)

Unnamed: 0,org_uuid,num_founder,is_founder_gender_diversity,num_bachelor,num_master,num_phd,num_mba,num_stem,num_ivy_league,num_top_100,num_advisor
0,9dc17185-743d-5fbc-f03b-ad0c81c8795d,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,9c8adac6-5c8e-9344-b763-6beab966c63c,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,6749cc07-efed-ff09-4efe-43faf6f580de,1.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,


In [44]:
main_processing_df_v4['num_advisor'].fillna(0,inplace=True)

In [45]:
main_processing_df_v4.head(3)

Unnamed: 0,org_uuid,num_founder,is_founder_gender_diversity,num_bachelor,num_master,num_phd,num_mba,num_stem,num_ivy_league,num_top_100,num_advisor
0,9dc17185-743d-5fbc-f03b-ad0c81c8795d,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9c8adac6-5c8e-9344-b763-6beab966c63c,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6749cc07-efed-ff09-4efe-43faf6f580de,1.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# Work Experience

In [46]:
main_processing_df_v4.head(3)

Unnamed: 0,org_uuid,num_founder,is_founder_gender_diversity,num_bachelor,num_master,num_phd,num_mba,num_stem,num_ivy_league,num_top_100,num_advisor
0,9dc17185-743d-5fbc-f03b-ad0c81c8795d,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9c8adac6-5c8e-9344-b763-6beab966c63c,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6749cc07-efed-ff09-4efe-43faf6f580de,1.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [47]:
jobs_df.head(3)

Unnamed: 0,job_uuid,job_name,people_uuid,person_name,org_uuid,org_name,started_on,ended_on,is_current,title,job_type,is_founder
0,697b6934-fc1f-9d63-cfb2-1a10759b378e,Ben Elowitz Co-Founder/CEO/Board of Directors ...,ed13cd36-fe2b-3707-197b-0c2d56e37a71,Ben Elowitz,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,,,False,Co-Founder/CEO/Board of Directors,executive,1
1,b1de3765-442e-b556-9304-551c2a055901,Kevin Flaherty VP Marketing @ Wetpaint,5ceca97b-493c-1446-6249-5aaa33464763,Kevin Flaherty,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,,,False,VP Marketing,executive,0
2,1319cd30-f5e8-c700-0af6-64029c6f7124,Raju Vegesna Chief Evangelist @ Zoho,9f99a98a-aa97-b30b-0d36-db67c1d277e0,Raju Vegesna,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,2000-11-01,,True,Chief Evangelist,employee,0


In [48]:
jobs_df_v1= jobs_df [['job_uuid','people_uuid','org_uuid','started_on','ended_on','is_current']]

In [49]:
jobs_df_v1.head(3)

Unnamed: 0,job_uuid,people_uuid,org_uuid,started_on,ended_on,is_current
0,697b6934-fc1f-9d63-cfb2-1a10759b378e,ed13cd36-fe2b-3707-197b-0c2d56e37a71,e1393508-30ea-8a36-3f96-dd3226033abd,,,False
1,b1de3765-442e-b556-9304-551c2a055901,5ceca97b-493c-1446-6249-5aaa33464763,e1393508-30ea-8a36-3f96-dd3226033abd,,,False
2,1319cd30-f5e8-c700-0af6-64029c6f7124,9f99a98a-aa97-b30b-0d36-db67c1d277e0,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,2000-11-01,,True


In [50]:
len(jobs_df_v1)

1581641

In [51]:
#drop jobs without start date. Start date is crucial to determine if jobs is before founder found company. 
jobs_df_v2 = jobs_df_v1#.dropna(subset=['started_on'])
jobs_df_v2.isnull().sum(axis=0)

job_uuid             0
people_uuid          0
org_uuid             0
started_on      797831
ended_on       1288515
is_current           0
dtype: int64

In [52]:
len(jobs_df_v2)

1581641

In [53]:
jobs_df_v3 = jobs_df_v2[jobs_df_v2['started_on'] > '1980-01-01']

In [54]:
jobs_df_v3['started_on'] = pd.to_datetime(jobs_df_v3['started_on'])
jobs_df_v3['ended_on'] = pd.to_datetime(jobs_df_v3['ended_on'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [55]:
len(jobs_df_v3)

779743

In [56]:
org_ori_df = pd.read_csv(R"d:\\msc-project\data\bulk_export_122020\organizations.csv",encoding='utf-8',
                         parse_dates= ['founded_on'])

In [57]:
org_ori_df.columns

Index(['uuid', 'name', 'type', 'permalink', 'cb_url', 'rank', 'created_at',
       'updated_at', 'legal_name', 'roles', 'domain', 'homepage_url',
       'country_code', 'state_code', 'region', 'city', 'address',
       'postal_code', 'status', 'short_description', 'category_list',
       'category_groups_list', 'num_funding_rounds', 'total_funding_usd',
       'total_funding', 'total_funding_currency_code', 'founded_on',
       'last_funding_on', 'closed_on', 'employee_count', 'email', 'phone',
       'facebook_url', 'linkedin_url', 'twitter_url', 'logo_url', 'alias1',
       'alias2', 'alias3', 'primary_role', 'num_exits'],
      dtype='object')

In [58]:
jobs_df_v4 = pd.merge(jobs_df_v3,
                      org_ori_df[['uuid','category_groups_list']],
                      left_on='org_uuid',
                      right_on='uuid',
                      how='left')
len(jobs_df_v4)

779743

In [59]:
founder_df_v1.head(3)

Unnamed: 0,org_uuid,job_uuid,job_name,people_uuid,person_name,org_name,started_on,ended_on,is_current,title,job_type,is_founder
0,9dc17185-743d-5fbc-f03b-ad0c81c8795d,4a470f5a-45fe-d892-f0e7-515069f107f6,"Greg Raleigh Founder, CEO and President @ Airg...",31dabae0-f52d-7a27-4f76-51f4d4dfa04c,Greg Raleigh,Airgo Networks,2001-01-01,2006-01-01,False,"Founder, CEO and President",executive,1.0
1,9c8adac6-5c8e-9344-b763-6beab966c63c,,,,,,,,,,,
2,6749cc07-efed-ff09-4efe-43faf6f580de,3759384a-da72-f38a-02dc-3d28e10f6c4b,Andrey Zarur Founder & CEO @ BioProcessors,d5e595c1-9ccd-4f89-8c9b-c8cc3c17076d,Andrey Zarur,BioProcessors,2002-01-01,2006-12-01,False,Founder & CEO,executive,1.0


In [60]:
len(founder_df_v1)

47496

In [61]:
founder_df_v1 = founder_df_v1 [['org_uuid','job_uuid','people_uuid']]

In [62]:
founder_df_v2 = pd.merge(founder_df_v1,
                         org_ori_df[['uuid','founded_on','category_groups_list']],
                         left_on='org_uuid',
                         right_on='uuid',how='left')

In [63]:
founder_df_v2['founded_on'] = pd.to_datetime(founder_df_v2['founded_on'])

In [64]:
founder_df_v2.head(3)

Unnamed: 0,org_uuid,job_uuid,people_uuid,uuid,founded_on,category_groups_list
0,9dc17185-743d-5fbc-f03b-ad0c81c8795d,4a470f5a-45fe-d892-f0e7-515069f107f6,31dabae0-f52d-7a27-4f76-51f4d4dfa04c,9dc17185-743d-5fbc-f03b-ad0c81c8795d,2000-05-01,"Hardware,Mobile,Software"
1,9c8adac6-5c8e-9344-b763-6beab966c63c,,,9c8adac6-5c8e-9344-b763-6beab966c63c,2000-10-01,"Information Technology,Other,Privacy and Security"
2,6749cc07-efed-ff09-4efe-43faf6f580de,3759384a-da72-f38a-02dc-3d28e10f6c4b,d5e595c1-9ccd-4f89-8c9b-c8cc3c17076d,6749cc07-efed-ff09-4efe-43faf6f580de,2000-10-01,"Biotechnology,Health Care,Science and Engineering"


In [65]:
#reduce the size of jobs_df by filtering to only relevant companies
jobs_df_v5 = pd.merge(founder_df_v2['people_uuid'],
                     jobs_df_v4,
                     on='people_uuid',
                     how='left')
len(jobs_df_v5)

86488

In [66]:
jobs_df_v5.head(3)

Unnamed: 0,people_uuid,job_uuid,org_uuid,started_on,ended_on,is_current,uuid,category_groups_list
0,31dabae0-f52d-7a27-4f76-51f4d4dfa04c,fc988918-44f5-d3c3-d221-7c396aa41bc2,d98fc401-0131-96ec-b5a3-392823d11d48,2008-11-01,NaT,False,d98fc401-0131-96ec-b5a3-392823d11d48,"Consumer Electronics,Hardware,Mobile,Software"
1,31dabae0-f52d-7a27-4f76-51f4d4dfa04c,2a70709e-d305-6e73-f7d6-8b005070625b,3e3e8a2f-3700-f2c9-6568-4aaae02c07c6,2006-01-01,2008-01-01,False,3e3e8a2f-3700-f2c9-6568-4aaae02c07c6,"Hardware,Manufacturing,Mobile,Software"
2,31dabae0-f52d-7a27-4f76-51f4d4dfa04c,11e77a70-da8a-7f66-f4ac-079caa2507c5,0f01774b-68ba-a23a-f743-a91f02d333c2,2008-01-01,NaT,True,0f01774b-68ba-a23a-f743-a91f02d333c2,"Financial Services,Health Care,Lending and Inv..."


In [67]:
#create a list of companies that being acquired and ipo
mask1 = org_ori_df['status'] == 'acquired'
mask2 = org_ori_df['status'] == 'ipo'
set_successful_companies = set (org_ori_df[mask1 | mask2]['uuid'])
len(set_successful_companies)

131425

In [68]:
%%time

#create list to capture founder experience in years, number for companies, previously successful, and workin in exactly same sector
founder_experience_num_years = list ()
founder_experience_num_companies = list()
founder_experience_num_successful = list ()
founder_experience_num_same_category = list ()

#get features from founders' experience
end = len(founder_df_v2)

for i in range (0,end):
    founder = founder_df_v2.iloc[i]
    founded_date = founder['founded_on']
    people_uuid = founder['people_uuid']
    org_uuid = founder['org_uuid']
    
    #create pd of jobs for one founder and filter
    founder_experience_df = jobs_df_v5[jobs_df_v5['people_uuid'] == people_uuid] 
    founder_experience_df_v1 = founder_experience_df[founder_experience_df['org_uuid'] != org_uuid] #drop the company founded
    founder_experience_df_v2 = founder_experience_df_v1[founder_experience_df_v1['started_on'] < founded_date] #filter to experience only before found compan
    
    #no experience found after filtering. append 0 for all features
    if len(founder_experience_df_v2) == 0: 
        founder_experience_num_years.append(0)
        founder_experience_num_companies.append(0)
        founder_experience_num_successful.append(0)
        founder_experience_num_same_category.append(0)
        
    else:
            
        #compute years of experience
        num_years = round ((founded_date - founder_experience_df_v2['started_on'].min()).days/365,1) 
        founder_experience_num_years.append(num_years) #append years of experience to main list

        #count the number of companies
        num_companies = len(list(founder_experience_df_v2['org_uuid'].unique())) #count the number of companies by looking at unique org_uuid
        founder_experience_num_companies.append(num_companies) #append to main list
            
        #count the number of company/ies previously ipoed or acquired
        set_companies_work_uuid = set(founder_experience_df_v2['org_uuid'].unique())
        num_successful_companies = len (set_companies_work_uuid.intersection(set_successful_companies))
        founder_experience_num_successful.append(num_successful_companies)
                
        #count number of sector that overlapped with company found
        founder_experience_df_v3 = founder_experience_df_v2[founder_experience_df_v2['category_groups_list'].notna()] #filter out company with no category groups list
        set_category_founded = set(founder['category_groups_list'].split(','))
        set_category_experience = set()
        for group_list in list(founder_experience_df_v3['category_groups_list']):
            group_list = group_list.split(',')
            for group in group_list:
                set_category_experience.add(group)
        
        #assign the score based on number of categories overlapped between company founded and experience
        founder_experience_num_same_category.append(len(set_category_experience.intersection(set_category_founded)))

Wall time: 5min 59s


In [69]:
len(founder_df_v2)

47496

In [70]:
founder_experience_num_years.count(0)

38460

In [71]:
founder_experience_num_companies.count(0)

38431

In [72]:
founder_experience_num_successful.count(0)

42343

In [73]:
founder_experience_num_same_category.count(0)

40862

In [74]:
#assign founder experience to founder
founder_df_v2['founder_experience_num_years'] = founder_experience_num_years
founder_df_v2['founder_experience_num_companies'] = founder_experience_num_companies
founder_df_v2['founder_experience_num_successful'] = founder_experience_num_successful
founder_df_v2['founder_experience_num_same_category'] = founder_experience_num_same_category

In [75]:
founder_df_v2.head(3)

Unnamed: 0,org_uuid,job_uuid,people_uuid,uuid,founded_on,category_groups_list,founder_experience_num_years,founder_experience_num_companies,founder_experience_num_successful,founder_experience_num_same_category
0,9dc17185-743d-5fbc-f03b-ad0c81c8795d,4a470f5a-45fe-d892-f0e7-515069f107f6,31dabae0-f52d-7a27-4f76-51f4d4dfa04c,9dc17185-743d-5fbc-f03b-ad0c81c8795d,2000-05-01,"Hardware,Mobile,Software",16.3,3,2,3
1,9c8adac6-5c8e-9344-b763-6beab966c63c,,,9c8adac6-5c8e-9344-b763-6beab966c63c,2000-10-01,"Information Technology,Other,Privacy and Security",0.0,0,0,0
2,6749cc07-efed-ff09-4efe-43faf6f580de,3759384a-da72-f38a-02dc-3d28e10f6c4b,d5e595c1-9ccd-4f89-8c9b-c8cc3c17076d,6749cc07-efed-ff09-4efe-43faf6f580de,2000-10-01,"Biotechnology,Health Care,Science and Engineering",0.0,0,0,0


In [76]:
founder_df_v3 = founder_df_v2[['org_uuid','founder_experience_num_years','founder_experience_num_companies',
                              'founder_experience_num_successful','founder_experience_num_same_category']].groupby('org_uuid').sum()

In [77]:
founder_df_v3.head(3)

Unnamed: 0_level_0,founder_experience_num_years,founder_experience_num_companies,founder_experience_num_successful,founder_experience_num_same_category
org_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00002470-bff7-6226-5800-0ca1b3787b6f,9.6,2,0,3
0000d497-c93a-eea3-eeb0-a943dfb4f71e,0.0,0,0,0
00040dc9-f822-267c-9cf8-1c9b92ca0588,7.6,3,2,1


In [78]:
founder_df_v3.reset_index(inplace=True)

In [79]:
main_processing_df_v5 = pd.merge(main_processing_df_v4,
                                founder_df_v3,
                                on='org_uuid',
                                how='left')
main_processing_df_v5.head(3)

Unnamed: 0,org_uuid,num_founder,is_founder_gender_diversity,num_bachelor,num_master,num_phd,num_mba,num_stem,num_ivy_league,num_top_100,num_advisor,founder_experience_num_years,founder_experience_num_companies,founder_experience_num_successful,founder_experience_num_same_category
0,9dc17185-743d-5fbc-f03b-ad0c81c8795d,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.3,3,2,3
1,9c8adac6-5c8e-9344-b763-6beab966c63c,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
2,6749cc07-efed-ff09-4efe-43faf6f580de,1.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0


In [80]:
len(main_processing_df_v5)

28727

# Save final file

In [81]:
len(main_processing_df_v5)

28727

In [82]:
main_processing_df_v5.head(3)

Unnamed: 0,org_uuid,num_founder,is_founder_gender_diversity,num_bachelor,num_master,num_phd,num_mba,num_stem,num_ivy_league,num_top_100,num_advisor,founder_experience_num_years,founder_experience_num_companies,founder_experience_num_successful,founder_experience_num_same_category
0,9dc17185-743d-5fbc-f03b-ad0c81c8795d,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.3,3,2,3
1,9c8adac6-5c8e-9344-b763-6beab966c63c,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
2,6749cc07-efed-ff09-4efe-43faf6f580de,1.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0


# Rename Column and Save File

In [83]:
main_processing_df_v5.set_index('org_uuid',inplace=True)

In [84]:
new_column_name = list()
for column in list(main_processing_df_v5.columns):
    new_column_name.append('people_' + column)
new_column_dict = dict(zip(main_processing_df_v5.columns,new_column_name))
main_processing_df_v5.rename(columns = new_column_dict,inplace=True)
main_processing_df_v5.head(3)

Unnamed: 0_level_0,people_num_founder,people_is_founder_gender_diversity,people_num_bachelor,people_num_master,people_num_phd,people_num_mba,people_num_stem,people_num_ivy_league,people_num_top_100,people_num_advisor,people_founder_experience_num_years,people_founder_experience_num_companies,people_founder_experience_num_successful,people_founder_experience_num_same_category
org_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
9dc17185-743d-5fbc-f03b-ad0c81c8795d,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.3,3,2,3
9c8adac6-5c8e-9344-b763-6beab966c63c,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
6749cc07-efed-ff09-4efe-43faf6f580de,1.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0


In [87]:
list(main_processing_df_v5.columns)

['people_num_founder',
 'people_is_founder_gender_diversity',
 'people_num_bachelor',
 'people_num_master',
 'people_num_phd',
 'people_num_mba',
 'people_num_stem',
 'people_num_ivy_league',
 'people_num_top_100',
 'people_num_advisor',
 'people_founder_experience_num_years',
 'people_founder_experience_num_companies',
 'people_founder_experience_num_successful',
 'people_founder_experience_num_same_category']

In [88]:
len(list(main_processing_df_v5))

14

In [85]:
main_processing_df_v5.to_csv(r'd:\msc-project\data\final\features_people.csv',encoding='utf=8')