In [1]:
#Import basic packages
import os
import numpy as np
import pandas as pd
import csv
import regex as re
import datetime as dt 

In [2]:
#import outcome file with org_uuid
main_df = pd.read_csv(R"d:/msc-project/data/final/outcome_final_v1.csv",encoding='utf-8',
                         index_col=False,
                         parse_dates= ['founded_on','seed_date','series_a_date','series_b_date','series_c_date'])
main_processing_df = main_df[['org_uuid','name','outcome']]

In [3]:
main_processing_df.head(3)

Unnamed: 0,org_uuid,name,outcome
0,eb37f7c8-c036-a915-bb7c-63f445330f66,Ceyba,0
1,0a88fa0e-7abe-c4ac-ab69-877b01a9cb60,IceFyre Semiconductor,0
2,de462c42-b0a5-c5af-9637-ec426b4e991f,ORMvision,0


In [4]:
len(main_processing_df)

32632

In [5]:
#import jobs file
jobs_df = pd.read_csv(R"d:/msc-project/data/pre-processed/jobs_preprocessed.csv",encoding='utf-8',
                         index_col=False,
                         parse_dates= ['started_on','ended_on'])
jobs_df.head(3)

Unnamed: 0,job_uuid,job_name,people_uuid,person_name,org_uuid,org_name,started_on,ended_on,is_current,title,job_type,is_founder
0,697b6934-fc1f-9d63-cfb2-1a10759b378e,Ben Elowitz Co-Founder/CEO/Board of Directors ...,ed13cd36-fe2b-3707-197b-0c2d56e37a71,Ben Elowitz,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,,,False,Co-Founder/CEO/Board of Directors,executive,1
1,b1de3765-442e-b556-9304-551c2a055901,Kevin Flaherty VP Marketing @ Wetpaint,5ceca97b-493c-1446-6249-5aaa33464763,Kevin Flaherty,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,,,False,VP Marketing,executive,0
2,1319cd30-f5e8-c700-0af6-64029c6f7124,Raju Vegesna Chief Evangelist @ Zoho,9f99a98a-aa97-b30b-0d36-db67c1d277e0,Raju Vegesna,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,2000-11-01,,True,Chief Evangelist,employee,0


# Assign Number of founder for each organisation

In [6]:
num_founder = pd.DataFrame(jobs_df.groupby('org_uuid')['is_founder'].sum())
main_processing_df_v1 = pd.merge(main_processing_df,
                         num_founder,
                         on='org_uuid',
                         how='left'
                         )
main_processing_df_v1.rename(columns = {'is_founder':'num_founder'},inplace=True)
main_processing_df_v1.head(3)

Unnamed: 0,org_uuid,name,outcome,num_founder
0,eb37f7c8-c036-a915-bb7c-63f445330f66,Ceyba,0,0.0
1,0a88fa0e-7abe-c4ac-ab69-877b01a9cb60,IceFyre Semiconductor,0,1.0
2,de462c42-b0a5-c5af-9637-ec426b4e991f,ORMvision,0,1.0


In [7]:
main_processing_df_v1[main_processing_df_v1['num_founder'].isna()]

Unnamed: 0,org_uuid,name,outcome,num_founder
6,88206b5f-eefe-e06c-5a84-0dedd98dd529,"Sorrent, Inc",0,
12,722fd5dc-f201-fe36-551e-09895bba7d6e,Prominence Networks,0,
19,e4d0dca6-64e0-a455-fb34-4d86e9099daa,MarketRange,0,
22,2a2ddece-2eb4-96b7-8cce-03cd7facb61d,Improvista Interactive Music,0,
31,0a4f84dd-3f55-0846-ee4e-ed8eceeb43ef,You Software,0,
...,...,...,...,...
32625,4bc7e1e8-a99b-464f-928f-0c9a4af898a2,Joydream Technology,0,
32626,1be54494-1a8d-4a69-b6f8-89d17df50ff2,WUDENGDAI.COM,0,
32627,2d84ad66-77db-4cff-b0bd-33dc18ebe000,Shenzhen Litra Technology,0,
32630,58fc9fa4-09a9-4498-8d1a-170cb8107010,Xianchang Shidai,0,


In [8]:
#drop companies with no founder info
main_processing_df_v1.dropna(subset=['num_founder'],inplace=True)
main_processing_df_v2 = main_processing_df_v1[main_processing_df_v1['num_founder'] > 0]

In [9]:
len(main_processing_df_v2)

23036

In [10]:
main_processing_df_v2.head(5)

Unnamed: 0,org_uuid,name,outcome,num_founder
1,0a88fa0e-7abe-c4ac-ab69-877b01a9cb60,IceFyre Semiconductor,0,1.0
2,de462c42-b0a5-c5af-9637-ec426b4e991f,ORMvision,0,1.0
5,1b923b7c-b415-a83c-cbcc-53b27c877907,NanoInk,0,1.0
7,91f5ca18-e026-9482-8e4f-22593943e0f5,Plaxo,0,3.0
8,3d607a21-39bb-651e-c53f-9890ee334f18,Ygnition Networks,0,1.0


In [42]:
main_processing_df_v2['outcome'].value_counts()

0    15846
1     7190
Name: outcome, dtype: int64

# Assign Gender Diversity Features

In [11]:
founder_df = jobs_df[jobs_df['is_founder'] == 1]
founder_gb = founder_df.groupby('org_uuid')

In [12]:
people_df = pd.read_csv(R"d:/msc-project/data/pre-processed/people_preprocessed.csv",encoding='utf-8',
                         index_col=False)

In [13]:
people_df.columns

Index(['people_uuid', 'people_name', 'first_name', 'last_name', 'gender',
       'country_code', 'state_code', 'region', 'city', 'org_uuid',
       'featured_job_organization_name', 'featured_job_title', 'facebook_url',
       'linkedin_url', 'twitter_url', 'logo_url', 'description'],
      dtype='object')

In [14]:
founder_gender_df = pd.merge(founder_df[['people_uuid','org_uuid']],
                             people_df[['people_uuid','gender']],
                             on='people_uuid',
                             how='left')

In [15]:
founder_gender_df.head(3)

Unnamed: 0,people_uuid,org_uuid,gender
0,ed13cd36-fe2b-3707-197b-0c2d56e37a71,e1393508-30ea-8a36-3f96-dd3226033abd,male
1,a01b8d46-d311-3333-7c34-aa3ae9c03f22,df662812-7f97-0b43-9d3e-12f64f504fbb,male
2,084aaa07-0795-1fe8-9c46-98bbeb02cd64,df662812-7f97-0b43-9d3e-12f64f504fbb,male


In [16]:
founder_gender_gb = founder_gender_df.groupby('org_uuid')

In [17]:
is_founder_gender_diversity = list()

for org_uuid in list(main_processing_df_v2['org_uuid']):
    founders_gender = list(founder_gender_gb.get_group(org_uuid)['gender']) #create list of founder's gender for one organisation
    if 'male' in founders_gender and 'female' in founders_gender: #assign 1 if male and female in gender list
        is_founder_gender_diversity.append(1)
    else:
        is_founder_gender_diversity.append(0)

main_processing_df_v2['is_founder_gender_diversity'] = is_founder_gender_diversity

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [18]:
len(main_processing_df_v2)

23036

In [19]:
main_processing_df_v2.head(3)

Unnamed: 0,org_uuid,name,outcome,num_founder,is_founder_gender_diversity
1,0a88fa0e-7abe-c4ac-ab69-877b01a9cb60,IceFyre Semiconductor,0,1.0,0
2,de462c42-b0a5-c5af-9637-ec426b4e991f,ORMvision,0,1.0,0
5,1b923b7c-b415-a83c-cbcc-53b27c877907,NanoInk,0,1.0,0


# Assigning degree type, ivy league, and top100 to organisation based on founder

In [20]:
#import processed degree file
deg_df = pd.read_csv(R"d:/msc-project/data/pre-processed/degrees_preprocessed.csv",encoding='utf-8',
                         index_col=False)
deg_df.head(3)

Unnamed: 0,deg_uuid,deg_name,people_uuid,person_name,institution_uuid,institution_name,degree_type,subject,started_on,completed_on,is_completed,is_bachelor,is_master,is_phd,is_mba,is_stem,is_ivy_league,is_top_100
0,205fdfd1-ecac-aa43-262f-219f11755f67,MS Mass Communication @ Boston University,4897dba9-3141-ecc0-2c4b-c9d844e6440f,John Green,1eab62d2-15d9-0db7-930f-2aa77d4688e1,Boston University,MS,Mass Communication,,1992-01-01,True,0,1,0,0,1,0,1
1,1a2ac288-eb99-3318-fde5-1517bc168f51,"BA English, French @ Washington University in...",4897dba9-3141-ecc0-2c4b-c9d844e6440f,John Green,6ae9957a-8fb4-0ab1-73fa-dd547c4d3da4,Washington University in St. Louis,BA,"English, French",,1990-01-01,True,1,0,0,0,0,0,0
2,b978d338-7ccc-7469-5ce7-ef98c34155ad,MS Internet Technology @ University of Greenwich,7d187b77-94f7-e6cc-6981-d7468db5968f,Sridhar Gundaiah,b5ea73f6-12a3-576d-ae9b-f4169147f974,University of Greenwich,MS,Internet Technology,,2006-01-01,True,0,1,0,0,1,0,0


In [21]:
#merge degree with founder_df
deg_type_df = deg_df[['people_uuid','is_bachelor','is_master','is_phd','is_mba','is_stem','is_ivy_league','is_top_100']]
founder_degree_df = pd.merge(founder_df[['people_uuid','org_uuid']],
                             deg_type_df,
                             on='people_uuid',
                             how='left')
founder_degree_df.fillna(value=0,inplace=True)
len(founder_degree_df)

355966

In [22]:
founder_degree_df.head(3)

Unnamed: 0,people_uuid,org_uuid,is_bachelor,is_master,is_phd,is_mba,is_stem,is_ivy_league,is_top_100
0,ed13cd36-fe2b-3707-197b-0c2d56e37a71,e1393508-30ea-8a36-3f96-dd3226033abd,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,ed13cd36-fe2b-3707-197b-0c2d56e37a71,e1393508-30ea-8a36-3f96-dd3226033abd,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,a01b8d46-d311-3333-7c34-aa3ae9c03f22,df662812-7f97-0b43-9d3e-12f64f504fbb,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [23]:
#group by org_uuid and merge with main file
founder_degree_df = founder_degree_df[['org_uuid','is_bachelor','is_master','is_phd','is_mba','is_stem',
                                      'is_ivy_league','is_top_100']].groupby('org_uuid').max()

main_processing_df_v3 = pd.merge(main_processing_df_v2,
                                 founder_degree_df,
                                 on='org_uuid',
                                 how = 'left')

In [24]:
#assign 0 to founder with no degree information
main_processing_df_v3.isnull().sum(axis=0)

org_uuid                       0
name                           0
outcome                        0
num_founder                    0
is_founder_gender_diversity    0
is_bachelor                    0
is_master                      0
is_phd                         0
is_mba                         0
is_stem                        0
is_ivy_league                  0
is_top_100                     0
dtype: int64

In [25]:
main_processing_df_v3.head(10)

Unnamed: 0,org_uuid,name,outcome,num_founder,is_founder_gender_diversity,is_bachelor,is_master,is_phd,is_mba,is_stem,is_ivy_league,is_top_100
0,0a88fa0e-7abe-c4ac-ab69-877b01a9cb60,IceFyre Semiconductor,0,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,de462c42-b0a5-c5af-9637-ec426b4e991f,ORMvision,0,1.0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1b923b7c-b415-a83c-cbcc-53b27c877907,NanoInk,0,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,91f5ca18-e026-9482-8e4f-22593943e0f5,Plaxo,0,3.0,0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
4,3d607a21-39bb-651e-c53f-9890ee334f18,Ygnition Networks,0,1.0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
5,2a5fa1db-4075-04fb-d76c-372158a3a2c8,Friendster,0,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0f19649c-4fb9-cc37-d9b9-753d738c9da7,Reactrix,0,1.0,0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
7,7635f325-b7c2-8d0d-b428-df100ae21e8b,VaxInnate,0,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,229afe91-4a19-4f02-86a5-88a948c350a6,Sitoa,0,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,ec74826e-c72e-a898-d676-f710724dd73d,BitPass,0,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Assign number of advisor

In [26]:
jobs_df['job_type'].value_counts()

executive         967719
employee          395731
board_member      154586
advisor            57657
board_observer      5948
Name: job_type, dtype: int64

In [27]:
jobs_advisor_df = jobs_df[jobs_df['job_type'] == 'advisor']

In [28]:
jobs_advisor_df.head(3)

Unnamed: 0,job_uuid,job_name,people_uuid,person_name,org_uuid,org_name,started_on,ended_on,is_current,title,job_type,is_founder
247,b8902477-3f1d-4147-5bca-f053b3491b75,Brad Gerstner Investor / Advisor / Board Membe...,46daadf8-9ab9-1f94-24e2-f83ad6d4ad7f,Brad Gerstner,5c6f4d9e-dc72-e018-ee21-71cd23f698ad,Farecast,2006-01-01,2008-01-01,False,Investor / Advisor / Board Member,advisor,0
353,8986d170-77a9-a918-fe22-b8de983d1e42,Scott Rafer Advisor @ Dogster,eaadb11b-7423-0d5d-0a9d-7bf93eb64579,Scott Rafer,b683af1f-8dcf-8f82-37d9-88fbadd63e1b,Dogster,,,True,Advisor,advisor,0
444,70e011ab-472a-9f50-77fe-718af9ae9fe6,George Martin Advisor @ iLike,651ba47c-9aef-f811-a39d-d917677aded4,George Martin,3091d3e8-7934-1e90-7c82-7f1c3caf53cb,iLike,,,False,Advisor,advisor,0


In [29]:
advisor_gb = jobs_advisor_df.groupby('org_uuid')

In [30]:
num_advisor = advisor_gb.size()
num_advisor = pd.DataFrame(num_advisor,
                          columns=['num_advisor'])

In [31]:
type(num_advisor)

pandas.core.frame.DataFrame

In [32]:
main_processing_df_v4 = pd.merge(main_processing_df_v3,
                                 num_advisor,
                                 on='org_uuid',
                                 how='left')

In [33]:
main_processing_df_v4.head(3)

Unnamed: 0,org_uuid,name,outcome,num_founder,is_founder_gender_diversity,is_bachelor,is_master,is_phd,is_mba,is_stem,is_ivy_league,is_top_100,num_advisor
0,0a88fa0e-7abe-c4ac-ab69-877b01a9cb60,IceFyre Semiconductor,0,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,de462c42-b0a5-c5af-9637-ec426b4e991f,ORMvision,0,1.0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,
2,1b923b7c-b415-a83c-cbcc-53b27c877907,NanoInk,0,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [34]:
main_processing_df_v4['num_advisor'].fillna(0,inplace=True)

In [35]:
main_processing_df_v4.head(3)

Unnamed: 0,org_uuid,name,outcome,num_founder,is_founder_gender_diversity,is_bachelor,is_master,is_phd,is_mba,is_stem,is_ivy_league,is_top_100,num_advisor
0,0a88fa0e-7abe-c4ac-ab69-877b01a9cb60,IceFyre Semiconductor,0,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,de462c42-b0a5-c5af-9637-ec426b4e991f,ORMvision,0,1.0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1b923b7c-b415-a83c-cbcc-53b27c877907,NanoInk,0,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Board Composition

In [36]:
jobs_board_df = jobs_df[jobs_df['job_type'] == 'board_member']

In [37]:
jobs_board_df.head(3)

Unnamed: 0,job_uuid,job_name,people_uuid,person_name,org_uuid,org_name,started_on,ended_on,is_current,title,job_type,is_founder
24,e2202c1c-9614-e4ce-4a3c-74fc63cde40d,Alan Braverman Member of the Board of Director...,b9ef118c-9258-b287-3107-59c6f3625134,Alan Braverman,4111dc8b-c0df-2d24-ed33-30cd137b3098,Geni,,,True,Member of the Board of Directors,board_member,0
28,e920a71a-1963-6aab-746b-f4b6150d0a1f,George Zachary Board Member @ Geni,7ee87f9d-8afa-4232-c209-a79907bf89b0,George Zachary,4111dc8b-c0df-2d24-ed33-30cd137b3098,Geni,,,False,Board Member,board_member,0
29,a60a5b13-ad44-75f1-5257-4e6e61ac1839,Peter Thiel Member of Board of Directors @ Geni,3f47be49-2e32-8118-01a0-31685a4d0fd7,Peter Thiel,4111dc8b-c0df-2d24-ed33-30cd137b3098,Geni,,,True,Member of Board of Directors,board_member,0


In [38]:
jobs_board_df=jobs_board_df[['people_uuid','org_uuid']]

In [39]:
jobs_board_df.head(3)

Unnamed: 0,people_uuid,org_uuid
24,b9ef118c-9258-b287-3107-59c6f3625134,4111dc8b-c0df-2d24-ed33-30cd137b3098
28,7ee87f9d-8afa-4232-c209-a79907bf89b0,4111dc8b-c0df-2d24-ed33-30cd137b3098
29,3f47be49-2e32-8118-01a0-31685a4d0fd7,4111dc8b-c0df-2d24-ed33-30cd137b3098


In [40]:
investors_df = pd.read_csv(R'd:\msc-project\data\pre-processed\investors_preprocessed.csv',encoding='utf-8')

In [41]:
investors_df.head(3)

Unnamed: 0,investor_uuid,name,type,country_code,state_code,region,city,investor_types,investment_count,total_funding_usd,total_funding,total_funding_currency_code,founded_on,closed_on,facebook_url,linkedin_url,twitter_url,logo_url
0,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,organization,USA,CA,California,Pleasanton,,6.0,,,,1996-09-15,,http://www.facebook.com/zoho,http://www.linkedin.com/company/zoho-corporati...,http://twitter.com/zoho,https://res.cloudinary.com/crunchbase-producti...
1,f4d5ab44-058b-298b-ea81-380e6e9a8eec,Omidyar Network,organization,USA,CA,California,Redwood City,family_investment_office,313.0,,,,2004-01-01,,http://www.facebook.com/OmidyarNetwork,http://www.linkedin.com/company/22806,http://twitter.com/OmidyarNetwork,https://res.cloudinary.com/crunchbase-producti...
2,df662812-7f97-0b43-9d3e-12f64f504fbb,Facebook,organization,USA,CA,California,Menlo Park,,32.0,16122820000.0,16122820000.0,USD,2004-02-04,,https://www.facebook.com/facebook/,http://www.linkedin.com/company/facebook,https://twitter.com/facebook,https://res.cloudinary.com/crunchbase-producti...
