In [1]:
#Import basic packages
import os
import numpy as np
import pandas as pd
import csv
import regex as re
import datetime as dt 

In [2]:
# Load the per-organisation outcome file (keyed by org_uuid).
# The founding date and the funding-round date columns are parsed as datetimes.
main_df = pd.read_csv(
    R"d:/msc-project/data/final/outcome_final.csv",
    encoding='utf-8',
    index_col=False,
    parse_dates=[
        'founded_on',
        'seed_date',
        'series_a_date',
        'series_b_date',
        'series_c_date',
    ],
)

# Working frame: only the identifier, the organisation name and the outcome label.
main_processing_df = main_df[['org_uuid', 'name', 'outcome']]

In [3]:
# Preview the first three rows of the working frame.
main_processing_df.head(n=3)

Unnamed: 0,org_uuid,name,outcome
0,000014da-0c46-b9cb-0941-3a93c027b119,Resilio,0
1,00002470-bff7-6226-5800-0ca1b3787b6f,Codementor,0
2,000095de-8e2b-82f1-32a7-c222ba3d5682,Ultraprise Loan Technologies,0


In [4]:
# Number of organisations in the working frame.
main_processing_df.shape[0]

142245

In [5]:
# Load the pre-processed jobs file; job start/end dates are parsed as datetimes.
jobs_df = pd.read_csv(
    R"d:/msc-project/data/pre-processed/jobs_preprocessed.csv",
    encoding='utf-8',
    index_col=False,
    parse_dates=['started_on', 'ended_on'],
)
jobs_df.head(n=3)

Unnamed: 0,job_uuid,job_name,people_uuid,person_name,org_uuid,org_name,started_on,ended_on,is_current,title,job_type,is_founder
0,697b6934-fc1f-9d63-cfb2-1a10759b378e,Ben Elowitz Co-Founder/CEO/Board of Directors ...,ed13cd36-fe2b-3707-197b-0c2d56e37a71,Ben Elowitz,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,,,False,Co-Founder/CEO/Board of Directors,executive,1
1,b1de3765-442e-b556-9304-551c2a055901,Kevin Flaherty VP Marketing @ Wetpaint,5ceca97b-493c-1446-6249-5aaa33464763,Kevin Flaherty,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,,,False,VP Marketing,executive,0
2,1319cd30-f5e8-c700-0af6-64029c6f7124,Raju Vegesna Chief Evangelist @ Zoho,9f99a98a-aa97-b30b-0d36-db67c1d277e0,Raju Vegesna,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,2000-11-01,,True,Chief Evangelist,employee,0


# Assign the number of founders to each organisation

In [6]:
# Count founders per organisation as the number of DISTINCT people that hold a
# founder-flagged job there. Summing `is_founder` over job rows would count a person
# once per founder-flagged row, so someone with several such rows at the same
# organisation would inflate the count.
# Non-founder rows are masked to NaN (ignored by nunique), so organisations that
# appear in the jobs table without any founder still get an explicit 0.
founder_people = jobs_df['people_uuid'].where(jobs_df['is_founder'] == 1)
num_founder = (
    founder_people
    .groupby(jobs_df['org_uuid'])
    .nunique()
    .rename('num_founder')
    .to_frame()
)

# Left join keeps every organisation; those absent from the jobs table get NaN.
# Dropping a pre-existing `num_founder` column first keeps the cell re-runnable
# without producing duplicate or suffixed columns.
main_processing_df = pd.merge(
    main_processing_df.drop(columns='num_founder', errors='ignore'),
    num_founder,
    on='org_uuid',
    how='left',
)
main_processing_df.head(3)

Unnamed: 0,org_uuid,name,outcome,num_founder
0,000014da-0c46-b9cb-0941-3a93c027b119,Resilio,0,4.0
1,00002470-bff7-6226-5800-0ca1b3787b6f,Codementor,0,2.0
2,000095de-8e2b-82f1-32a7-c222ba3d5682,Ultraprise Loan Technologies,0,


In [7]:
# Drop companies with no founder info.
# A missing `num_founder` (NaN) compares False against `> 0`, so a single mask removes
# both the NaN rows and the rows with a zero founder count.
# `.copy()` makes the result an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning; building a new frame instead of mutating in place
# also keeps the cell safe to re-run.
main_processing_df = main_processing_df.loc[main_processing_df['num_founder'] > 0].copy()

In [8]:
# Number of organisations left after removing those without founder info.
main_processing_df.shape[0]

78693

In [9]:
# Preview the first five remaining organisations.
main_processing_df.head(n=5)

Unnamed: 0,org_uuid,name,outcome,num_founder
0,000014da-0c46-b9cb-0941-3a93c027b119,Resilio,0,4.0
1,00002470-bff7-6226-5800-0ca1b3787b6f,Codementor,0,2.0
3,0000d497-c93a-eea3-eeb0-a943dfb4f71e,AutoOffer,0,1.0
5,0001eae7-077d-4d0b-a717-f67bcf2a09fa,Workspace Property Trust,0,2.0
6,0002aa63-f21b-4c54-9495-895a1d09e0d4,86 Repairs,0,2.0


# Assign Gender Diversity Features

In [10]:
# Keep only the job rows flagged as founder roles ...
founder_df = jobs_df.loc[jobs_df['is_founder'].eq(1)]
# ... and group those rows by organisation.
founder_gb = founder_df.groupby(by='org_uuid')

In [11]:
# Load the pre-processed people file.
people_df = pd.read_csv(
    R"d:/msc-project/data/pre-processed/people_preprocessed.csv",
    encoding='utf-8',
    index_col=False,
)

In [12]:
# List the columns available in the people table.
people_df.columns

Index(['people_uuid', 'people_name', 'first_name', 'last_name', 'gender',
       'country_code', 'state_code', 'region', 'city', 'org_uuid',
       'featured_job_organization_name', 'featured_job_title', 'facebook_url',
       'linkedin_url', 'twitter_url', 'logo_url', 'description'],
      dtype='object')

In [13]:
# Attach the recorded gender to every founder row.
# The left join keeps founders that have no matching row in the people table
# (their gender is then NaN).
founder_gender_df = founder_df[['people_uuid', 'org_uuid']].merge(
    people_df[['people_uuid', 'gender']],
    how='left',
    on='people_uuid',
)

In [14]:
# Group the founder/gender rows by organisation.
founder_gender_gb = founder_gender_df.groupby(by='org_uuid')

In [15]:
# Gender diversity flag: 1 when an organisation's founders include at least one
# 'male' AND at least one 'female' entry, otherwise 0.
#
# Computed with vectorised group operations instead of one `get_group` call per
# organisation inside a Python loop. Organisations without any founder row get 0
# rather than raising KeyError.
org_has_male = founder_gender_df['gender'].eq('male').groupby(founder_gender_df['org_uuid']).any()
org_has_female = founder_gender_df['gender'].eq('female').groupby(founder_gender_df['org_uuid']).any()
mixed_gender_by_org = (org_has_male & org_has_female).astype(int)

# Look up the flag for every organisation, in the row order of the working frame.
gender_diversity = (
    main_processing_df['org_uuid']
    .map(mixed_gender_by_org)
    .fillna(0)          # organisation not present among the founder rows
    .astype(int)
    .tolist()
)

# `assign` returns a new frame, so no column is written into a filtered slice.
main_processing_df = main_processing_df.assign(gender_diversity=gender_diversity)

In [16]:
# Check that the row count is unchanged and that one column was added.
main_processing_df.shape

(78693, 5)

In [17]:
# Preview the working frame with the new gender_diversity column.
main_processing_df.head(n=3)

Unnamed: 0,org_uuid,name,outcome,num_founder,gender_diversity
0,000014da-0c46-b9cb-0941-3a93c027b119,Resilio,0,4.0,1
1,00002470-bff7-6226-5800-0ca1b3787b6f,Codementor,0,2.0,0
3,0000d497-c93a-eea3-eeb0-a943dfb4f71e,AutoOffer,0,1.0,0


# Assign degree-type, Ivy League, and top-100 flags to each organisation based on its founders

In [18]:
# Load the pre-processed degrees file.
deg_df = pd.read_csv(
    R"d:/msc-project/data/pre-processed/degrees_preprocessed.csv",
    encoding='utf-8',
    index_col=False,
)
deg_df.head(n=3)

Unnamed: 0,deg_uuid,deg_name,people_uuid,person_name,institution_uuid,institution_name,degree_type,subject,started_on,completed_on,is_completed,is_bachelor,is_master,is_phd,is_mba,is_stem,is_ivy_league,is_top_100
0,205fdfd1-ecac-aa43-262f-219f11755f67,MS Mass Communication @ Boston University,4897dba9-3141-ecc0-2c4b-c9d844e6440f,John Green,1eab62d2-15d9-0db7-930f-2aa77d4688e1,Boston University,MS,Mass Communication,,1992-01-01,True,0,1,0,0,1,0,1
1,1a2ac288-eb99-3318-fde5-1517bc168f51,"BA English, French @ Washington University in...",4897dba9-3141-ecc0-2c4b-c9d844e6440f,John Green,6ae9957a-8fb4-0ab1-73fa-dd547c4d3da4,Washington University in St. Louis,BA,"English, French",,1990-01-01,True,1,0,0,0,0,0,0
2,b978d338-7ccc-7469-5ce7-ef98c34155ad,MS Internet Technology @ University of Greenwich,7d187b77-94f7-e6cc-6981-d7468db5968f,Sridhar Gundaiah,b5ea73f6-12a3-576d-ae9b-f4169147f974,University of Greenwich,MS,Internet Technology,,2006-01-01,True,0,1,0,0,1,0,0


In [19]:
# Merge the degree flags onto the founder rows (one output row per founder job x degree).
degree_flag_cols = ['is_bachelor', 'is_master', 'is_phd', 'is_mba',
                    'is_stem', 'is_ivy_league', 'is_top_100']
deg_type_df = deg_df[['people_uuid'] + degree_flag_cols]
founder_degree_df = pd.merge(founder_df[['people_uuid', 'org_uuid']],
                             deg_type_df,
                             on='people_uuid',
                             how='left')

# Founders without any degree record come out of the left join with NaN flags; treat
# them as 0. Only the flag columns are filled, so a missing identifier is never
# overwritten with 0.
founder_degree_df[degree_flag_cols] = founder_degree_df[degree_flag_cols].fillna(0)
len(founder_degree_df)

355966

In [20]:
# Preview the founder x degree rows.
founder_degree_df.head(n=5)

Unnamed: 0,people_uuid,org_uuid,is_bachelor,is_master,is_phd,is_mba,is_stem,is_ivy_league,is_top_100
0,ed13cd36-fe2b-3707-197b-0c2d56e37a71,e1393508-30ea-8a36-3f96-dd3226033abd,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,ed13cd36-fe2b-3707-197b-0c2d56e37a71,e1393508-30ea-8a36-3f96-dd3226033abd,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,a01b8d46-d311-3333-7c34-aa3ae9c03f22,df662812-7f97-0b43-9d3e-12f64f504fbb,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,084aaa07-0795-1fe8-9c46-98bbeb02cd64,df662812-7f97-0b43-9d3e-12f64f504fbb,1.0,0.0,0.0,0.0,0.0,1.0,1.0
4,5ac8203a-540a-ab6c-46ee-84463834fe72,df662812-7f97-0b43-9d3e-12f64f504fbb,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Group by org_uuid and merge with the main file.
# Taking the max of each flag marks an organisation with 1 when at least one of its
# founder degree rows has that flag set.
degree_flags = ['is_bachelor', 'is_master', 'is_phd', 'is_mba', 'is_stem',
                'is_ivy_league', 'is_top_100']

# Grouping by the key name works whether `org_uuid` is still a column (first run) or
# already the index (re-run), so executing this cell twice does not raise KeyError.
founder_degree_df = founder_degree_df.groupby('org_uuid')[degree_flags].max()

# Drop flag columns left over from an earlier run so the merge never produces
# suffixed duplicates (is_bachelor_x / is_bachelor_y).
main_processing_df = pd.merge(main_processing_df.drop(columns=degree_flags, errors='ignore'),
                              founder_degree_df,
                              on='org_uuid',
                              how='left')

In [22]:
# Count missing values per column after the merge (this cell only inspects; it does not fill anything).
main_processing_df.isna().sum()

org_uuid            0
name                0
outcome             0
num_founder         0
gender_diversity    0
is_bachelor         0
is_master           0
is_phd              0
is_mba              0
is_stem             0
is_ivy_league       0
is_top_100          0
dtype: int64

In [23]:
# Preview the working frame with the founder degree flags attached.
main_processing_df.head(n=10)

Unnamed: 0,org_uuid,name,outcome,num_founder,gender_diversity,is_bachelor,is_master,is_phd,is_mba,is_stem,is_ivy_league,is_top_100
0,000014da-0c46-b9cb-0941-3a93c027b119,Resilio,0,4.0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,00002470-bff7-6226-5800-0ca1b3787b6f,Codementor,0,2.0,0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
2,0000d497-c93a-eea3-eeb0-a943dfb4f71e,AutoOffer,0,1.0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0001eae7-077d-4d0b-a717-f67bcf2a09fa,Workspace Property Trust,0,2.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0002aa63-f21b-4c54-9495-895a1d09e0d4,86 Repairs,0,2.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,00040dc9-f822-267c-9cf8-1c9b92ca0588,SmartRx,0,2.0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
6,00053977-b378-94a9-3735-d364a7e8d54d,myWebRoom,0,1.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,000607fc-cea0-535c-6324-e83ba07c8cc7,BioVigilant Systems,1,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0008f4ac-be62-b6e6-0122-f0c2da7fd1eb,AgShift,0,1.0,0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
9,000abddd-ed8c-479a-be76-149996d277a3,Black Brane Systems,0,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
