In [1]:
#Import basic packages
import os
import numpy as np
import pandas as pd
import csv
import regex as re
import datetime as dt 

In [2]:
# Load the outcome file (keyed by org_uuid), parsing the funding-stage date
# columns as datetimes, then keep only the identifier, name and outcome label
# as the base of the working feature table.
outcome_date_cols = ['founded_on', 'seed_date', 'series_a_date', 'series_b_date', 'series_c_date']
main_df = pd.read_csv(
    R"d:/msc-project/data/final/outcome_final.csv",
    encoding='utf-8',
    index_col=False,
    parse_dates=outcome_date_cols,
)
main_processing_df = main_df.loc[:, ['org_uuid', 'name', 'outcome']]

In [3]:
# Preview the first three rows of the working table (org_uuid, name, outcome).
main_processing_df.head(3)

Unnamed: 0,org_uuid,name,outcome
0,000014da-0c46-b9cb-0941-3a93c027b119,Resilio,0
1,00002470-bff7-6226-5800-0ca1b3787b6f,Codementor,0
2,000095de-8e2b-82f1-32a7-c222ba3d5682,Ultraprise Loan Technologies,0


In [4]:
# Row count of the working table before any founder-based filtering.
len(main_processing_df)

142245

In [5]:
# Load the pre-processed jobs file; the start/end dates of each job are parsed
# as datetimes. The cell ends with a three-row preview of the loaded frame.
job_date_cols = ['started_on', 'ended_on']
jobs_df = pd.read_csv(
    R"d:/msc-project/data/pre-processed/jobs_preprocessed.csv",
    encoding='utf-8',
    index_col=False,
    parse_dates=job_date_cols,
)
jobs_df.head(3)

Unnamed: 0,job_uuid,job_name,people_uuid,person_name,org_uuid,org_name,started_on,ended_on,is_current,title,job_type,is_founder
0,697b6934-fc1f-9d63-cfb2-1a10759b378e,Ben Elowitz Co-Founder/CEO/Board of Directors ...,ed13cd36-fe2b-3707-197b-0c2d56e37a71,Ben Elowitz,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,,,False,Co-Founder/CEO/Board of Directors,executive,1
1,b1de3765-442e-b556-9304-551c2a055901,Kevin Flaherty VP Marketing @ Wetpaint,5ceca97b-493c-1446-6249-5aaa33464763,Kevin Flaherty,e1393508-30ea-8a36-3f96-dd3226033abd,Wetpaint,,,False,VP Marketing,executive,0
2,1319cd30-f5e8-c700-0af6-64029c6f7124,Raju Vegesna Chief Evangelist @ Zoho,9f99a98a-aa97-b30b-0d36-db67c1d277e0,Raju Vegesna,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,2000-11-01,,True,Chief Evangelist,employee,0


In [6]:
# Sum the is_founder flag per organisation and attach it to the working table
# as 'num_founder'. The left join keeps every organisation; those without any
# job rows end up with NaN in the new column.
# NOTE(review): the sum runs over job rows, so a person with several
# founder-flagged rows at the same organisation is counted once per row --
# confirm the jobs file is de-duplicated per (person, organisation).
num_founder = jobs_df.groupby('org_uuid')['is_founder'].sum().to_frame()
main_processing_df = main_processing_df.merge(num_founder, on='org_uuid', how='left')
main_processing_df = main_processing_df.rename(columns={'is_founder': 'num_founder'})
main_processing_df.head(3)

Unnamed: 0,org_uuid,name,outcome,num_founder
0,000014da-0c46-b9cb-0941-3a93c027b119,Resilio,0,4.0
1,00002470-bff7-6226-5800-0ca1b3787b6f,Codementor,0,2.0
2,000095de-8e2b-82f1-32a7-c222ba3d5682,Ultraprise Loan Technologies,0,


In [7]:
# Drop companies with no founder info.
# A single mask is enough: NaN > 0 evaluates to False, so organisations with a
# missing founder count are removed together with those whose count is zero.
# .copy() makes the filtered table an independent frame, so adding columns to
# it in later cells does not trigger pandas' SettingWithCopyWarning.
has_founder = main_processing_df['num_founder'] > 0
main_processing_df = main_processing_df[has_founder].copy()

In [8]:
# Row count after removing companies without founder information.
len(main_processing_df)

78693

In [9]:
# Keep only the job rows flagged as founder roles, then group those rows by
# organisation so a single company's founders can be looked up directly.
founder_mask = jobs_df['is_founder'].eq(1)
founder_df = jobs_df[founder_mask]
founder_gb = founder_df.groupby(by='org_uuid')

In [10]:
# Spot check: list the people_uuid of every founder row for one organisation.
founder_gb['people_uuid'].get_group('000014da-0c46-b9cb-0941-3a93c027b119')

981847    18134829-3fc2-d700-a7f4-2dc95e8882a6
981850    a20b6e51-5f9b-4cb3-9942-d65f8acdcec5
981854    c7ca9967-869d-7775-37c4-a2d5c1569c46
981861    a9606b0f-fbb4-cd04-9c7d-663fd362ec5e
Name: people_uuid, dtype: object

In [11]:
# Load the pre-processed people file (one of its columns is 'gender', which is
# joined onto the founder rows further down).
people_df = pd.read_csv(
    R"d:/msc-project/data/pre-processed/people_preprocessed.csv",
    encoding='utf-8',
    index_col=False,
)

In [12]:
# List the columns available in the people table.
people_df.columns

Index(['people_uuid', 'people_name', 'first_name', 'last_name', 'gender',
       'country_code', 'state_code', 'region', 'city', 'org_uuid',
       'featured_job_organization_name', 'featured_job_title', 'facebook_url',
       'linkedin_url', 'twitter_url', 'logo_url', 'description'],
      dtype='object')

In [13]:
# Attach each founder's gender to their (person, organisation) pair. The left
# join keeps every founder row even when the person has no match in people_df,
# in which case gender is NaN.
founder_org_pairs = founder_df[['people_uuid', 'org_uuid']]
person_gender = people_df[['people_uuid', 'gender']]
founder_gender_df = founder_org_pairs.merge(person_gender, on='people_uuid', how='left')

In [14]:
# Group the founder/gender rows by organisation for per-company lookups.
founder_gender_gb = founder_gender_df.groupby(by='org_uuid')

In [15]:
# Flag organisations whose founders include at least one 'male' AND at least
# one 'female' entry (1 = mixed-gender founding team, 0 = otherwise).
#
# This is computed with vectorised membership tests instead of one
# groupby.get_group() call per organisation: it avoids tens of thousands of
# Python-level lookups and yields 0 (rather than raising KeyError) for an
# organisation that has no rows in founder_gender_df.
founder_genders = founder_gender_df['gender']
orgs_with_male = founder_gender_df.loc[founder_genders == 'male', 'org_uuid'].unique()
orgs_with_female = founder_gender_df.loc[founder_genders == 'female', 'org_uuid'].unique()

org_ids = main_processing_df['org_uuid']
is_mixed = org_ids.isin(orgs_with_male) & org_ids.isin(orgs_with_female)

# Keep gender_diversity as a plain list of 0/1 ints, in the row order of
# main_processing_df, exactly as the loop-based version produced it.
gender_diversity = is_mixed.astype(int).tolist()

main_processing_df['gender_diversity'] = gender_diversity

In [16]:
# Sanity check: (rows, columns) after adding the gender_diversity column.
main_processing_df.shape

(78693, 5)

In [17]:
# Preview the first three rows, now including num_founder and gender_diversity.
main_processing_df.head(3)

Unnamed: 0,org_uuid,name,outcome,num_founder,gender_diversity
0,000014da-0c46-b9cb-0941-3a93c027b119,Resilio,0,4.0,1
1,00002470-bff7-6226-5800-0ca1b3787b6f,Codementor,0,2.0,0
3,0000d497-c93a-eea3-eeb0-a943dfb4f71e,AutoOffer,0,1.0,0


In [19]:
# Display the people table (pandas truncates the rendering to the first and
# last rows). The previous expression, `people_df.cl`, was an incomplete
# attribute access that raises AttributeError when the notebook is re-run.
people_df

Unnamed: 0,people_uuid,people_name,first_name,last_name,gender,country_code,state_code,region,city,org_uuid,featured_job_organization_name,featured_job_title,facebook_url,linkedin_url,twitter_url,logo_url,description
0,ed13cd36-fe2b-3707-197b-0c2d56e37a71,Ben Elowitz,Ben,Elowitz,male,USA,WA,Washington,Seattle,cf253887-5eac-21a2-28d3-47db7311f7e9,Madrona Venture Group,Managing Director,http://www.facebook.com/elowitz,https://www.linkedin.com/in/elowitz/,http://twitter.com/elowitz,https://res.cloudinary.com/crunchbase-producti...,Ben Elowitz is co-founder and CEO of [Wetpaint...
1,5ceca97b-493c-1446-6249-5aaa33464763,Kevin Flaherty,Kevin,Flaherty,male,,,,,789e5e4d-0c90-d06e-92a0-b800b461c3da,DRSmedia,Team Member,,http://www.linkedin.com/in/kevinflaherty,https://twitter.com/tallkp,https://res.cloudinary.com/crunchbase-producti...,"Brand development, creative agency management,..."
2,9f99a98a-aa97-b30b-0d36-db67c1d277e0,Raju Vegesna,Raju,Vegesna,male,USA,CA,California,San Francisco,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,Chief Evangelist,,https://www.linkedin.com/pub/raju-vegesna/1/65...,,https://res.cloudinary.com/crunchbase-producti...,Raju is an evangelist for Zoho and is one of t...
3,6e1bca72-a865-b518-b305-31214ce2d1b0,Ian Wenig,Ian,Wenig,male,,,,,bf4d7b0e-b34d-2fd8-d292-6049c4f7efc7,Zoho,VP Business Development,,http://www.linkedin.com/profile/view?id=1633445,,https://res.cloudinary.com/crunchbase-producti...,Ian Wenig has more then twenty years experienc...
4,80d25c23-9726-9dda-5852-39cdf4810ea5,Ron Gorodetzky,Ron,Gorodetzky,male,,,,,aa3bf156-06af-5b6e-215d-9e7211fc173b,fflick,Co-Founder and CTO,,https://twitter.com/ronwinbeta,,https://res.cloudinary.com/crunchbase-producti...,In November 2004 Ron was asked to help adminis...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
673347,7305a901-2b9e-4c39-a719-d46b958426cd,Fernando Bueno,Fernando,Bueno,male,BRA,,Sao Paulo,São Paulo,ef08454e-b5ee-421f-882b-f2bbaacccd4b,Huddle Brasil,Co-founder & CEO,,https://www.linkedin.com/in/fernando-bueno/,,,Fernando Bueno is the Co-founder and CEO at Hu...
673348,1ea4db7f-d933-4e6b-8634-6dee757f78c6,Alexandre Pedretti,Alexandre,Pedretti,male,BRA,,Sao Paulo,São Paulo,ef08454e-b5ee-421f-882b-f2bbaacccd4b,Huddle Brasil,COO,,https://www.linkedin.com/in/alexandre-pedretti...,,https://res.cloudinary.com/crunchbase-producti...,Alexandre Pedretti is the COO at Huddle Brasil.
673349,ba51b295-b5c7-4df7-92c2-8df26fabfdc8,Krzysztof Głodowski,Krzysztof,Głodowski,male,POL,,Mazowieckie,Warsaw,9bd364e6-900b-4447-9378-426671da9880,Jet Toast,Co-Founder,,https://www.linkedin.com/in/krzysztof-g%C5%82o...,,,Krzysztof Głodowski is a Co-Founder at Jet Toast.
673350,9a63608e-99be-4146-a310-d14ab1a14d45,Tobiasz Siemiński,Tobiasz,Siemiński,male,POL,,Mazowieckie,Warsaw,9bd364e6-900b-4447-9378-426671da9880,Jet Toast,Co-Founder and Developer,,https://www.linkedin.com/in/tobiasz-siemi%C5%8...,,https://res.cloudinary.com/crunchbase-producti...,Tobiasz Siemiński is a Co-Founder and Develope...
