In [7]:
'''
투자자 분석
1. 초기 투자를 잘 하는 작은 규모의 투자자를 찾아보자
2. Top10, 25 들이랑 같이 들어가는 투자자들을 찾아보자
'''

'\n\xed\x88\xac\xec\x9e\x90\xec\x9e\x90 \xeb\xb6\x84\xec\x84\x9d\n1. \xec\xb4\x88\xea\xb8\xb0 \xed\x88\xac\xec\x9e\x90\xeb\xa5\xbc \xec\x9e\x98 \xed\x95\x98\xeb\x8a\x94 \xec\x9e\x91\xec\x9d\x80 \xea\xb7\x9c\xeb\xaa\xa8\xec\x9d\x98 \xed\x88\xac\xec\x9e\x90\xec\x9e\x90\xeb\xa5\xbc \xec\xb0\xbe\xec\x95\x84\xeb\xb3\xb4\xec\x9e\x90\n2. Top10, 25 \xeb\x93\xa4\xec\x9d\xb4\xeb\x9e\x91 \xea\xb0\x99\xec\x9d\xb4 \xeb\x93\xa4\xec\x96\xb4\xea\xb0\x80\xeb\x8a\x94 \xed\x88\xac\xec\x9e\x90\xec\x9e\x90\xeb\x93\xa4\xec\x9d\x84 \xec\xb0\xbe\xec\x95\x84\xeb\xb3\xb4\xec\x9e\x90\n'

In [8]:
import pandas as pd
import re
import gensim
import pickle
import numpy as np
import operator
from datetime import datetime
from matplotlib import pyplot as plt
import networkx as nx
from copy import copy
%matplotlib inline

In [9]:
# Load the exported CSV tables used by the rest of the notebook.
data_folder = '../data/csv_export/'
util_folder = '../util/'
# The two funding-date columns are forced to str instead of relying on dtype inference.
df_organizations = pd.read_csv(data_folder + 'organizations.csv',
                               dtype={'first_funding_on': str, 'last_funding_on': str})
df_description = pd.read_csv(data_folder + 'organization_descriptions.csv')
df_funding_rounds = pd.read_csv(data_folder + 'funding_rounds.csv')
df_funds = pd.read_csv(data_folder + 'funds.csv')
df_investments = pd.read_csv(data_folder + 'investments.csv')
df_acq = pd.read_csv(data_folder + 'acquisitions.csv')
df_people = pd.read_csv(data_folder + 'people.csv')
df_degrees = pd.read_csv(data_folder + 'degrees.csv')
df_investors = pd.read_csv(data_folder + 'investors.csv')

In [10]:
'''
1. 초기 투자를 잘 하는 투자자를 찾아보자
'''

# 잘된 회사를 찾기
def get_investor_scores(criteria = ['B', 'C', 'acquisition', 'ipo'], founded_after_this = '2007-01-01', asia_only = False):
    """Count every investor's seed investments and how many went to "good" companies.

    A company is "good" when it was founded on or after ``founded_after_this``
    and satisfies at least one entry of ``criteria``.

    Args:
        criteria: Success markers. ``'acquisition'`` selects companies acquired
            with ``price_usd`` above 1,000,000 and ``'ipo'`` selects companies
            listed in ``ipos.csv``; every other entry is matched against
            ``funding_round_code`` (e.g. ``'A'``, ``'B'``, ``'C'``). The
            argument is only read, never mutated.
        founded_after_this: Earliest founding date (inclusive), given as a
            date string such as ``'2007-01-01'``.
        asia_only: When true, keep only companies whose ``country_code`` is
            one of the codes selected from ``df_country_code``.

    Returns:
        A pair ``(investor_score_good, investor_score_all)`` of dicts keyed by
        investor uuid. ``investor_score_all`` counts the investor's rows in
        seed rounds of the selected companies; ``investor_score_good`` counts
        only the rows in seed rounds of good companies.

    Side effects:
        Prints progress statistics and reads ``acquisitions.csv`` / ``ipos.csv``
        from ``data_folder``. The global ``df_organizations`` is not modified.
    """
    # Parse the founding dates into a local series; unparseable values become
    # NaT and therefore never pass the date filter.
    founded_on = pd.to_datetime(df_organizations['founded_on'], errors='coerce')
    companies = df_organizations[founded_on >= pd.Timestamp(founded_after_this)]

    # Optionally restrict to Asian companies.
    if asia_only:
        print('before {}'.format(len(companies)))
        # NOTE(review): df_country_code is not defined in the visible part of
        # this file; presumably a country table with a continent code column
        # 'CC' and an alpha-3 column 'a-3' -- confirm it is loaded beforehand.
        asia = df_country_code[df_country_code['CC'] == 'AS']['a-3'].values
        companies = companies[companies['country_code'].isin(asia)]
        print('after {}'.format(len(companies)))
    companies = companies['uuid']
    print('There are {} companies founded after {}'.format(len(companies), founded_after_this))

    # Companies that reached one of the requested funding round codes.
    # Everything in `criteria` that is not an exit marker is a round code, so
    # the printed list always matches the filter that is actually applied.
    funding_criteria = [c for c in criteria if c not in ('acquisition', 'ipo')]
    fundings = df_funding_rounds[df_funding_rounds['company_uuid'].isin(companies)]
    reached_round = fundings[fundings['funding_round_code'].isin(funding_criteria)]
    good_companies = set(reached_round['company_uuid'])
    print('{} companies with fundings {}'.format(len(good_companies), funding_criteria))

    # Companies acquired for more than 1M.
    if 'acquisition' in criteria:
        acquisitions = pd.read_csv(data_folder + 'acquisitions.csv')
        acquisitions = acquisitions[acquisitions['price_usd'] > 1000000]
        acquisitions = acquisitions[acquisitions['acquiree_uuid'].isin(companies)]
        print('{} acquired for 1M+'.format(acquisitions.shape[0]))
        good_companies.update(acquisitions['acquiree_uuid'])

    # Companies that went public. This is checked independently of
    # 'acquisition' so either criterion can be used without the other.
    if 'ipo' in criteria:
        ipos = pd.read_csv(data_folder + 'ipos.csv')
        ipos = ipos[ipos['company_uuid'].isin(companies)]
        print('{} IPOs'.format(ipos.shape[0]))
        good_companies.update(ipos['company_uuid'])

    print('In total, {} good companies'.format(len(good_companies)))

    # Seed rounds of all selected companies, and of the good ones only.
    # Sets give O(1) membership tests in the counting loop below.
    seed_rounds = fundings[fundings['funding_round_type'] == 'seed']
    all_seed = set(seed_rounds['funding_round_uuid'])
    good_seed_rounds = seed_rounds[seed_rounds['company_uuid'].isin(list(good_companies))]
    good_seed = set(good_seed_rounds['funding_round_uuid'])

    print('# good companies: {} with {}\n# good / all seed investments: {} / {}'.format(len(good_companies), criteria, len(good_seed), len(all_seed)))

    # Tally, per investor, the investment rows that fall in each set of rounds.
    investor_score_good = {}
    investor_score_all = {}
    round_and_investor = zip(df_investments['funding_round_uuid'], df_investments['investor_uuid'])
    for funding_round_uuid, investor_uuid in round_and_investor:
        if funding_round_uuid in all_seed:
            investor_score_all[investor_uuid] = investor_score_all.get(investor_uuid, 0) + 1
        if funding_round_uuid in good_seed:
            investor_score_good[investor_uuid] = investor_score_good.get(investor_uuid, 0) + 1
    return investor_score_good, investor_score_all

In [None]:
'''
초기 투자 잘 하는 투자자들 중 어떤 규모의 투자자들을 보고 싶은가?
'''

def check(investor_score_good, investor_score_all, MIN_SEED = 10):
    """Print the 20 investors with the highest share of good seed investments.

    Args:
        investor_score_good: Dict mapping investor uuid to the number of seed
            investments that went to good companies. Investors without any
            such investment may be absent.
        investor_score_all: Dict mapping investor uuid to the total number of
            seed investments.
        MIN_SEED: Minimum total number of seed investments an investor needs
            in order to be ranked.

    Returns:
        None. The ranking is printed, one investor per line, as
        ``ratio=good/all<TAB>name``.
    """
    # Ratio of good to all seed investments for every sufficiently active investor.
    investor_score = {}
    for investor, all_count in investor_score_all.items():
        if all_count >= MIN_SEED:
            investor_score[investor] = investor_score_good.get(investor, 0) / float(all_count)

    # Highest ratio first.
    ranked = sorted(investor_score.items(), key=operator.itemgetter(1))
    ranked.reverse()
    print('{} investors with minimum of {} seed investment'.format(len(ranked), MIN_SEED))
    for investor, score in ranked[:20]:
        # The name lookup stays inside the loop so nothing is touched when no
        # investor qualifies; fall back to the uuid if the investor is unknown.
        names = df_investors[df_investors['uuid'] == investor]['investor_name'].values
        name = names[0] if len(names) else investor
        # .get: a ranked investor may have no good investment at all.
        good_count = investor_score_good.get(investor, 0)
        print('{:.3f}={}/{}\t{}'.format(score, good_count, investor_score_all[investor], name))

In [None]:
# Score investors over the same cohort twice: once worldwide, once for Asia only.
criteria = ['B', 'C', 'acquisition', 'ipo']
founded_after_this = '2007-01-01'
global_investor_score_good, global_investor_score_all = get_investor_scores(
    criteria, founded_after_this, asia_only=False)
asia_investor_score_good, asia_investor_score_all = get_investor_scores(
    criteria, founded_after_this, asia_only=True)

There are 228472 companies founded after 2007-01-01
5558 companies with fundings ['B', 'C']
1301 acquired for 1M+
1423 IPOs
In total, 7801 good companies
# good companies: 7801 with ['B', 'C', 'acquisition', 'ipo']
# good / all seed investments: 2353 / 43526


In [None]:
# Global investors with at least 5 seed investments.
check(global_investor_score_good, global_investor_score_all, 5)

In [None]:
# Global investors with at least 10 seed investments.
check(global_investor_score_good, global_investor_score_all, 10)

In [None]:
# Asian investors with at least 3 seed investments.
check(asia_investor_score_good, asia_investor_score_all, 3)

In [None]:
# Asian investors with at least 5 seed investments.
check(asia_investor_score_good, asia_investor_score_all, 5)

In [None]:
# Asian investors with at least 10 seed investments.
check(asia_investor_score_good, asia_investor_score_all, 10)

In [None]:
'''
2. 유명한 투자자들과 인맥이 있을 것 같은 작은 투자자들을 살펴보자
'''

# Load the precomputed investor centrality ranking.
# The cells below treat it as a sequence of (investor uuid, value) pairs and
# take the first 25 entries as the top investors -- presumably sorted by
# descending degree centrality; confirm against the script that wrote the file.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# Binary mode is required for pickle on Python 3 and harmless on Python 2.
with open(util_folder + 'investor_centrality_degree.pickle', 'rb') as f:
    investors_high_central = pickle.load(f)

In [None]:
# Well-known investors, method 1 (network centrality):
# take the first 25 entries of the centrality ranking and print name and value.
top25 = [uuid for uuid, value in investors_high_central[:25]]
top25_central = []
for uuid, value in investors_high_central[:25]:
    name = df_investors[df_investors['uuid'] == uuid]['investor_name'].values[0]
    # print() with a single formatted string behaves the same on Python 2 and 3.
    print('{} {}'.format(name, value))
    top25_central.append(uuid)

In [None]:
# Well-known investors, method 2 (funding frequency):
# rank investors by the number of investment rows they appear in.
groups = (df_investments.groupby('investor_uuid')['funding_round_uuid']
          .count()
          .reset_index()
          .sort_values(by='funding_round_uuid', ascending=False))
top25_funding = []
# head(25) keeps exactly 25 investors; the earlier counter-based loop checked
# its limit after appending and therefore collected 26.
most_active = groups.head(25)
for uuid, fund_count in zip(most_active['investor_uuid'], most_active['funding_round_uuid']):
    name = df_investors[df_investors['uuid'] == uuid]['investor_name'].values[0]
    # print() with a single formatted string behaves the same on Python 2 and 3.
    print('{} {}'.format(name, fund_count))
    top25_funding.append(uuid)

In [None]:
# Find the investors that most often co-invest with the well-known investors.
# MIN is the minimum co-investment count (see funding_count below) an investor
# needs in order to be considered by the ranking that uses these counts.
MIN = 3

# All organizations in the data set (no founding-date filter here).
companies = df_organizations['uuid']
print('There are {} companies'.format(len(companies)))

# Well-known investors: union of both top-25 lists. Built from fresh sets so
# that top25_central is not extended in place when this cell is (re-)run.
top_investors = list(set(top25_central) | set(top25_funding))
top_investor_set = set(top_investors)  # O(1) membership tests in the loop

# co_funding_count[top][other]: times `other` shared a funding round with `top`.
# funding_count[other]: total (round, top investor) co-occurrences of `other`.
co_funding_count = {}
funding_count = {}
for investor in top_investors:
    co_funding_count[investor] = {}

groups = df_investments.groupby('funding_round_uuid')['investor_uuid'].apply(list)
# Investors in the same funding round are linked. Each (top, other) pair of a
# round is counted exactly once; visiting both orderings of the pair would
# double every count and silently halve the effective MIN threshold.
for investors in groups.values:
    tops_in_round = [inv for inv in investors if inv in top_investor_set]
    if not tops_in_round:
        continue
    others_in_round = [inv for inv in investors if inv not in top_investor_set]
    for top in tops_in_round:
        partners = co_funding_count[top]
        for other in others_in_round:
            partners[other] = partners.get(other, 0) + 1
            funding_count[other] = funding_count.get(other, 0) + 1

In [None]:
# For every well-known investor, rank its co-investors by the share of their
# co-investments (with any well-known investor) that involved this one.
co_funding_prop = {}
print('TOP VC List')
for investor1 in top_investors:
    print('{}'.format(df_investors[df_investors['uuid'] == (investor1)]['investor_name'].values[0]))

print('\n\n')
for investor1 in top_investors:
    co_funding_prop[investor1] = {}
    # Only partners of this investor can receive a score, so iterate over them
    # directly instead of scanning every entry of funding_count.
    for investor2, co_count in co_funding_count[investor1].items():
        count = funding_count[investor2]
        if count >= MIN:
            co_funding_prop[investor1][investor2] = co_count / float(count)
    # Highest proportion first.
    score = sorted(co_funding_prop[investor1].items(), key=operator.itemgetter(1), reverse=True)
    print('TOP VC {}'.format(df_investors[df_investors['uuid'] == (investor1)]['investor_name'].values[0]))
    for key, value in score[:10]:
        name = df_investors[df_investors['uuid'] == (key)]['investor_name'].values[0]
        print('{}% {}: {}'.format(value * 100, funding_count[key], name))
    print('\n\n')