In [217]:
# competitor analysis

In [218]:
import re, gensim, pickle, operator
import pandas as pd
import numpy as np
import networkx as nx
import seaborn as sns
from datetime import datetime
from matplotlib import pyplot as plt
%matplotlib inline

In [219]:
# Load the raw CSV exports into DataFrames.
# All locations are relative to the notebook's working directory.
data_folder = '../data/csv_export/'
util_folder = '../util/'
trend_folder = '../data/trends/'
# Keep the first/last funding dates as strings (dtype=str) instead of letting
# pandas infer a type for them.
df_organizations = pd.read_csv(
    data_folder + 'organizations.csv',
    dtype={'first_funding_on': str, 'last_funding_on': str},
)
df_description = pd.read_csv(data_folder + 'organization_descriptions.csv')
df_funding_rounds = pd.read_csv(data_folder + 'funding_rounds.csv')
df_funds = pd.read_csv(data_folder + 'funds.csv')
df_investments = pd.read_csv(data_folder + 'investments.csv')
df_acq = pd.read_csv(data_folder + 'acquisitions.csv')
df_competition = pd.read_csv(data_folder + 'competitors.csv')
df_category = pd.read_csv(data_folder + 'category_groups.csv')
df_investors = pd.read_csv(data_folder + 'investors.csv')

In [220]:
# Print the funding of the companies that belong to each category.
# Company -> category mapping.
# `category_list` holds '|'-separated category names: split it into one column per
# piece, then stack the pieces so every (company, category) pair is its own row.
df2 = df_organizations['category_list'].str.split('|').apply(pd.Series).stack()
# stack() added an inner index level numbering the pieces; drop it so the index is
# the organization's row label again and the categories can be joined back on it.
df2.index = df2.index.droplevel(-1)
df2.name = 'category_list'
# Build the result from a uuid-only selection instead of selecting `category_list`
# as well and deleting it from the selection afterwards.
df_category = df_organizations[['uuid']].join(df2)
df_category.columns = ['uuid', 'category']
df_category.head()

Unnamed: 0,uuid,category
0,1e4f199c-363b-451b-a164-f94571075ee5,hardware
0,1e4f199c-363b-451b-a164-f94571075ee5,manufacturing
0,1e4f199c-363b-451b-a164-f94571075ee5,product design
0,1e4f199c-363b-451b-a164-f94571075ee5,semiconductor
1,6681b1b0-0cea-6a4a-820d-60b15793fa66,hardware


In [221]:
# Company -> funding_round_uuid mapping
# Category -> investor -> point-in-time mapping?

In [222]:
# One row per (funding round, category of the funded company).
# Rounds are matched to categories through the company uuid; companies without a
# row in df_category drop out because the merge is an inner join.
df_funding = (
    df_funding_rounds
    .loc[:, ['company_uuid', 'funding_round_uuid', 'announced_on']]
    .merge(df_category, how='inner', left_on='company_uuid', right_on='uuid')
    .drop('uuid', axis=1)
)
# `announced_on` is exposed as `date`; `year-month` is its first seven characters.
df_funding.columns = ['company_uuid', 'funding_round_uuid', 'date', 'category']
df_funding['year-month'] = df_funding['date'].str.slice(0, 7)
df_funding.head()

Unnamed: 0,company_uuid,funding_round_uuid,date,category,year-month
0,4f503d57-bf03-4010-6fcc-036fab95b39c,d17ab961-d739-410e-e904-4cb78057cbe1,2017-07-04,financial services,2017-07
1,4f503d57-bf03-4010-6fcc-036fab95b39c,d17ab961-d739-410e-e904-4cb78057cbe1,2017-07-04,fintech,2017-07
2,4f503d57-bf03-4010-6fcc-036fab95b39c,2dd15d97-767c-4eb4-6053-c7a7ce203801,2016-03-15,financial services,2016-03
3,4f503d57-bf03-4010-6fcc-036fab95b39c,2dd15d97-767c-4eb4-6053-c7a7ce203801,2016-03-15,fintech,2016-03
4,bb971d77-d7b8-a3c3-6925-c418077e5d3c,828b3a89-451e-cd0a-6d57-e31dd6a46124,2017-07-04,financial services,2017-07


In [223]:
# For every funding round, gather the uuids of its investors into one list.
# Result columns: funding_round_uuid, investor_uuid (a Python list per row).
df_investment = (
    df_investments
    .groupby('funding_round_uuid')['investor_uuid']
    .apply(list)
    .reset_index()
)
df_investment.head()

Unnamed: 0,funding_round_uuid,investor_uuid
0,0001cbd1-f7e2-4a56-607c-c57f46cc7dcb,"[3213cd7d-06e0-3ecb-3359-29aa8deddfa6, 76c506a..."
1,00027faf-5c46-d1a5-0aa6-7649b3218166,[e7e5a267-ce84-b580-ccbf-40a17099d2aa]
2,0003c42b-498e-cd53-aec6-53461d667c79,[39041e62-6b24-ae8d-1347-4cea947e832c]
3,00043331-9b35-507f-990e-72e376cdf7b8,"[07311028-1859-2323-1d9b-0c8601cfa19b, 1ca26a7..."
4,00043864-0037-f6fc-65e9-14162c799655,[387bd749-10f5-65c8-a709-8d22766e9066]


In [224]:
# Attach each funding round's investor list to its (round, category) rows.
# merge() defaults to an inner join, so rounds without an entry in df_investment
# are dropped.
# An `investor_uuid` column left over from an earlier run of this cell is removed
# first; otherwise a re-run would produce `investor_uuid_x` / `investor_uuid_y`
# instead of the single `investor_uuid` column that later cells read.
df_funding = (
    df_funding
    .drop('investor_uuid', axis=1, errors='ignore')
    .merge(df_investment, on='funding_round_uuid')
)
df_funding.head()

Unnamed: 0,company_uuid,funding_round_uuid,date,category,year-month,investor_uuid
0,4f503d57-bf03-4010-6fcc-036fab95b39c,d17ab961-d739-410e-e904-4cb78057cbe1,2017-07-04,financial services,2017-07,"[1d91db1b-8aa3-3a98-aca1-b375b327235b, 307c51d..."
1,4f503d57-bf03-4010-6fcc-036fab95b39c,d17ab961-d739-410e-e904-4cb78057cbe1,2017-07-04,fintech,2017-07,"[1d91db1b-8aa3-3a98-aca1-b375b327235b, 307c51d..."
2,4f503d57-bf03-4010-6fcc-036fab95b39c,2dd15d97-767c-4eb4-6053-c7a7ce203801,2016-03-15,financial services,2016-03,"[307c51d2-b1f2-315a-9dcc-87643466bb87, 8ea4648..."
3,4f503d57-bf03-4010-6fcc-036fab95b39c,2dd15d97-767c-4eb4-6053-c7a7ce203801,2016-03-15,fintech,2016-03,"[307c51d2-b1f2-315a-9dcc-87643466bb87, 8ea4648..."
4,bb971d77-d7b8-a3c3-6925-c418077e5d3c,828b3a89-451e-cd0a-6d57-e31dd6a46124,2017-07-04,financial services,2017-07,"[4ede174d-3254-8602-e977-d9c0bfe34433, 94c2fe4..."


In [216]:
# Parameters of the analysis.
categories = ['fintech']  # categories whose funding rounds are analysed (must be list-like)
k = 10                    # percentage of the earliest funding rounds treated as "early"
color1 = 'blue'           # line colour of the cumulative plot
end_year = 2018           # exclusive upper bound of the years that get a January row

# Funding rounds in the selected categories, one row per round: df_funding has one
# row per (round, category), so a round matching several categories is de-duplicated.
df = df_funding[df_funding['category'].isin(categories)].drop_duplicates('funding_round_uuid')
if df.empty:
    raise ValueError('no funding rounds found for categories: {}'.format(categories))

# Number of funding rounds per 'YYYY-MM' month.
start_year = min(df['year-month'].values)[:4]
count = df.groupby('year-month').size().rename('count').reset_index()

# Make sure every January from the first funding year up to (excluding) end_year
# has a row, adding zero-count rows for the Januaries without any funding round.
present_months = set(count['year-month'])
missing_januaries = [
    (january, 0)
    for january in ('{}-01'.format(year) for year in range(int(start_year), end_year))
    if january not in present_months
]
if missing_januaries:
    count = pd.concat(
        [count, pd.DataFrame(missing_januaries, columns=['year-month', 'count'])],
        ignore_index=True
    )

# `year` labels the x axis; the cumulative sum must be taken in chronological order.
count['year'] = count['year-month'].str[:4]
count = count.sort_values('year-month')
count['cum_count'] = count['count'].cumsum()
print('categories: {}'.format(categories))

# Early rounds: those announced strictly before the date found k percent of the way
# through the sorted announcement dates. Because the comparison is strict, rounds
# sharing the cutoff date are excluded, so fewer than n_early rows may be selected.
dates = sorted(df['date'].values)
n_early = int(float(k) / 100 * len(dates))
print("Early {} percent investors ({} early fundings out of {} total)".format(k, n_early, len(dates)))
if n_early < len(dates):
    fast_date = dates[n_early]
    df_fast = df[df['date'] < fast_date]
else:
    # k >= 100: there is no cutoff date to index, every round counts as early.
    fast_date = None
    df_fast = df

# Count, per investor, the early rounds it took part in. `investor_uuid` holds one
# list of investor uuids per round, so flatten the lists before counting.
early_investors = pd.Series(
    [investor for investors in df_fast['investor_uuid'] for investor in investors],
    name='investor',
    dtype=object
)
score = (
    early_investors.to_frame()
    .groupby('investor').size()
    .rename('count')
    .reset_index()
    .sort_values('count', ascending=False)
)
# Replace the investor uuid by its name; investors missing from df_investors are
# dropped by the inner merge.
score = score.merge(df_investors[['investor_name', 'uuid']], how='inner', left_on='investor', right_on='uuid')
score = score.drop(['investor', 'uuid'], axis=1)
print(score.head(30))
print(score.shape)

# Cumulative number of funding rounds over time, labelled by year on the x axis.
(
    count
    .plot(x='year', y='cum_count', linestyle='-', color=[color1], fontsize=25, figsize=(30, 8))
    .legend(loc=2, fontsize=20, labels=['funding count'])
)

TypeError: only list-like objects are allowed to be passed to isin(), you passed a [NoneType]

In [5]:
# Undirected competition graph: one node per company uuid and one edge per
# (entity_uuid, competitor_uuid) row of df_competition.
g = nx.Graph()
g.add_edges_from(zip(df_competition['entity_uuid'], df_competition['competitor_uuid']))

# Connected components of the graph, largest first, and their sizes.
components = sorted(nx.connected_components(g), key=len, reverse=True)
size = [len(component) for component in components]

# Companies ordered by their number of neighbours (direct competitors), most
# connected first. len(g[node]) is the size of the node's adjacency mapping.
nodes = sorted(g.nodes(), key=lambda node: len(g[node]), reverse=True)
# Neighbour counts of the (up to) ten most connected companies.
for node in nodes[:10]:
    print(len(g[node]))

In [8]:
# word -> document frequency; populated once the mappings below are loaded.
doc_freq = {}


def _load_pickle(path):
    """Unpickle and return the object stored in the file at `path`.

    The file is opened in binary mode and closed as soon as it has been read.

    NOTE(review): pickle.load can execute arbitrary code while loading, so this
    must only be pointed at pickle files from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# word2company: word -> collection of companies (its length is used as the
# word's document frequency).
print('loading word2company')
word2company = _load_pickle(util_folder + 'word2company.pickle')
# company2word: company -> iterable of words.
print('loading company2word')
company2word = _load_pickle(util_folder + 'company2word.pickle')

loading word2company
loading company2word


In [9]:
# Document frequency of a word = number of companies listed for it in word2company.
doc_freq.update(
    (term, len(company_ids)) for term, company_ids in word2company.items()
)

In [14]:
# Print the 50 highest-scoring words of the first component in `components`
# (the largest one, since `components` is sorted by decreasing size).
for component in components:
    print('size: %d' % len(component))

    # How often each word occurs across the word lists of the component's companies;
    # companies without an entry in company2word contribute nothing.
    component_freq = {}
    for company in component:
        for word in company2word.get(company, []):
            component_freq[word] = component_freq.get(word, 0) + 1

    # score = occurrences inside the component / overall document frequency.
    # Words whose document frequency is not above sqrt(component size) are skipped.
    min_doc_freq = np.sqrt(len(component))
    score = {}
    for word, in_component in component_freq.items():
        if doc_freq.get(word, 0) > min_doc_freq:
            score[word] = in_component / float(doc_freq[word])

    ranked = sorted(score.items(), key=operator.itemgetter(1), reverse=True)
    for word, word_score in ranked[:50]:
        print('%s %s' % (word, word_score))
    print('')
    # Only the first component is analysed.
    break

size: 34038
myspace 0.429906542056
file_sharing 0.357142857143
information_visit 0.355745721271
widgets 0.352534562212
widget 0.351063829787
ad_networks 0.348039215686
lets_users 0.344173441734
powers 0.33625
foursquare 0.335135135135
drag 0.332506203474
unique_visitors 0.332317073171
techcrunch 0.329113924051
rss 0.326732673267
display_advertising 0.325358851675
twitter_facebook 0.31746031746
daily_deal 0.317307692308
programmatic 0.316901408451
playlists 0.316017316017
aol 0.315573770492
coca_cola 0.314136125654
user_engagement 0.313725490196
wiki 0.312195121951
page_views 0.311926605505
inbox 0.309090909091
white_label 0.306946688207
user_generated 0.30602006689
registered_users 0.304597701149
yelp 0.30243902439
paypal 0.301075268817
salesforce_com 0.300751879699
photo_sharing 0.300380228137
tagging 0.298113207547
vmware 0.297709923664
enterprise_grade 0.297397769517
homepage 0.297297297297
visit_www 0.296116504854
embed 0.295977011494
ebay 0.29203539823
apis 0.288888888889
twitter 