In [1]:
# 주제/키워드가 주어졌을때 다음을 시각화 한다
# 1. 구글 트렌드
# 2. 투자
# 3. 인수

In [2]:
import pandas as pd
import re
import gensim
import pickle
import numpy as np
from datetime import datetime
from matplotlib import pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [3]:
# load data
data_folder = '../data/csv_export/'
util_folder = '../util/'
trend_folder = '../data/trends/'
df_organizations = pd.read_csv(data_folder + 'organizations.csv'.format(data_folder), dtype={'first_funding_on': str, 'last_funding_on':str})
df_description = pd.read_csv(data_folder + 'organization_descriptions.csv')
df_funding_rounds = pd.read_csv(data_folder + 'funding_rounds.csv')
df_funds = pd.read_csv(data_folder + 'funds.csv')
df_investments = pd.read_csv(data_folder + 'investments.csv')
df_acq = pd.read_csv(data_folder + 'acquisitions.csv')

In [6]:
'''
주제가 주어졌을때 다음을 시각화 한다.
1. 구글 트렌드
2. 투자
3. 인수

keyword: google trend keyword (사전에 keyword.csv 이름의 파일을 trends.google.com 에서 다운 받아야 함)
words: 회사 설명에서 다음과 같은 단어가 있으면 포함시킨다.
expand: 기본 단어외의 관련 단어를 추가할 것인가?
plot_all: 3가지가 모두 나온 하나의 그래프 외에, 개별 그래프 3가지를 따로 표시할 것인가?
start_date: 시작 시점
'''
color1 = 'black'
color2 = 'grey'
color3 = '#80bfff'
color4 = 'red'


def visualize(keyword, words, expand = False, plot_all = False, start_date = '2004-01', cumulative = False):
    fig = plt.figure()
    condition = 'first_funding_on'

    # 1. google web search trend data
    df_google_trend = pd.read_csv(trend_folder + keyword + '.csv', header = 1)  
    column1, column2 = df_google_trend.columns

    df_google_trend['google_count'] = df_google_trend[column2].groupby(df_google_trend[column1].str[:7]).transform('mean')
    df_google_trend['year-month'] = df_google_trend[column1].str[:7]
    df_google_trend = df_google_trend.loc[:,['year-month', 'google_count']]
    df_google_trend = df_google_trend.drop_duplicates()
    df_google_trend.columns = ['year-month', 'google_count']
    if plot_all:
        df_google_trend.plot(kind = 'line', x = df_google_trend['year-month'], figsize = (30, 8))

    # 2. google news trend data
    df_google_news_trend = pd.read_csv(trend_folder + keyword + '_news.csv', header = 1)  
    column1, column2 = df_google_news_trend.columns
    df_google_news_trend['google_news_count'] = df_google_news_trend[column2].groupby(df_google_news_trend[column1].str[:7]).transform('mean')

    df_google_news_trend['year-month'] = df_google_news_trend[column1].str[:7]
    df_google_news_trend = df_google_news_trend.loc[:,['year-month', 'google_news_count']]
    df_google_news_trend = df_google_news_trend.drop_duplicates()
    df_google_news_trend.columns = ['year-month', 'google_news_count']
    if plot_all:
        df_google_news_trend.plot(kind = 'line', x = df_google_news_trend['year-month'], figsize = (30, 8))

    # load required dataset
    if 'word2company' not in globals():
        print('loading word2company')
        global word2company
        word2company = pickle.load(open(util_folder + 'word2company.pickle', 'rb'))
    if 'model' not in globals():
        print('loading word2vec model')
        global model
        model = gensim.models.word2vec.Word2Vec.load(util_folder + 'word2vec')

    # expand word set if necessary
    # train 된 word2vec 을 사용하여 관련도가 높은 단어들을 포함한다
    final_words = []
    if type(words) == str:
        words = [words]
    for word in words:
        if ' ' in word:
            word = word.replace(' ', '_')
        if expand:
            if word not in model: continue
            _words = model.most_similar(word)
            _words = [str(_word) for _word, sim in _words]
            final_words.extend(_words)
        final_words.append(word)
    final_words = list(set(final_words))

    # choose companies that have relevant words
    companies = []
    for word in final_words:
        _companies = word2company.get(word, [])
        companies.extend(_companies)
    companies = list(set(companies))

    if not cumulative: # just to print once
        print('Google Trend: {}\n Companies with keywords:{}'.format(keyword, final_words))

    # 회사를 시기 별로 정리한다
    # 3. funding
#     df_funding = df_funding_rounds[['company_uuid', 'funding_round_uuid', 'announced_on']]
    df = pd.DataFrame(pd.concat([df_funding_rounds[df_funding_rounds['company_uuid'].isin(companies)]['announced_on']]))
    df_companies = df_organizations[df_organizations['uuid'].isin(companies)]
    df = pd.DataFrame()
    df['date'] = df_companies["first_funding_on"]
    df = df[~df['date'].isnull()]
    df['year-month'] = df['date'].str[:7]
    
    funding = df.groupby(df['year-month']).size().reset_index()
    funding.columns = ['year-month', 'funding_count']
    funding = funding[funding['year-month'] >= start_date]
    if plot_all:
        funding.plot(kind = 'bar', x = funding['year-month'], figsize = (30, 8))


    df_merged = pd.merge(df_google_trend, funding, on = 'year-month', how = 'outer')
    df_merged = df_merged.merge(df_google_news_trend, on = 'year-month', how = 'outer')

    # 4. acquisition
    df = df_acq[df_acq['acquiree_uuid'].isin(companies)]
    if not cumulative:
#         print(df[['acquiree_name', 'acquirer_name', 'acquired_on']].sort_values('acquired_on'))
        to_print = df.sort_values('acquired_on')[['acquired_on', 'acquiree_name']]
        to_print['acquiree_name'] = to_print['acquiree_name'].apply(lambda x: x[:15])
        to_print = to_print.to_string(index = False)
        print(to_print)
#         print df.to_string(index=False)

    df = df.loc[:,['acquired_on']]
    df['year-month'] = df['acquired_on'].str[:7]
    acq = df.groupby(df['year-month']).size().reset_index()
    acq.columns = ['year-month', 'acq_count']
    acq = acq[acq['year-month'] >= start_date]

    if plot_all:
        acq.plot(kind = 'bar', x = acq['year-month'], figsize = (30, 8))

    df_merged = pd.merge(df_merged, acq, on = 'year-month', how = 'outer')
    df_merged = df_merged.sort_values(['year-month'])    
    df_merged['year'] = np.where(df_merged['year-month'].str[5:7] == '01', df_merged['year-month'].str[:4], '')

    if cumulative:
        df_merged['funding_count'] = df_merged['funding_count'].fillna(0)
        df_merged['funding_count'] = df_merged['funding_count'].cumsum()

        div_by = 100.0 / (df_merged['funding_count'].max() + 1)
        df_merged['google_count'] /= div_by
        df_merged['google_news_count'] /= div_by

        df_merged['acq_count'] = df_merged['acq_count'].fillna(0)
        df_merged['acq_count'] = df_merged['acq_count'].cumsum()
        df_merged['acq_count'] /= ((df_merged['acq_count'].max()) / (df_merged['funding_count'].max() + 1))

        df_merged[['year-month', 'google_count', 'google_news_count', 'funding_count', 'acq_count']].plot(x = df_merged['year'], linestyle = '-', color = [color1, color2, color3, color4], fontsize = 25, figsize = (30, 8)).legend(loc = 2, fontsize = 20, labels = ['google web search trend', 'google news trend', '# funding',  '# M&A',])


    else:        

        div_by = 100.0 / (df_merged['funding_count'].max() + 1)
        df_merged['google_count'] /= div_by
        df_merged['google_news_count'] /= div_by


        ax = df_merged[['year-month', 'google_count', 'google_news_count']].plot(x = df_merged['year'], linestyle = '-', color = [color1, color2])
        df_merged[['year-month', 'funding_count', 'acq_count']].plot(x = df_merged['year'], kind = 'bar', rot = 0, ax = ax, fontsize = 25, figsize = (30, 8), color = [color3, color4]).legend(loc = 2, fontsize = 20, labels = ['google web search trend', 'google news trend', '# funding', '# M&A'])

def visualize_all(keyword, words, expand = False, plot_all = False, start_date = '2004-01'):
    visualize(keyword, words, expand = expand, plot_all = plot_all, start_date = start_date, cumulative = False)
    visualize(keyword, words, expand = expand, plot_all = plot_all, start_date = start_date, cumulative = True)

In [7]:
visualize_all('augmented_reality', ['augmented_reality'], expand = False, plot_all = False)

loading word2company
loading word2vec model
Google Trend: augmented_reality
 Companies with keywords:['augmented_reality']


KeyError: 'year-month'

<matplotlib.figure.Figure at 0x7fe38cff74a8>

In [None]:
visualize_all('wearable', ['wearable', 'wearables', 'smartwatch', 'smart_glasses', 'wearable_device', 'wrist_worn', 'wearable_sensor', 'wearable_devices', 'wearable', 'wearable_tech', 'wearable_technology'], plot_all = False)

In [None]:
visualize_all('iot', 'iot', expand = False, plot_all = False)

In [None]:
visualize_all('bitcoin', 'bitcoin', expand = True, plot_all = False)

In [None]:
visualize_all('blockchain', 'blockchain', expand = False, plot_all = False)

In [None]:
visualize_all('deep_learning', 'deep learning', expand = False, plot_all = False)

In [None]:
visualize_all('drone', 'drone', expand = True, plot_all = False)

In [None]:
visualize_all('social_media', 'social_media', expand = False, plot_all = False)

In [None]:
visualize_all('self_driving_car', ['self driving', 'autonomous driving', 'driverless', 'autonomous vehicle'], expand = False, plot_all = False)

In [None]:
visualize_all('neuroscience', ['neuroscience'], expand = False, plot_all = False)

In [None]:
visualize_all('mobile_health_care', ['mobile health', 'mobile healthcare'], expand = True, plot_all = False)

In [None]:
visualize_all('social_robot', 'social robot', expand = True, plot_all = False)

In [None]:
visualize_all('lidar', 'lidar', expand = False, plot_all = False)

In [None]:
# visualize_all('chatbot', ['chatbot', 'conversational', 'chat bot', 'ibm watson', 'siri'], expand = False, plot_all = False)
visualize_all('chatbot', ['chatbot', 'chat bot', 'ibm watson', 'siri'], expand = False, plot_all = False)

In [None]:
visualize_all('fintech', 'fintech', expand = False, plot_all = False)

In [None]:
visualize_all('3d_printing', '3d printing', expand = True, plot_all = False)

In [None]:
visualize_all('quantum_computing', 'quantum_computing', expand = False, plot_all = False)