In [None]:
import datetime
import os

import matplotlib.pyplot as plt
import nbformat
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
from dateutil.parser import parse

In [None]:
#mass index modifier
def set_indexes(dfs, index_list):
    """Set the index of several DataFrames in a single call.

    Frames and index columns are paired positionally. Every frame is
    modified in place, and the column promoted to the index is dropped
    from the frame's columns. Pairing stops at the shorter of the two
    inputs.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames to re-index.
    index_list : iterable
        Column label to use as index for the frame at the same position.

    Returns
    -------
    None
    """
    for frame, index_column in zip(dfs, index_list):
        frame.set_index(keys=index_column, drop=True, inplace=True)

def country_creator(sector_df, country_list):
    """Collapse free-text headquarters locations into country labels, in place.

    The 'Headquarters Location' column is renamed to 'Country'. Then, for each
    name in ``country_list``, every row whose 'Country' text contains that
    name as a substring is overwritten with the bare name. Rows that match no
    name keep their original text; missing locations stay missing.

    Parameters
    ----------
    sector_df : pandas.DataFrame
        Frame with a 'Headquarters Location' (or already renamed 'Country')
        column of strings. Modified in place.
    country_list : iterable of str
        Country labels to look for. Names are processed in order, so a row
        rewritten by an earlier name can only match a later one if the
        earlier name itself contains it.

    Returns
    -------
    None
    """
    #rename Headquarters Location to country
    sector_df.rename(columns={'Headquarters Location': 'Country'}, inplace=True)
    for country_name in country_list:
        # regex=False: labels such as 'USA+Israel' contain regex metacharacters
        # and must be matched literally.
        # na=False: a missing location must produce False, otherwise the mask
        # contains NaN and boolean indexing raises.
        matches = sector_df['Country'].str.contains(country_name, regex=False, na=False)
        sector_df.loc[matches, 'Country'] = country_name

def sorter(df, column_names, categories):
    """Sort a frame by columns using a custom, per-column value order.

    Each listed column is converted (on ``df`` itself) to a categorical whose
    categories are the matching entry of ``categories``; the frame is then
    sorted by those columns, which orders rows by category position rather
    than alphabetically. Values that are not among the supplied categories
    become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sort. Its listed columns are replaced by categorical columns.
    column_names : list of str
        Columns to sort by, most significant first.
    categories : list of list
        Desired value order for the column at the same position.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``df`` (original index labels are kept).
    """
    for column_name, category in zip(column_names, categories):
        # Assign the converted column back instead of calling
        # cat.set_categories(..., inplace=True): the inplace argument was
        # removed in pandas 2.0 and raises TypeError there.
        df[column_name] = df[column_name].astype(pd.CategoricalDtype(categories=category))
    return df.sort_values(column_names)

In [None]:
#run-mode switches
#PIPELINE: analyse the pipeline file; when set it takes precedence over KEYWORD
#KEYWORD: analyse the Crunchbase keyword-search export
PIPELINE = True
KEYWORD = False

#JOKER_STRING is the prefix used for input and output file names
if PIPELINE:
    KEYWORD = False
    JOKER_STRING = 'pipeline_'
elif KEYWORD:
    JOKER_STRING = 'keyword_'
else:
    JOKER_STRING = ''

#apply folder tree
RAW_DATA_FOLDER = 'data/raw/'
OUTPUT_DATA_FOLDER = 'data/output/'
REPORT_FOLDER = 'reports/'

#sectors and countries that we are interested in
sector_list = ['Energy', 'Industry 4.0', 'Mobility', 'Supply Chain']
country_list = ['USA+Israel', 'Germany', 'Turkey', 'United Kingdom']

#the pipeline file also gets a catch-all bucket: last sector, first country
if PIPELINE:
    sector_list = sector_list + ['Other']
    country_list = ['Other'] + country_list

#sector_list.append('Total')
#country_list.append('Total')

#import data
if not PIPELINE:
    #Crunchbase exports: one CSV per sector, file names prefixed with JOKER_STRING
    energy = pd.read_csv(os.path.join(RAW_DATA_FOLDER, '{}energy.csv'.format(JOKER_STRING)))
    industry = pd.read_csv(os.path.join(RAW_DATA_FOLDER, '{}industry.csv'.format(JOKER_STRING)))
    sc_logistics = pd.read_csv(os.path.join(RAW_DATA_FOLDER, '{}sc_logistics.csv'.format(JOKER_STRING)))
    mobility = pd.read_csv(os.path.join(RAW_DATA_FOLDER, '{}mobility.csv'.format(JOKER_STRING)))
else:
    #pipeline file: a single CSV that is split by its 'Sector' column
    pipeline = pd.read_csv(os.path.join(RAW_DATA_FOLDER, 'pipeline.csv'))
    #USA and Israel share one bucket; every country outside country_list becomes 'Other'
    pipeline.loc[pipeline['Country'].isin(['USA', 'Israel']), 'Country'] = 'USA+Israel'
    pipeline.loc[~pipeline['Country'].isin(country_list), 'Country'] = 'Other'
    #.copy() makes each sector frame independent of `pipeline`, so adding
    #columns to it later does not trigger SettingWithCopyWarning
    energy = pipeline[pipeline['Sector'] == 'Energy Storage & Management'].copy()
    industry = pipeline[pipeline['Sector'] == 'Industry 4.0'].copy()
    #NOTE(review): `total` repeats the 'Industry 4.0' filter and is not used in
    #the visible cells - looks like a copy-paste leftover; confirm the intent
    total = pipeline[pipeline['Sector'] == 'Industry 4.0'].copy()
    sc_logistics = pipeline[pipeline['Sector'] == 'Supply Chain & Logistics'].copy()
    mobility = pipeline[pipeline['Sector'] == 'Mobility & Automotive'].copy()
    other = pipeline[pipeline['Sector'] == 'Other'].copy()

#set_indexes([energy, industry, sc_logistics, mobility], ["id", "id", "id", "id"])

In [None]:
#create a new 'Sector*' column holding our own sector definitions
#.assign returns a new frame, so this also works when the sector frames are
#slices of another DataFrame (plain column assignment on a slice raises
#SettingWithCopyWarning)
energy = energy.assign(**{'Sector*': sector_list[0]})
industry = industry.assign(**{'Sector*': sector_list[1]})
mobility = mobility.assign(**{'Sector*': sector_list[2]})
sc_logistics = sc_logistics.assign(**{'Sector*': sector_list[3]})

if PIPELINE:
    other = other.assign(**{'Sector*': 'Other'})

In [None]:
#Crunchbase exports only: turn each frame's 'Headquarters Location' into a
#'Country' column restricted to the labels in country_list where they match
if not PIPELINE:
    sector_dfs = [energy, industry, sc_logistics, mobility]
    for sector_df in sector_dfs:
        country_creator(sector_df=sector_df, country_list=country_list)

In [None]:
#stack the sector frames into one table and keep the first row per company;
#the company-name column differs between the pipeline file and Crunchbase
if PIPELINE:
    all_tables = pd.concat(
        [energy, industry, sc_logistics, mobility, other]
    ).drop_duplicates(subset='Name', keep="first")
else:
    all_tables = pd.concat(
        [energy, industry, sc_logistics, mobility]
    ).drop_duplicates(subset='Organization Name', keep="first")

#output the de-duplicated company list
all_tables.to_csv(
    os.path.join(OUTPUT_DATA_FOLDER, '{}company_list.csv'.format(JOKER_STRING)),
    index=False,
)

In [None]:
#quick look: number of companies per observed (country, sector) pair
new_array = all_tables.groupby(by=['Country', 'Sector*']).size().to_numpy()
new_array

In [None]:
#count companies per (country, sector) pair on the full country x sector grid
#groupby().size() only returns pairs that occur in the data; reindexing onto
#every combination (missing ones filled with 0) guarantees exactly
#len(country_list) * len(sector_list) rows, which the reshape step relies on
#NOTE: this overwrites all_tables with the aggregated table, so the cell must
#not be re-run without rebuilding all_tables first
full_grid = pd.MultiIndex.from_product(
    [country_list, sector_list], names=['Country', 'Sector*']
)
all_tables = (
    all_tables.groupby(['Country', 'Sector*'])
    .size()
    .reindex(full_grid, fill_value=0)
    .to_frame('Size')
    .reset_index()
)
#order rows by country_list, then sector_list (not alphabetically)
all_tables = sorter(all_tables, ['Country', 'Sector*'], [country_list, sector_list])
new_array = np.array(all_tables[['Size']])

In [None]:
#reshape the flat counts into a grid: one row per country, one column per sector
#the shape is derived from the lists that also label the heatmap axes, so it
#always matches them: (5, 5) for the pipeline file, (4, 4) otherwise
reshaped_array = np.reshape(new_array, (len(country_list), len(sector_list)))

In [None]:
#heatmap inputs: counts as cell values and as cell annotations,
#sectors on the x axis, countries on the y axis
z = reshaped_array
z_text = reshaped_array
x = sector_list
y = country_list

#the title depends on which data source is being analysed
if PIPELINE:
    title = 'Pipeline File Country/Sector Distribution'
elif KEYWORD:
    title = 'Crunchbase Country/Sector Distribution (Keyword)'
else:
    title = 'Crunchbase Country/Sector Distribution (Industry Filter)'

#create the heatmap
fig = ff.create_annotated_heatmap(
    z,
    x=x,
    y=y,
    annotation_text=z_text,
    colorscale='reds',
    showscale=True,
)

#place the title string and label the axes
fig.update_layout(
    title_text=title,
    title_x=0.5,
    title_y=0.94,
    xaxis_title='Sector',
    xaxis_title_standoff=0.6,
    yaxis_title='Country',
    yaxis_categoryarray=country_list,
)

#html output
fig.write_html(os.path.join(REPORT_FOLDER, '{}origination_heatmap.html'.format(JOKER_STRING)))
fig.show()