In [60]:
import datetime
import os

import matplotlib.pyplot as plt
import nbformat
import numpy as np
import pandas as pd
#from pandas_profiling import ProfileReport
import plotly.figure_factory as ff
import plotly.graph_objects as go
from dateutil.parser import parse

In [61]:
#mass index modifier
def set_indexes(dfs, index_list):
    """Set the index of several DataFrames in place.

    The i-th entry of ``index_list`` becomes the index of the i-th frame in
    ``dfs``; pairing stops at the shorter of the two sequences. The column
    used as index is removed from the frame's columns. Returns nothing.
    """
    pairs = zip(dfs, index_list)
    for frame, index_column in pairs:
        frame.set_index(keys=index_column, drop=True, inplace=True)

def keyword_calculator(df, sector_name, exclude, include):
    """Print and return the de-duplicated keyword list of one sector.

    The keywords are the 'Sub-sector' values of the rows whose 'Sector' equals
    ``sector_name``, followed by the entries of ``include``. Every value that
    appears in ``exclude`` is then removed (this also removes an included
    value that is listed in ``exclude``), and duplicates are dropped while
    keeping first-seen order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Sector' and 'Sub-sector' columns.
    sector_name : str
        Sector whose sub-sectors are collected.
    exclude : container
        Values to drop from the result.
    include : iterable
        Extra values appended after the sub-sectors taken from ``df``.

    Returns
    -------
    list
        The keyword list that was printed, so later cells can reuse it.
    """
    print('Sector name: {}'.format(sector_name))

    # sub-sectors of the requested sector, selected in a single .loc step
    sector_keywords = df.loc[df['Sector'] == sector_name, 'Sub-sector'].tolist()

    # add included values, then remove excluded values
    candidates = sector_keywords + list(include)
    kept = [keyword for keyword in candidates if keyword not in exclude]

    # dict keys preserve insertion order, so this de-duplicates without reordering
    result_list = list(dict.fromkeys(kept))

    # print the item count and the keywords
    print('Item count: {}'.format(len(result_list)))
    print(result_list)
    # put a separator
    print('----------------------------------\n')

    return result_list

def stack_histogram_creator(col_name_x, col_name_y, types_array, nbins, title, output_name):
    """Build, save and show a stacked histogram with one trace per category.

    For every value in ``types_array`` the rows of the module-level
    ``pipeline`` frame whose ``col_name_x`` equals that value are selected,
    and their ``col_name_y`` values are added as one histogram trace. The
    figure is written as HTML to ``REPORT_FOLDER``/``output_name`` and then
    displayed.

    Parameters
    ----------
    col_name_x : str
        Column used to split the data into traces; also used as y-axis title.
    col_name_y : str
        Column whose values are binned on the x-axis.
    types_array : iterable
        Category values of ``col_name_x`` to plot, one trace each.
    nbins : int
        Passed to ``go.Histogram`` as ``nbinsx``.
    title : str
        Figure title.
    output_name : str
        File name of the HTML report inside ``REPORT_FOLDER``.
    """
    fig = go.Figure()

    for category in types_array:
        category_rows = pipeline[pipeline[col_name_x] == category]
        fig.add_trace(go.Histogram(x=category_rows[col_name_y], name=category, nbinsx=nbins))

    fig.update_layout(
        barmode='stack',
        title_text=title,
        title_x=0.7,
        title_y=0.92,
        xaxis_title='Time',
        yaxis_title=col_name_x,
    )

    # make sure the report folder exists so that a fresh checkout does not fail on write
    os.makedirs(REPORT_FOLDER, exist_ok=True)
    fig.write_html(os.path.join(REPORT_FOLDER, output_name))
    fig.show()

In [62]:
# Folder layout, relative to the notebook's working directory
RAW_DATA_FOLDER = 'data/raw/'
OUTPUT_DATA_FOLDER = 'data/output/'
REPORT_FOLDER = 'reports/'

# Sectors and countries we are interested in
sector_list = [
    'Energy Storage & Management',
    'Industry 4.0',
    'Mobility & Automotive',
    'Supply Chain & Logistics',
    'Other',
]
country_list = [
    'Germany',
    'Turkey',
    'United Kingdom',
    'Other',
]

# Load the raw pipeline export
pipeline = pd.read_csv(
    os.path.join(RAW_DATA_FOLDER, 'pipeline.csv')
)

#set_indexes([energy, industry, sc_logistics, mobility], ["id", "id", "id", "id"])

In [63]:
# Standardize the date input: one column name, one separator
pipeline.rename(columns={'Day Created': 'Date'}, inplace=True)
# Replace '.' and '/' separators in a single pass. The pattern is an explicit
# regex character class: with the regex flag left at its default, how a bare
# '.' is interpreted (literal vs. "any character") depends on the pandas version.
pipeline['Date'] = pipeline['Date'].str.replace(r'[./]', '-', regex=True)
# Countries that are not in country_list are grouped under 'Other'
pipeline.loc[~pipeline['Country'].isin(country_list), 'Country'] = 'Other'
# Get rid of irrelevant rows (rows without a sector)
pipeline.dropna(subset=['Sector'], inplace=True)
# Parse to datetime and keep the calendar day only (time set to midnight).
# Missing dates stay NaT instead of raising during a string round trip.
# NOTE(review): day/month order of ambiguous inputs is left to pandas' default — confirm against the raw file.
pipeline['Date'] = pd.to_datetime(pipeline['Date']).dt.normalize()

In [64]:
# Cut-off date for the analysis window
time_limit = pd.Timestamp(year=2018, month=1, day=1)

In [65]:
# Keep only the rows dated strictly after time_limit
pipeline = pipeline.loc[pipeline['Date'].gt(time_limit)]

In [66]:
# Chronological overview of the filtered pipeline (shown as the cell output)
pipeline.loc[:, ['Name', 'Sector', 'Country', 'Date']].sort_values(by='Date')

Unnamed: 0,Name,Sector,Country,Date
18,Haptx,Other,Other,2018-01-04
47,Valens,Mobility & Automotive,Other,2018-01-08
112,OLEV,Mobility & Automotive,Turkey,2018-01-10
111,Nio,Mobility & Automotive,Turkey,2018-01-10
7,Brill Power,Energy Storage & Management,United Kingdom,2018-01-10
...,...,...,...,...
1034,HamurLabs,Supply Chain & Logistics,Turkey,2020-12-17
1035,Quimera,Energy Storage & Management,United Kingdom,2020-12-21
1036,Unco Global,Supply Chain & Logistics,Germany,2020-12-22
1037,Noil,Mobility & Automotive,Other,2020-12-22


In [67]:
#print('min: {}'.format(min(pipeline['Date'])))
#pipeline_length = pipeline['Date'].count()

In [68]:
# Stacked histogram over time with one trace per country
stack_histogram_creator(
    col_name_x='Country',
    col_name_y='Date',
    types_array=country_list,
    nbins=1000,
    title='Country Distribution over Time',
    output_name='country_per_date.html',
)

In [69]:
# Stacked histogram over time with one trace per sector
stack_histogram_creator(
    col_name_x='Sector',
    col_name_y='Date',
    types_array=sector_list,
    nbins=1000,
    title='Sector Distribution over Time',
    output_name='sector_per_date.html',
)

In [70]:
# Shortened sector / country lists, kept in case they are needed.
# Note: this rebinds sector_list and country_list from the configuration cell.
sector_list = [
    'Energy',
    'Industry 4.0',
    'Mobility',
    'Supply Chain',
]
country_list = [
    'Germany',
    'Turkey',
    'United Kingdom',
]

In [71]:
# Drop the rows without a sub-sector, then lower-case the remaining values
pipeline = pipeline.dropna(subset=['Sub-sector'])
pipeline = pipeline.assign(**{'Sub-sector': pipeline['Sub-sector'].str.lower()})

In [72]:
# Remove every row whose sector is 'Other', then keep only the first row
# seen for each sub-sector value
pipeline.drop(index=pipeline.loc[pipeline['Sector'].eq('Other')].index, inplace=True)
pipeline.drop_duplicates(subset=['Sub-sector'], keep='first', inplace=True)

In [73]:
# Observe the sectors that remain in the data (in order of first appearance)
sector_list = pipeline['Sector'].unique()

# Keyword overrides per sector as [values to exclude, values to include].
# Keyed by sector name so each override stays attached to its sector
# regardless of the order in which the sectors appear in the data.
## No repeating keywords !!!
keyword_overrides = {
    'Energy Storage & Management': [
        [''],
        ['battery', 'grid', 'electric vehicle charging'],
    ],
    'Mobility & Automotive': [
        ['data privacy', 'image recognition', 'electric vehicles'],
        ['mobility', 'electric vehicle', 'charging', 'micromobility', 'bike', 'mobility as a service'],
    ],
    'Supply Chain & Logistics': [
        ['nlp / semantic-tech', 'semantic-tech', 'nlp', 'e-commerce '],
        ['blockchain', 'traceability', 'drone', 'freight forwarding', 'automated guided vehicle'],
    ],
    'Industry 4.0': [
        ['healthcare', 'display'],
        ['augmented reality', 'virtual reality', 'manufacturing', 'low code ai', 'no code ai'],
    ],
}

# One [exclude, include] pair per entry of sector_list, in the same order;
# a sector without an override gets two empty lists.
exclude_include_data = [keyword_overrides.get(sector, [[], []]) for sector in sector_list]

In [74]:
# Print the keyword list of every sector using its [exclude, include] pair
for sector, (excluded, included) in zip(sector_list, exclude_include_data):
    keyword_calculator(pipeline, sector, excluded, included)

Sector name: Energy Storage & Management
Item count: 15
['flywheel', 'batteries', 'energy optimisation', 'sensor', 'grid management', 'circulor economy/recycling', 'hydrogen', 'energy generation', 'energy trading', 'cooling', 'concentrated solar power', 'fuel cell', 'battery', 'grid', 'electric vehicle charging']
----------------------------------

Sector name: Mobility & Automotive
Item count: 28
['autonomous mobility', 'automotive supplier', 'ev charging', 'car-sharing', 'fleet management', 'travel', 'space tech', 'hyperloop', 'micromobility charging', 'scooter operator/device', 'ai optimisation ', 'computer vision', 'cycling camera', 'e-bikes', 'lidar', 'vehicle mfg', 'used cars', 'mobility platform', 'insurance', 'energy harvesting', 'engine', 'maas', 'mobility', 'electric vehicle', 'charging', 'micromobility', 'bike', 'mobility as a service']
----------------------------------

Sector name: Supply Chain & Logistics
Item count: 16
['waste management', 'delivery', 'drone/flight tech