In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

from interfaces.db import DB

from investment.concept_configuration import normalise

In [3]:
db = DB()

## Fetch data from database

In [4]:
# Get EPFLStartups
table_name = 'graph_piper.Nodes_N_EPFLStartup'
fields = ['EPFLStartupID', 'StartupName', 'Industry']
conditions = {'Status': 'Private'}
startups = pd.DataFrame(db.find(table_name, fields=fields, conditions=conditions), columns=fields)

In [5]:
# Get startups' founders
table_name = 'graph_piper.Edges_N_EPFLStartup_N_Person_T_Founder'
fields = ['EPFLStartupID', 'SCIPER']
startups_founders = pd.DataFrame(db.find(table_name, fields=fields), columns=fields)
founder_ids = list(startups_founders['SCIPER'].drop_duplicates())

In [6]:
# Get startups' professors
table_name = 'graph_piper.Edges_N_EPFLStartup_N_Person_T_Professor'
fields = ['EPFLStartupID', 'SCIPER']
startups_professors = pd.DataFrame(db.find(table_name, fields=fields), columns=fields)
professor_ids = list(startups_professors['SCIPER'].drop_duplicates())

In [7]:
# Get founders' concepts
table_name = 'graph_piper.Edges_N_Person_N_Concept_T_Research'
fields = ['SCIPER', 'PageID', 'Score']
conditions = {'SCIPER': founder_ids}
founders_concepts = pd.DataFrame(db.find(table_name, fields=fields, conditions=conditions), columns=fields)
founders_concept_ids = list(founders_concepts['PageID'].drop_duplicates())

In [8]:
# Get professors' concepts
table_name = 'graph_piper.Edges_N_Person_N_Concept_T_Research'
fields = ['SCIPER', 'PageID', 'Score']
conditions = {'SCIPER': professor_ids}
professors_concepts = pd.DataFrame(db.find(table_name, fields=fields, conditions=conditions), columns=fields)
professors_concept_ids = list(professors_concepts['PageID'].drop_duplicates())

In [9]:
# Get concepts
table_name = 'graph_piper.Nodes_N_Concept_T_Title'
fields = ['PageID', 'PageTitle']
conditions = {'PageID': founders_concept_ids + professors_concept_ids}
concepts = pd.DataFrame(db.find(table_name, fields=fields, conditions=conditions), columns=fields)

## Merge tables to obtain EPFLStartup-Concept edges

In [10]:
startups

Unnamed: 0,EPFLStartupID,StartupName,Industry
0,es-abionic,Abionic,Medtech
1,es-aeds,AEDS,Mechanical
2,es-aeler-technologies,Aeler Technologies,Mechanical
3,es-aesyra-aesybyte,Aesyra,Medtech
4,es-aica,Aica,Mechanical
...,...,...,...
252,es-xsensio-sarl,Xsensio Sarl,Micro-nanotech
253,es-xtenso,Xtenso,ICT
254,es-zace,Zace,ICT
255,es-zaphiro-technologies,Zaphiro Technologies,Electrical-electronics


In [11]:
startups_founders = pd.merge(startups, startups_founders, how='inner', on='EPFLStartupID')
startups_founders_concepts = pd.merge(startups_founders, founders_concepts, how='inner', on='SCIPER')
startups_founders_concepts = startups_founders_concepts.groupby(by=['EPFLStartupID', 'PageID']).aggregate({'Score': 'sum'}).reset_index()
startups_founders_concepts = pd.merge(startups, startups_founders_concepts, how='inner', on='EPFLStartupID')

In [12]:
startups_professors = pd.merge(startups, startups_professors, how='inner', on='EPFLStartupID')
startups_professors_concepts = pd.merge(startups_professors, professors_concepts, how='inner', on='SCIPER')
startups_professors_concepts = startups_professors_concepts.groupby(by=['EPFLStartupID', 'PageID']).aggregate({'Score': 'sum'}).reset_index()
startups_professors_concepts = pd.merge(startups, startups_professors_concepts, how='inner', on='EPFLStartupID')

In [13]:
# We give priority to the founders' concepts and use the professors' concepts as fallback
startup_with_founder_ids = list(startups_founders_concepts['EPFLStartupID'].drop_duplicates())
startups_professors_concepts = startups_professors_concepts[~startups_professors_concepts['EPFLStartupID'].isin(startup_with_founder_ids)]

In [14]:
startups_concepts = pd.concat([startups_founders_concepts, startups_professors_concepts]).reset_index(drop=True)

In [15]:
startups_concepts = normalise(startups_concepts)

In [16]:
startup_ids = list(startups_concepts['EPFLStartupID'].drop_duplicates())
startups = startups[startups['EPFLStartupID'].isin(startup_ids)].reset_index(drop=True)

In [17]:
concept_ids = list(startups_concepts['PageID'].drop_duplicates())
concepts = concepts[concepts['PageID'].isin(concept_ids)].reset_index(drop=True)

In [18]:
nodes = pd.concat([
    startups.rename(columns={'EPFLStartupID': 'ID', 'StartupName': 'Name'})[['ID', 'Name']],
    concepts.rename(columns={'PageID': 'ID', 'PageTitle': 'Name'})
]).reset_index(drop=True)

In [19]:
edges = startups_concepts.rename(columns={'EPFLStartupID': 'SourceID', 'StartupName': 'Name', 'PageID': 'TargetID', 'Score': 'Weight'})[['SourceID', 'TargetID', 'Weight']]

In [20]:
import networkx as nx

In [21]:
node_attrs = {node['ID']: {'name': node['Name']} for node in nodes.to_dict(orient='records')}

In [22]:
edges = [(edge['SourceID'], edge['TargetID'], {'weight': edge['Weight']}) for edge in edges.to_dict(orient='records')]

In [23]:
G = nx.Graph()

G.add_nodes_from(node_attrs.keys())
nx.set_node_attributes(G, node_attrs)
G.add_edges_from(edges)

In [24]:
G

<networkx.classes.graph.Graph at 0x10ff66dc0>

In [25]:
len(nodes), len(node_attrs), len(node_attrs.keys()), len(G.nodes)

(7093, 7093, 7093, 7115)

In [12]:
nx.write_gexf(G, "test.gexf")