# Startup Ecosystem Analysis: Insights for Dutch Policy

**Techleap Senior Data Analyst Case Study**

This notebook analyzes the Crunchbase startup investments dataset to derive actionable insights for the Ministry of Economic Affairs regarding the Dutch startup ecosystem.

---

## Table of Contents
1. Data Loading & Cleaning
2. Global Overview
3. Dutch Ecosystem Analysis with Global Benchmarking
4. Funding Round Progression Analysis ("Valley of Death")
5. Survival Patterns & Outcomes Analysis
6. Visualizations for Report
7. Key Findings Summary

In [None]:
# Import libraries
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.ticker import FuncFormatter
# Silence library warnings so the rendered notebook stays readable.
warnings.filterwarnings('ignore')

# Set style for professional visualizations.
# Matplotlib 3.6 renamed its bundled seaborn style sheets with a
# 'seaborn-v0_8-' prefix; older releases only know the un-prefixed name.
# Use whichever name this installation provides, and fall back to seaborn's
# own whitegrid theme if neither is registered.
_whitegrid_candidates = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
_whitegrid_style = next(
    (name for name in _whitegrid_candidates if name in plt.style.available),
    None,
)
if _whitegrid_style is not None:
    plt.style.use(_whitegrid_style)
else:
    sns.set_style('whitegrid')

sns.set_palette('husl')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 11,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
})

# Formatting helper functions
def format_billions(x, pos):
    """Format a numeric value as US dollars in billions, one decimal place.

    Args:
        x: The value to format (in dollars).
        pos: Unused; present so the function has the two-argument
            ``(value, position)`` shape of a tick-formatter callback.

    Returns:
        A string such as ``'$2.5B'``.
    """
    billions = x / 1e9
    return '${:.1f}B'.format(billions)

def format_millions(x, pos):
    """Format a numeric value as US dollars in millions, one decimal place.

    Args:
        x: The value to format (in dollars).
        pos: Unused; present so the function has the two-argument
            ``(value, position)`` shape of a tick-formatter callback.

    Returns:
        A string such as ``'$3.4M'``.
    """
    millions = x / 1e6
    return '${:.1f}M'.format(millions)

def format_thousands(x, pos):
    """Format a numeric value in thousands, one decimal place, no currency sign.

    Args:
        x: The value to format.
        pos: Unused; present so the function has the two-argument
            ``(value, position)`` shape of a tick-formatter callback.

    Returns:
        A string such as ``'1.5K'``.
    """
    thousands = x / 1e3
    return '{:.1f}K'.format(thousands)


print("Libraries loaded successfully!")

## 1. Data Loading & Cleaning

In [None]:
# Read the raw investments CSV (Latin-1 encoded) into a single DataFrame.
# low_memory=False makes pandas infer each column's dtype from the whole file.
df = pd.read_csv(
    '../investments_VC.csv',
    encoding='latin-1',
    low_memory=False,
)

# Report the size of what was loaded, then list the raw column names in quotes
# so that stray leading/trailing whitespace in the headers is visible.
row_total = len(df)
column_total = len(df.columns)
print(f"Dataset loaded: {row_total:,} companies")
print(f"Columns: {column_total}")
print("\nColumn names:")
for position, column_name in enumerate(df.columns, start=1):
    print(f"  {position:2}. '{column_name}'")

In [None]:
# Clean the data
# 1. Clean column names (remove whitespace)
df.columns = df.columns.str.strip()

# 2. Drop rows in which every field is missing. Such rows are not companies,
#    but would otherwise be given status 'unknown' below and be counted in
#    every later `len(df)` total and percentage denominator.
rows_before = len(df)
df = df.dropna(how='all').copy()
empty_rows_removed = rows_before - len(df)

# 3. Clean funding_total_usd - remove commas and spaces, then convert to
#    numeric. regex=False keeps the replacements literal on every pandas
#    version. Anything that is still not a number becomes NaN.
funding_text = (
    df['funding_total_usd']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.replace(' ', '', regex=False)
    .str.strip()
)
df['funding_usd'] = pd.to_numeric(funding_text, errors='coerce')

# 4. Clean market column
df['market'] = df['market'].str.strip()

# 5. Convert date columns (unparseable or out-of-range dates become NaT)
df['first_funding_date'] = pd.to_datetime(df['first_funding_at'], errors='coerce')
df['last_funding_date'] = pd.to_datetime(df['last_funding_at'], errors='coerce')
df['founded_date'] = pd.to_datetime(df['founded_at'], errors='coerce')

# 6. Extract years
df['first_funding_year'] = df['first_funding_date'].dt.year
df['last_funding_year'] = df['last_funding_date'].dt.year

# 7. Fill missing status with 'unknown'
df['status'] = df['status'].fillna('unknown')

print("Data cleaning completed!")
print(f"Fully empty rows removed: {empty_rows_removed:,}")
print(f"\nCompanies with funding data: {df['funding_usd'].notna().sum():,}")
print(f"Companies with founded year: {df['founded_year'].notna().sum():,}")
print(f"Companies with first funding date: {df['first_funding_date'].notna().sum():,}")

In [None]:
# Sanity-check the cleaned frame: row count, status and country breakdowns,
# and headline statistics of the numeric funding column.
funding = df['funding_usd']

print("=== DATA QUALITY SUMMARY ===")
print(f"\nTotal companies: {len(df):,}")

print("\nStatus distribution:")
print(df['status'].value_counts())

print("\nTop 10 countries:")
print(df['country_code'].value_counts().head(10))

print("\nFunding statistics:")
funding_stats = (
    ('Total funding', funding.sum()),
    ('Mean', funding.mean()),
    ('Median', funding.median()),
    ('Max', funding.max()),
)
for stat_label, stat_value in funding_stats:
    print(f"  {stat_label}: ${stat_value:,.0f}")

## 2. Global Overview

In [None]:
# Headline figures for the whole dataset.
total_funding_bn = df['funding_usd'].sum() / 1e9
n_countries = df['country_code'].nunique()
n_markets = df['market'].nunique()
earliest_funding = df['first_funding_year'].min()
latest_funding = df['last_funding_year'].max()
# mode() can return several values on a tie; the first one is reported.
peak_founding_year = df['founded_year'].mode().iloc[0]

print("=== GLOBAL ECOSYSTEM OVERVIEW ===")
print(f"\nTotal companies: {len(df):,}")
print(f"Total funding: ${total_funding_bn:.1f}B")
print(f"Countries represented: {n_countries}")
print(f"Market categories: {n_markets}")
print("\nTime span:")
print(f"  Earliest funding: {earliest_funding:.0f}")
print(f"  Latest funding: {latest_funding:.0f}")
print(f"  Peak founding year: {peak_founding_year:.0f}")

In [None]:
# Funding by country (top 15)
# One row per country code:
#   company_count - number of rows with a non-null name
#   funding_usd   - summed disclosed funding
#   avg_rounds    - mean number of funding rounds
#   avg_funding   - mean disclosed funding per company
# avg_funding is a NaN-skipping mean, so companies with undisclosed funding
# are left out of the average instead of being counted as $0. This keeps the
# figure consistent with the `.mean()`-based averages used for the Dutch,
# global and peer-chart numbers elsewhere in this notebook.
country_funding = (
    df.groupby('country_code')
    .agg(
        company_count=('name', 'count'),
        funding_usd=('funding_usd', 'sum'),
        avg_rounds=('funding_rounds', 'mean'),
        avg_funding=('funding_usd', 'mean'),
    )
    .sort_values('funding_usd', ascending=False)
)

print("=== TOP 15 COUNTRIES BY TOTAL FUNDING ===")
top15 = country_funding.head(15).copy()
# Totals switch from millions to billions at $1B for readability.
top15['funding_usd_display'] = top15['funding_usd'].apply(
    lambda x: f'${x/1e9:.2f}B' if x >= 1e9 else f'${x/1e6:.0f}M'
)
# A country with no disclosed funding at all has no average to show.
top15['avg_funding_display'] = top15['avg_funding'].apply(
    lambda x: f'${x/1e6:.1f}M' if pd.notna(x) else 'n/a'
)
print(top15[['company_count', 'funding_usd_display', 'avg_funding_display', 'avg_rounds']].to_string())

## 3. Dutch Ecosystem Analysis with Global Benchmarking

In [None]:
# Subset of companies whose country code is the Netherlands ('NLD').
# .copy() gives an independent frame rather than a view of df.
dutch = df.loc[df['country_code'].eq('NLD')].copy()

dutch_funding = dutch['funding_usd']
dutch_operating_share = dutch['status'].eq('operating').mean() * 100

print("=== DUTCH STARTUP ECOSYSTEM ===")
print(f"\nTotal Dutch companies: {len(dutch):,}")
print(f"Total Dutch funding: ${dutch_funding.sum()/1e9:.2f}B")
print(f"Average funding per company: ${dutch_funding.mean()/1e6:.1f}M")
print(f"Median funding: ${dutch_funding.median()/1e6:.1f}M")
print("\nStatus distribution:")
print(dutch['status'].value_counts())
print(f"\nOperating rate: {dutch_operating_share:.1f}%")

In [None]:
# Compare the Netherlands with a fixed set of peer countries, using the
# per-country aggregates already held in `country_funding`.
benchmark_countries = ['USA', 'GBR', 'DEU', 'NLD', 'ISR', 'FRA', 'CHN', 'CAN']
is_peer = country_funding.index.isin(benchmark_countries)
benchmark = country_funding.loc[is_peer].copy()
benchmark = benchmark.sort_values('funding_usd', ascending=False)

# Dataset-wide reference values printed above the table.
is_operating = df['status'] == 'operating'
global_avg_funding = df['funding_usd'].mean()
global_avg_rounds = df['funding_rounds'].mean()
global_operating_rate = is_operating.mean() * 100

# Share of each peer country's companies whose status is 'operating'.
# Countries with no rows in df are skipped.
for country in benchmark_countries:
    in_country = df['country_code'] == country
    if in_country.any():
        benchmark.loc[country, 'operating_rate'] = is_operating[in_country].mean() * 100

print("=== BENCHMARK: NETHERLANDS vs PEERS ===")
print("\nGlobal averages:")
print(f"  Average funding: ${global_avg_funding/1e6:.1f}M")
print(f"  Average rounds: {global_avg_rounds:.2f}")
print(f"  Operating rate: {global_operating_rate:.1f}%")
print(f"\n{'Country':<8} {'Companies':>10} {'Total Funding':>15} {'Avg Funding':>12} {'Avg Rounds':>12} {'Op. Rate':>10}")
print("-" * 70)
for idx, row in benchmark.iterrows():
    # Totals of $1B or more are shown in billions, smaller ones in millions.
    if row['funding_usd'] >= 1e9:
        funding_str = f"${row['funding_usd']/1e9:.1f}B"
    else:
        funding_str = f"${row['funding_usd']/1e6:.0f}M"
    avg_str = f"${row['avg_funding']/1e6:.1f}M"
    print(f"{idx:<8} {int(row['company_count']):>10,} {funding_str:>15} {avg_str:>12} {row['avg_rounds']:>12.2f} {row['operating_rate']:>9.1f}%")

In [None]:
# Per-market company counts and summed funding for the Dutch subset,
# ordered by number of companies.
dutch_markets = dutch.groupby('market').agg(
    company_count=('name', 'count'),
    funding_usd=('funding_usd', 'sum'),
)
dutch_markets['avg_funding'] = dutch_markets['funding_usd'] / dutch_markets['company_count']
dutch_markets = dutch_markets.sort_values('company_count', ascending=False)

print("=== TOP 10 DUTCH MARKETS ===")
print(f"\n{'Market':<30} {'Companies':>10} {'Total Funding':>15} {'Avg Funding':>15}")
print("-" * 75)
for idx, row in dutch_markets.head(10).iterrows():
    # Markets whose summed funding is zero get placeholder text instead of "$0.0M".
    if row['funding_usd'] > 0:
        funding_str = f"${row['funding_usd']/1e6:.1f}M"
    else:
        funding_str = "$0"
    if row['avg_funding'] > 0:
        avg_str = f"${row['avg_funding']/1e6:.1f}M"
    else:
        avg_str = "N/A"
    print(f"{idx:<30} {int(row['company_count']):>10} {funding_str:>15} {avg_str:>15}")

## 4. Funding Round Progression Analysis ("Valley of Death")

In [None]:
# How many companies have a positive amount recorded in each round column,
# and how that count shrinks from one listed round to the next.
round_columns = ['seed', 'round_A', 'round_B', 'round_C', 'round_D', 'round_E', 'round_F', 'round_G', 'round_H']

round_counts = {col: (df[col] > 0).sum() for col in round_columns}

print("=== FUNDING ROUND PROGRESSION (GLOBAL) ===")
print(f"\n{'Round Type':<15} {'Companies':>12} {'% of Total':>12} {'Drop-off':>12}")
print("-" * 55)

# Drop-off is measured against the previous *printed* row; the first row is
# measured against the whole dataset. Rounds with zero companies are skipped
# and do not reset the baseline.
prev_count = len(df)
for col, count in round_counts.items():
    pct = count / len(df) * 100
    if prev_count > 0:
        dropoff = (prev_count - count) / prev_count * 100
    else:
        dropoff = 0
    if count <= 0:
        continue
    print(f"{col:<15} {count:>12,} {pct:>11.1f}% {dropoff:>11.1f}%")
    prev_count = count

In [None]:
# Funding round distribution
# value_counts() ignores missing values, so the denominator must also be the
# number of companies with a known round count. Dividing by len(df) would
# understate every share and stop the cumulative column short of 100%
# whenever funding_rounds has gaps.
rounds_dist = df['funding_rounds'].value_counts().sort_index()
companies_with_rounds = int(rounds_dist.sum())

print("=== FUNDING ROUNDS DISTRIBUTION ===")
print(f"\n{'Rounds':>8} {'Companies':>12} {'Percentage':>12} {'Cumulative':>12}")
print("-" * 50)
cumulative = 0
# Only the ten smallest round counts are listed; the tail is very thin.
for rounds, count in rounds_dist.head(10).items():
    pct = count / companies_with_rounds * 100
    cumulative += pct
    print(f"{int(rounds):>8} {count:>12,} {pct:>11.1f}% {cumulative:>11.1f}%")

if companies_with_rounds > 0:
    single_round_pct = rounds_dist.get(1, 0) / companies_with_rounds * 100
else:
    single_round_pct = 0
print(f"\nKEY INSIGHT: {single_round_pct:.1f}% of startups never progress beyond their first funding round.")

In [None]:
# Compare Dutch vs Global round progression
# Shares are taken over companies with a known number of rounds in each
# group. Using raw row counts as denominators would bias the comparison
# whenever the two groups have different proportions of missing values.
dutch_rounds = dutch['funding_rounds'].value_counts().sort_index()
global_rounds = df['funding_rounds'].value_counts().sort_index()
dutch_known = dutch_rounds.sum()
global_known = global_rounds.sum()

print("=== DUTCH vs GLOBAL ROUND PROGRESSION ===")
print(f"\n{'Rounds':>8} {'Dutch %':>12} {'Global %':>12} {'Difference':>12}")
print("-" * 50)
for rounds in range(1, 6):
    dutch_pct = (dutch_rounds.get(rounds, 0) / dutch_known * 100) if dutch_known > 0 else 0
    global_pct = (global_rounds.get(rounds, 0) / global_known * 100) if global_known > 0 else 0
    # Difference is in percentage points (Dutch share minus global share).
    diff = dutch_pct - global_pct
    print(f"{rounds:>8} {dutch_pct:>11.1f}% {global_pct:>11.1f}% {diff:>+11.1f}%")

## 5. Survival Patterns & Outcomes Analysis

In [None]:
# Analyze outcomes by funding rounds
# Rows: number of funding rounds; columns: status; cells: company counts.
outcome_by_rounds = df.groupby(['funding_rounds', 'status']).size().unstack(fill_value=0)

# Row-wise percentages (each row sums to 100 across all statuses present).
outcome_pct = outcome_by_rounds.div(outcome_by_rounds.sum(axis=1), axis=0) * 100

print("=== OUTCOMES BY FUNDING ROUNDS ===")
print(f"\n{'Rounds':>8} {'Operating':>12} {'Acquired':>12} {'Closed':>12}")
print("-" * 50)
for rounds in range(1, 8):
    if rounds in outcome_pct.index:
        row = outcome_pct.loc[rounds]
        op = row.get('operating', 0)
        acq = row.get('acquired', 0)
        cl = row.get('closed', 0)
        print(f"{rounds:>8} {op:>11.1f}% {acq:>11.1f}% {cl:>11.1f}%")

# Headline comparison. The 5+ figure pools all companies with five or more
# rounds (acquired / total). Averaging the per-bucket percentages would let a
# bucket holding a single company weigh as much as one holding thousands.
totals_by_rounds = outcome_by_rounds.sum(axis=1)
if 'acquired' in outcome_by_rounds.columns:
    acquired_by_rounds = outcome_by_rounds['acquired']
else:
    acquired_by_rounds = totals_by_rounds * 0

one_round_total = totals_by_rounds.get(1, 0)
acq_one_round = acquired_by_rounds.get(1, 0) / one_round_total * 100 if one_round_total > 0 else 0

five_plus_total = totals_by_rounds.loc[5:].sum()
acq_five_plus = acquired_by_rounds.loc[5:].sum() / five_plus_total * 100 if five_plus_total > 0 else 0

print(f"\nKEY INSIGHT: Acquisition rate increases with more funding rounds.")
print(f"Companies with 1 round: {acq_one_round:.1f}% acquired")
print(f"Companies with 5+ rounds: {acq_five_plus:.1f}% acquired (pooled)")

In [None]:
# Funding profile of each status group: company count, summed / mean / median
# funding, and mean number of rounds.
outcome_funding = df.groupby('status').agg(
    company_count=('name', 'count'),
    total_funding=('funding_usd', 'sum'),
    avg_funding=('funding_usd', 'mean'),
    median_funding=('funding_usd', 'median'),
    avg_rounds=('funding_rounds', 'mean'),
)

print("=== FUNDING BY OUTCOME ===")
print(f"\n{'Status':<12} {'Companies':>10} {'Avg Funding':>15} {'Median':>12} {'Avg Rounds':>12}")
print("-" * 65)
# Only the three statuses of interest are shown, in this fixed order.
for status in ('operating', 'acquired', 'closed'):
    if status not in outcome_funding.index:
        continue
    row = outcome_funding.loc[status]
    print(f"{status:<12} {int(row['company_count']):>10,} ${row['avg_funding']/1e6:>13.1f}M ${row['median_funding']/1e6:>10.1f}M {row['avg_rounds']:>11.2f}")

In [None]:
# Companies and summed funding grouped by the year of their first funding.
yearly_funding = df.groupby('first_funding_year').agg(
    company_count=('name', 'count'),
    funding_usd=('funding_usd', 'sum'),
)

# Keep only the 2005-2014 window (both ends inclusive).
in_window = (yearly_funding.index >= 2005) & (yearly_funding.index <= 2014)
yearly_funding = yearly_funding[in_window]

print("=== FUNDING ACTIVITY BY YEAR ===")
print(f"\n{'Year':>6} {'Companies':>12} {'Total Funding':>18} {'Avg Funding':>15}")
print("-" * 55)
for year, row in yearly_funding.iterrows():
    # Guard against a year with no named companies.
    if row['company_count'] > 0:
        avg = row['funding_usd'] / row['company_count']
    else:
        avg = 0
    print(f"{int(year):>6} {int(row['company_count']):>12,} ${row['funding_usd']/1e9:>16.2f}B ${avg/1e6:>13.1f}M")

## 6. Visualizations for Report

Creating 5 professional visualizations for the Ministry briefing.

In [None]:
# Set up professional color palette
techleap_colors = {
    'primary': '#1a365d',      # Dark blue
    'secondary': '#3182ce',    # Medium blue
    'accent': '#63b3ed',       # Light blue
    'success': '#38a169',      # Green
    'warning': '#d69e2e',      # Yellow/Orange
    'danger': '#e53e3e',       # Red
    'neutral': '#718096'       # Gray
}

# Define figure export settings.
# FIGURE_DIR keeps its trailing slash because the plotting cells build file
# paths by plain string concatenation (f'{FIGURE_DIR}01_...png').
FIGURE_DIR = 'figures/'
DPI = 300

# savefig() does not create missing directories, so make sure the export
# folder exists before any figure is written. exist_ok makes re-runs safe.
os.makedirs(FIGURE_DIR, exist_ok=True)

print("Visualization setup complete!")

In [None]:
# VISUALIZATION 1: Funding Round Progression Funnel
fig, ax = plt.subplots(figsize=(12, 7))

# Funnel stages from top to bottom: the whole dataset first, then one stage
# per round column. A company counts towards a stage when the amount in that
# column is positive.
stage_columns = {
    'Seed': 'seed',
    'Series A': 'round_A',
    'Series B': 'round_B',
    'Series C': 'round_C',
    'Series D': 'round_D',
}
stages = ['Total\nCompanies'] + list(stage_columns)
counts = [len(df)] + [(df[column] > 0).sum() for column in stage_columns.values()]

# One palette colour per stage, in stage order.
palette_keys = ('primary', 'secondary', 'accent', 'success', 'warning', 'danger')
colors = [techleap_colors[key] for key in palette_keys]

# Horizontal bars are drawn from the bottom up, so everything is reversed to
# put the total on top and give the chart its funnel shape.
stages_bottom_up = list(reversed(stages))
counts_bottom_up = list(reversed(counts))
colors_bottom_up = list(reversed(colors))
bars = ax.barh(stages_bottom_up, counts_bottom_up, color=colors_bottom_up, height=0.6)

# Label each bar with its count and its share of the whole dataset.
total_companies = counts[0]
for bar, count in zip(bars, counts_bottom_up):
    share = count / total_companies * 100
    label_x = bar.get_width() + 500
    label_y = bar.get_y() + bar.get_height() / 2
    ax.text(label_x, label_y, f'{count:,} ({share:.1f}%)',
            va='center', fontsize=11, fontweight='bold')

ax.set_xlabel('Number of Companies', fontsize=12)
ax.set_title('The Startup Funding Funnel: Most Companies Never Reach Series A',
             fontsize=14, fontweight='bold', pad=20)
# Extra room on the right so the value labels are not cut off.
ax.set_xlim(0, max(counts) * 1.25)

# Remove the top and right frame lines.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

plt.tight_layout()
plt.savefig(f'{FIGURE_DIR}01_funding_funnel.png', dpi=DPI, bbox_inches='tight', facecolor='white')
plt.show()
print("Figure 1 saved: 01_funding_funnel.png")

In [None]:
# VISUALIZATION 2: Company Outcomes by Funding Rounds
fig, ax = plt.subplots(figsize=(12, 7))

# Prepare data: status mix (row-wise %) for companies with 1-5 funding rounds.
rounds_to_plot = [1, 2, 3, 4, 5]
outcome_data = (
    df[df['funding_rounds'].isin(rounds_to_plot)]
    .groupby(['funding_rounds', 'status'])
    .size()
    .unstack(fill_value=0)
)
outcome_data_pct = outcome_data.div(outcome_data.sum(axis=1), axis=0) * 100

# Keep only the three statuses of interest, in a fixed stacking order.
# Percentages were computed over ALL statuses, so the stacked bars can fall
# short of 100% when other statuses (e.g. 'unknown') are present.
status_order = ['operating', 'acquired', 'closed']
status_colors = {
    'operating': techleap_colors['secondary'],
    'acquired': techleap_colors['success'],
    'closed': techleap_colors['danger'],
}
statuses_present = [s for s in status_order if s in outcome_data_pct.columns]
outcome_data_pct = outcome_data_pct[statuses_present]

# Colours, legend labels and tick labels are all derived from what is actually
# in the frame, so a missing status or round bucket cannot shift them onto
# the wrong bar.
colors_status = [status_colors[s] for s in statuses_present]
outcome_data_pct.plot(kind='bar', stacked=True, ax=ax, color=colors_status, width=0.7)

ax.set_xlabel('Number of Funding Rounds', fontsize=12)
ax.set_ylabel('Percentage of Companies', fontsize=12)
ax.set_title('Company Outcomes Improve with More Funding Rounds',
             fontsize=14, fontweight='bold', pad=20)
rounds_present = list(outcome_data_pct.index)
ax.set_xticklabels([f'{int(r)} Round{"s" if r > 1 else ""}' for r in rounds_present], rotation=0)
ax.legend(title='Status', labels=[s.capitalize() for s in statuses_present], loc='upper right')

# Write the acquired share in the middle of each 'acquired' segment, which
# sits directly on top of the 'operating' segment.
if 'acquired' in statuses_present:
    for i, rounds in enumerate(rounds_present):
        acq_pct = outcome_data_pct.loc[rounds, 'acquired']
        op_pct = outcome_data_pct.loc[rounds, 'operating'] if 'operating' in statuses_present else 0
        ax.annotate(f'{acq_pct:.1f}%', xy=(i, op_pct + acq_pct/2), ha='center', va='center',
                    fontsize=10, color='white', fontweight='bold')

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylim(0, 105)

plt.tight_layout()
plt.savefig(f'{FIGURE_DIR}02_outcomes_by_rounds.png', dpi=DPI, bbox_inches='tight', facecolor='white')
plt.show()
print("Figure 2 saved: 02_outcomes_by_rounds.png")

In [None]:
# VISUALIZATION 3: Netherlands vs Peer Countries
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Peer set: country code -> display name (insertion order is the plot order).
peer_labels = {
    'USA': 'USA',
    'GBR': 'UK',
    'DEU': 'Germany',
    'NLD': 'Netherlands',
    'ISR': 'Israel',
    'FRA': 'France',
}
peer_countries = list(peer_labels)
country_names = list(peer_labels.values())


# Per-country metrics: mean funding (in $M), share of companies with status
# 'operating' (in %), and mean number of funding rounds.
peer_data = []
for country in peer_countries:
    country_df = df[df['country_code'] == country]
    metrics = {
        'country': country,
        'avg_funding': country_df['funding_usd'].mean() / 1e6,
        'operating_rate': (country_df['status'] == 'operating').mean() * 100,
        'avg_rounds': country_df['funding_rounds'].mean(),
    }
    peer_data.append(metrics)

peer_df = pd.DataFrame(peer_data)
peer_df['country_name'] = country_names

# The Netherlands gets the highlight colour; every other bar uses the primary.
colors_bars = [
    techleap_colors['warning'] if code == 'NLD' else techleap_colors['primary']
    for code in peer_countries
]

# Plot 1: Average funding
funding_ax = axes[0]
bars1 = funding_ax.barh(peer_df['country_name'], peer_df['avg_funding'], color=colors_bars)
funding_ax.set_xlabel('Average Funding ($ Millions)', fontsize=11)
funding_ax.set_title('Average Funding per Company', fontsize=12, fontweight='bold')
for bar, avg_funding_m in zip(bars1, peer_df['avg_funding']):
    funding_ax.text(bar.get_width() + 0.3, bar.get_y() + bar.get_height()/2,
                    f'${avg_funding_m:.1f}M', va='center', fontsize=10)

# Plot 2: Average funding rounds
rounds_ax = axes[1]
bars2 = rounds_ax.barh(peer_df['country_name'], peer_df['avg_rounds'], color=colors_bars)
rounds_ax.set_xlabel('Average Number of Funding Rounds', fontsize=11)
rounds_ax.set_title('Average Funding Rounds per Company', fontsize=12, fontweight='bold')
for bar, avg_round_count in zip(bars2, peer_df['avg_rounds']):
    rounds_ax.text(bar.get_width() + 0.02, bar.get_y() + bar.get_height()/2,
                   f'{avg_round_count:.2f}', va='center', fontsize=10)

# Remove the top and right frame lines on both panels.
for ax in axes:
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

fig.suptitle('Netherlands Benchmarked Against Peer Innovation Ecosystems',
             fontsize=14, fontweight='bold', y=1.02)

plt.tight_layout()
plt.savefig(f'{FIGURE_DIR}03_peer_benchmark.png', dpi=DPI, bbox_inches='tight', facecolor='white')
plt.show()
print("Figure 3 saved: 03_peer_benchmark.png")

In [None]:
# VISUALIZATION 4: Funding Activity Over Time
fig, ax1 = plt.subplots(figsize=(12, 6))

# Filter to meaningful years (2005-2014 inclusive). `yearly_funding` is the
# per-year aggregate built in the time-series cell of the previous section.
yearly = yearly_funding[(yearly_funding.index >= 2005) & (yearly_funding.index <= 2014)].copy()

# Plot company count as bars on the left axis
bars = ax1.bar(yearly.index, yearly['company_count'], color=techleap_colors['secondary'],
               alpha=0.7, label='Companies Funded')
ax1.set_xlabel('Year', fontsize=12)
ax1.set_ylabel('Number of Companies Funded', color=techleap_colors['secondary'], fontsize=12)
ax1.tick_params(axis='y', labelcolor=techleap_colors['secondary'])

# Create second y-axis for total funding (in $ billions)
ax2 = ax1.twinx()
line = ax2.plot(yearly.index, yearly['funding_usd']/1e9, color=techleap_colors['danger'],
                linewidth=3, marker='o', markersize=8, label='Total Funding ($B)')
ax2.set_ylabel('Total Funding ($ Billions)', color=techleap_colors['danger'], fontsize=12)
ax2.tick_params(axis='y', labelcolor=techleap_colors['danger'])

# Add annotations for key events
arrow_style = dict(arrowstyle='->', color=techleap_colors['neutral'])

# The crisis marker is only drawn when 2008 is actually present in the data.
if 2008 in yearly.index:
    crisis_count = yearly.loc[2008, 'company_count']
    ax1.annotate('2008 Financial\nCrisis', xy=(2008, crisis_count),
                 xytext=(2006.5, crisis_count + 1500),
                 arrowprops=arrow_style,
                 fontsize=10, ha='center')

# The peak year is read from the data rather than hard-coded, so the label
# always points at the tallest bar.
if not yearly.empty:
    peak_year = int(yearly['company_count'].idxmax())
    peak_count = yearly.loc[peak_year, 'company_count']
    ax1.annotate(f'Peak Activity\n{peak_year}', xy=(peak_year, peak_count),
                 xytext=(peak_year, peak_count + 1200),
                 arrowprops=arrow_style,
                 fontsize=10, ha='center')

ax1.set_title('Startup Funding Activity 2005-2014: Growth Despite Economic Cycles',
              fontsize=14, fontweight='bold', pad=20)

# Combined legend (bar and line handles live on different axes)
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper left')

ax1.spines['top'].set_visible(False)
ax2.spines['top'].set_visible(False)

plt.tight_layout()
plt.savefig(f'{FIGURE_DIR}04_funding_timeline.png', dpi=DPI, bbox_inches='tight', facecolor='white')
plt.show()
print("Figure 4 saved: 04_funding_timeline.png")

In [None]:
# VISUALIZATION 5: Top Sectors - Company Count vs Funding
fig, ax = plt.subplots(figsize=(12, 8))

# Per-market company count and summed funding; keep the 12 markets with the
# most companies.
sector_data = df.groupby('market').agg(
    company_count=('name', 'count'),
    funding_usd=('funding_usd', 'sum'),
)
sector_data['avg_funding'] = sector_data['funding_usd'] / sector_data['company_count']
top_sectors = sector_data.nlargest(12, 'company_count')

# Bubble chart: x = companies, y = total funding ($B), bubble area scaled
# from average funding, colour = rank position in top_sectors.
bubble_x = top_sectors['company_count']
bubble_y = top_sectors['funding_usd'] / 1e9
bubble_sizes = top_sectors['avg_funding'] / 1e5
scatter = ax.scatter(bubble_x,
                     bubble_y,
                     s=bubble_sizes,
                     c=range(len(top_sectors)),
                     cmap='viridis',
                     alpha=0.7)

# Label every bubble with its market name, truncated to 20 characters.
for market_name, sector in top_sectors.iterrows():
    if len(market_name) > 20:
        label = market_name[:20] + '...'
    else:
        label = market_name
    ax.annotate(label,
                xy=(sector['company_count'], sector['funding_usd']/1e9),
                xytext=(5, 5), textcoords='offset points',
                fontsize=9)

ax.set_xlabel('Number of Companies', fontsize=12)
ax.set_ylabel('Total Funding ($ Billions)', fontsize=12)
ax.set_title('Sector Analysis: Biotechnology Dominates Funding Despite Fewer Companies',
             fontsize=14, fontweight='bold', pad=20)

# Explain the bubble size in the top-left corner (axes coordinates).
ax.text(0.02, 0.98, 'Bubble size = Average funding per company',
        transform=ax.transAxes, fontsize=10, va='top',
        style='italic', color=techleap_colors['neutral'])

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

plt.tight_layout()
plt.savefig(f'{FIGURE_DIR}05_sector_analysis.png', dpi=DPI, bbox_inches='tight', facecolor='white')
plt.show()
print("Figure 5 saved: 05_sector_analysis.png")

## 7. Key Findings Summary

In [None]:
# Calculate key statistics for the summary.
# Relies on `df`, `dutch` and `outcome_pct` from earlier cells.
print("=" * 70)
print("KEY FINDINGS FOR MINISTRY BRIEFING")
print("=" * 70)

# 1. The Valley of Death
single_round_pct = (df['funding_rounds'] == 1).sum() / len(df) * 100

# Seed -> Series A conversion: of the companies that raised a seed round, the
# share that ALSO raised a Series A. Dividing all Series A companies by all
# seed companies would count companies that skipped seed entirely and so
# overstate the conversion rate.
has_seed = df['seed'] > 0
has_round_a = df['round_A'] > 0
seed_funded = has_seed.sum()
seed_to_A = (has_seed & has_round_a).sum() / seed_funded * 100 if seed_funded > 0 else 0

print(f"\n1. THE VALLEY OF DEATH")
print(f"   - {single_round_pct:.1f}% of startups never progress beyond their first funding round")
print(f"   - Only {seed_to_A:.1f}% of seed-funded companies reach Series A")
print(f"   - This represents significant capital at risk and potential innovation lost")

# 2. Outcomes improve with rounds
# Guard both the row (round count) and the column ('acquired') lookups.
if 'acquired' in outcome_pct.columns:
    acquired_share = outcome_pct['acquired']
    acq_1round = acquired_share.get(1, 0)
    acq_4round = acquired_share.get(4, 0)
else:
    acq_1round = 0
    acq_4round = 0

print(f"\n2. MORE FUNDING = BETTER OUTCOMES")
print(f"   - Companies with 1 round: {acq_1round:.1f}% acquired, rest mostly still operating or closed")
print(f"   - Companies with 4 rounds: {acq_4round:.1f}% acquired")
print(f"   - Successful exits require sustained investment over multiple rounds")

# 3. Netherlands benchmark
# Every number and comparison below is computed from the data, so the text
# cannot drift out of sync with the dataset.
nld_count = len(dutch)
nld_avg = dutch['funding_usd'].mean()
global_avg = df['funding_usd'].mean()
nld_op_rate = (dutch['status'] == 'operating').mean() * 100
global_op_rate = (df['status'] == 'operating').mean() * 100
if nld_op_rate > global_op_rate:
    op_rate_position = 'above'
elif nld_op_rate < global_op_rate:
    op_rate_position = 'below'
else:
    op_rate_position = 'equal to'

print(f"\n3. NETHERLANDS POSITION")
print(f"   - {nld_count:,} companies with ${dutch['funding_usd'].sum()/1e9:.1f}B total funding")
print(f"   - Average funding ${nld_avg/1e6:.1f}M vs global ${global_avg/1e6:.1f}M")
print(f"   - {nld_op_rate:.1f}% operating rate ({op_rate_position} the global average of {global_op_rate:.1f}%)")
# NOTE(review): the sector statement below is narrative text, not computed in
# this cell - confirm it against the Dutch market table before publishing.
print(f"   - Strong in Biotech, Software, and E-Commerce sectors")

# 4. Market cycles
# NOTE(review): narrative claims; confirm against the yearly funding table.
print(f"\n4. MARKET RESILIENCE")
print(f"   - 2008 financial crisis caused a temporary dip in activity")
print(f"   - Strong recovery with peak activity in 2012-2013")
print(f"   - Startup ecosystems show resilience to economic shocks")

print(f"\n" + "=" * 70)
print("POLICY RECOMMENDATIONS")
print("=" * 70)
print("""
1. BRIDGE THE VALLEY: Create bridge funding programs for post-seed companies
   to improve Series A conversion rates.

2. SECTOR FOCUS: Continue support for Biotech and Software where Dutch 
   startups show competitive positioning.

3. FOLLOW-ON CAPITAL: Encourage existing investors to provide follow-on 
   funding rather than one-time investments.

4. EXIT PREPARATION: Support programs that prepare companies for acquisition
   or IPO, as successful exits attract more investment.
""")

In [None]:
# Closing cell: print the caveats of this analysis and the suggested follow-up
# work. Pure text output; nothing is computed here.
divider = "=" * 70
limitations_text = """
DATA GAPS:
- Investor identities: Who are the investors? What's the concentration?
- Time between rounds: How long do companies wait between funding?
- Revenue and employee growth: Funding alone doesn't measure success
- Exit valuations: Were acquisitions successful? At what multiples?
- Post-2014 data: This dataset ends in 2014

EXTERNAL FACTORS NOT CAPTURED:
- Regulatory environment changes
- Interest rate effects on risk appetite
- Industry-specific trends and disruptions
- Founder backgrounds and serial entrepreneur status

RECOMMENDED NEXT STEPS:
1. Integrate Dealroom data for more recent and complete Dutch coverage
2. Add KvK (Chamber of Commerce) data for company health metrics
3. Include patent and publication data for innovation metrics
4. Interview key investors and founders for qualitative insights
"""

print(divider)
print("WHAT'S MISSING: LIMITATIONS OF THIS ANALYSIS")
print(divider)
print(limitations_text)
print("\nAnalysis completed successfully!")