# Chart Inventory & Code Gallery

**Goal**: 2 pages, 3 main findings for a senior data analyst role

This notebook contains the code for all charts so you can run and modify them.

In [None]:
# Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# TechLeap brand palette: semantic roles first, then the plain colour names
COLORS = dict(
    primary='#1C1C1C',
    secondary='#888888',
    highlight_bad='#FD5924',   # orange - marks the NL gap
    highlight_good='#02E4FF',  # cyan - marks success
    neutral='#E8E8E8',
    blue='#5547FF',
    orange='#FD5924',
    cyan='#02E4FF',
    purple='#9441E9',
    magenta='#FF7AC9',
)

# Global chart style: sans-serif text, no top/right spines, white figure background
plt.rcParams.update({
    'font.family': 'sans-serif',
    'axes.spines.top': False,
    'axes.spines.right': False,
    'figure.facecolor': 'white',
})

# Every chart cell below saves into figures/; create it up front so
# plt.savefig does not fail on a fresh checkout.
os.makedirs('figures', exist_ok=True)

# Load data
df = pd.read_csv('../data/crunchbase_filtered.csv', low_memory=False)

# Unparseable dates become NaT rather than raising
df['first_funding_at'] = pd.to_datetime(df['first_funding_at'], errors='coerce')
df['founded_at'] = pd.to_datetime(df['founded_at'], errors='coerce')

# Strip commas, spaces and dashes from the funding text, then convert to a
# number; anything still unparseable becomes NaN. regex=False keeps the three
# replacements literal on every pandas version.
df['funding_total_usd'] = pd.to_numeric(
    df['funding_total_usd'].astype(str)
        .str.replace(',', '', regex=False)
        .str.replace(' ', '', regex=False)
        .str.replace('-', '', regex=False),
    errors='coerce'
)

print(f"Loaded {len(df)} companies")
print(f"NL: {len(df[df['country_code']=='NLD'])} | USA: {len(df[df['country_code']=='USA'])} | Israel: {len(df[df['country_code']=='ISR'])}")

---

# TIER 1: YOUR TOP 3 CHARTS (Score 24/25)

Use these for your 2-page report.

## #1 - Graduation Gap (Score 24)
**NL 6.2% vs Israel 21% vs USA 16%** - The killer stat

In [None]:
# CHART 1: GRADUATION GAP - Seed to Series A conversion

def calc_seed_to_a(country_code):
    """Return the Seed -> Series A conversion rate (in %) for one country.

    Reads the notebook-level ``df``. A company counts as seeded when its
    ``funding_rounds`` text contains 'seed', and as graduated when it is
    seeded and also has a Series A marker.

    Args:
        country_code: value matched against ``df['country_code']``.

    Returns:
        Graduated companies as a percentage of seeded companies, or 0 when
        the country has no seeded companies.
    """
    # NOTE(review): other cells in this notebook treat funding_rounds as a
    # numeric count; the .str accessor below needs text values - confirm which
    # column actually holds the round types.
    subset = df[df['country_code'] == country_code]
    rounds = subset['funding_rounds']

    has_seed = rounds.str.contains('seed', case=False, na=False)
    # Match 'a' only as a stand-alone token ('series_a', 'Series A', 'a').
    # The previous pattern 'series_a|a' matched any value containing the
    # letter, e.g. 'angel' or 'grant'.
    has_a = rounds.str.contains(r'(?<![a-z])a(?![a-z])', case=False, na=False)

    seed_count = int(has_seed.sum())
    if seed_count == 0:
        return 0
    # Combine the masks on the same frame instead of indexing the seed-only
    # frame with a mask built on the larger one.
    graduated_count = int((has_seed & has_a).sum())
    return graduated_count / seed_count * 100

countries = ['NLD', 'DEU', 'GBR', 'FRA', 'USA', 'ISR']
labels = ['Netherlands', 'Germany', 'UK', 'France', 'USA', 'Israel']
rates = list(map(calc_seed_to_a, countries))

# Rank the countries from lowest to highest conversion rate
sorted_data = list(zip(labels, rates, countries))
sorted_data.sort(key=lambda entry: entry[1])
labels_sorted, rates_sorted, codes_sorted = zip(*sorted_data)

# Bar colours: orange for NL, cyan for Israel, grey for everyone else
highlight = {'NLD': COLORS['orange'], 'ISR': COLORS['cyan']}
colors = [highlight.get(code, COLORS['secondary']) for code in codes_sorted]

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.barh(labels_sorted, rates_sorted, color=colors)

# Write each rate just past the end of its bar, vertically centred on it
for bar, rate in zip(bars, rates_sorted):
    bar_mid_y = bar.get_y() + bar.get_height()/2
    ax.text(rate + 0.5, bar_mid_y, f'{rate:.1f}%',
            va='center', fontsize=11, fontweight='bold')

ax.set_xlabel('Seed → Series A Conversion Rate (%)', fontsize=12)
ax.set_title(
    'The Graduation Gap\nDutch startups fail to convert Seed to Series A',
    fontsize=14, fontweight='bold', color=COLORS['primary'],
)

# Fixed-text callout in the lower-right corner (axes coordinates);
# the wording is not derived from the computed rates.
ax.text(
    0.95, 0.05, 'NL converts at 1/3 the rate of Israel',
    transform=ax.transAxes, fontsize=10, ha='right', style='italic',
    color=COLORS['orange'],
)

plt.tight_layout()
plt.savefig('figures/finding1_graduation_gap.png', dpi=150, bbox_inches='tight')
plt.show()

## #2 - Rounds Matter (Score 24)
**4+ rounds = 3x higher exit rates (acquisition or IPO)** - Actionable

In [None]:
# CHART 2: ROUNDS MATTER - More rounds = better outcomes

# Boolean outcome flags derived from the status column:
# acquired only, and any exit (acquired or IPO)
df['is_acquired'] = df['status'].eq('acquired')
df['is_exited'] = df['status'].isin(['acquired', 'ipo'])

def outcomes_by_rounds(country_code):
    """Return the exit rate (in %) per funding-round bucket for one country.

    Reads the notebook-level ``df``, which must already carry the boolean
    ``is_exited`` column.

    Args:
        country_code: value matched against ``df['country_code']``.

    Returns:
        A Series indexed by the buckets '1', '2', '3', '4+' (always all four,
        in that order). Buckets with no companies hold NaN.
    """
    subset = df[df['country_code'] == country_code].copy()
    # Bins are right-closed: (0,1], (1,2], (2,3], (3,100]. Missing round
    # counts are filled with 0, which falls outside every bin and is dropped
    # by the groupby.
    subset['round_bucket'] = pd.cut(subset['funding_rounds'].fillna(0).astype(int),
                                     bins=[0, 1, 2, 3, 100], labels=['1', '2', '3', '4+'])
    # observed=False keeps empty buckets in the result, so the caller can
    # always plot exactly four bars regardless of the pandas default.
    return subset.groupby('round_bucket', observed=False)['is_exited'].mean() * 100

nl_outcomes, usa_outcomes, isr_outcomes = (
    outcomes_by_rounds(code) for code in ('NLD', 'USA', 'ISR')
)

x = np.arange(4)
width = 0.25

fig, ax = plt.subplots(figsize=(10, 6))

# One bar series per country, placed side by side within each bucket
bar_specs = [
    (x - width, nl_outcomes, 'Netherlands', COLORS['orange']),
    (x, usa_outcomes, 'USA', COLORS['blue']),
    (x + width, isr_outcomes, 'Israel', COLORS['cyan']),
]
for positions, outcome, country_name, colour in bar_specs:
    ax.bar(positions, outcome.values, width, label=country_name, color=colour)

ax.set_xlabel('Number of Funding Rounds', fontsize=12)
ax.set_ylabel('Exit Rate (%)', fontsize=12)
ax.set_title(
    'More Rounds = Better Outcomes\nCompanies with 4+ rounds have 3x higher exit rates',
    fontsize=14, fontweight='bold', color=COLORS['primary'],
)
ax.set_xticks(x)
ax.set_xticklabels(['1 round', '2 rounds', '3 rounds', '4+ rounds'])
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('figures/finding2_rounds_matter.png', dpi=150, bbox_inches='tight')
plt.show()

## #3 - Tortoise Effect (Score 24)
**Rushing (<2yr) = 2% exit vs Patient (5+yr) = 12%** - Counter-intuitive

In [None]:
# CHART 3: TORTOISE EFFECT - Patience pays off

# Years between founding and the first funding round
funding_lag = df['first_funding_at'] - df['founded_at']
df['bootstrap_years'] = funding_lag.dt.days / 365.25

# Three right-closed buckets. The first bin starts at -100, so negative gaps
# (first funding dated before founding) are counted as 'Rushed'.
df['timing_bucket'] = pd.cut(
    df['bootstrap_years'],
    bins=[-100, 2, 5, 100],
    labels=['<2 years (Rushed)', '2-5 years (Normal)', '5+ years (Patient)'],
)

def timing_outcomes(country_code):
    """Return the exit rate (in %) per timing bucket for one country.

    Reads the notebook-level ``df``, which must already carry the
    ``timing_bucket`` and ``is_exited`` columns.

    Args:
        country_code: value matched against ``df['country_code']``.

    Returns:
        A Series indexed by the categories of ``timing_bucket``. Buckets with
        no companies hold NaN.
    """
    subset = df[df['country_code'] == country_code]
    # observed=False keeps empty buckets, so positional access (.iloc[2]) and
    # the fixed three-slot bar layout in the chart stay valid.
    return subset.groupby('timing_bucket', observed=False)['is_exited'].mean() * 100

nl_timing = timing_outcomes('NLD')
usa_timing = timing_outcomes('USA')
isr_timing = timing_outcomes('ISR')

x = np.arange(3)
width = 0.25

# Grouped bars: NL sits left of each group centre, USA on it, Israel right
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(x - width, nl_timing.values, width, label='Netherlands', color=COLORS['orange'])
ax.bar(x, usa_timing.values, width, label='USA', color=COLORS['blue'])
ax.bar(x + width, isr_timing.values, width, label='Israel', color=COLORS['cyan'])

ax.set_xlabel('Time from Founding to First Funding', fontsize=12)
ax.set_ylabel('Exit Rate (%)', fontsize=12)
ax.set_title('The Tortoise Effect\nPatient companies outperform - rushing hurts NL most', 
             fontsize=14, fontweight='bold', color=COLORS['primary'])
ax.set_xticks(x)
ax.set_xticklabels(['<2 years\n(Rushed)', '2-5 years\n(Normal)', '5+ years\n(Patient)'])
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

# Annotation: arrow from the top of NL's 'Rushed' bar to the top of NL's
# 'Patient' bar. The NL bars are drawn at x - width, so the arrow uses the
# same offset; anchoring at the group centres would land on the USA bars.
ax.annotate('', xy=(x[2] - width, nl_timing.iloc[2]), xytext=(x[0] - width, nl_timing.iloc[0]),
            arrowprops=dict(arrowstyle='->', color=COLORS['orange'], lw=2))

plt.tight_layout()
plt.savefig('figures/finding3_tortoise_effect.png', dpi=150, bbox_inches='tight')
plt.show()

---

# TIER 1: ADDITIONAL TOP CHARTS (Score 21-23)

## #4 - Funding Funnel (Score 23)
**Valley of Death** - NL loses companies at every stage

In [None]:
# CHART 4: FUNDING FUNNEL - Valley of Death

def stage_progression(country_code):
    """Count how many of a country's companies reached each funnel stage.

    Reads the notebook-level ``df``. Stages are detected from the text in
    ``funding_rounds``; exits come from ``status``.

    Args:
        country_code: value matched against ``df['country_code']``.

    Returns:
        ``[total, has_seed, has_a, has_b, has_c, exited]`` as counts.
    """
    # NOTE(review): other cells in this notebook treat funding_rounds as a
    # numeric count; the .str accessor below needs text values - confirm which
    # column actually holds the round types.
    subset = df[df['country_code'] == country_code]
    rounds = subset['funding_rounds']

    def count_matching(pattern):
        # Number of companies whose round text matches the pattern
        return rounds.str.contains(pattern, case=False, na=False).sum()

    total = len(subset)
    has_seed = count_matching('seed')
    # Series letters must be stand-alone tokens ('series_b', 'Series B', 'b').
    # The previous patterns ('series_b|b', ...) matched the letter anywhere,
    # e.g. 'debt_financing' counted as Series B and 'convertible_note' as
    # Series C.
    has_a, has_b, has_c = (
        count_matching(rf'(?<![a-z]){letter}(?![a-z])') for letter in 'abc'
    )
    exited = subset['status'].isin(['acquired', 'ipo']).sum()
    return [total, has_seed, has_a, has_b, has_c, exited]

stages = ['All Companies', 'Got Seed', 'Got Series A', 'Got Series B', 'Got Series C', 'Exited']

nl_funnel = stage_progression('NLD')
usa_funnel = stage_progression('USA')
isr_funnel = stage_progression('ISR')


def _funnel_to_pct(funnel):
    """Scale funnel counts to a percentage of the first entry (all companies).

    Returns NaN for every stage when the country has no companies, instead of
    raising ZeroDivisionError.
    """
    base = funnel[0]
    if not base:
        return [float('nan')] * len(funnel)
    return [count / base * 100 for count in funnel]


# Normalize to 100
nl_pct = _funnel_to_pct(nl_funnel)
usa_pct = _funnel_to_pct(usa_funnel)
isr_pct = _funnel_to_pct(isr_funnel)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(stages, nl_pct, marker='o', linewidth=2.5, markersize=10, label='Netherlands', color=COLORS['orange'])
ax.plot(stages, usa_pct, marker='s', linewidth=2, markersize=8, label='USA', color=COLORS['blue'])
ax.plot(stages, isr_pct, marker='^', linewidth=2, markersize=8, label='Israel', color=COLORS['cyan'])

ax.set_ylabel('% of Companies Remaining', fontsize=12)
ax.set_title('The Valley of Death\nNL loses companies at every stage of the funnel', 
             fontsize=14, fontweight='bold', color=COLORS['primary'])
ax.legend()
ax.grid(True, alpha=0.3)
plt.xticks(rotation=15)

# Shade the "Valley of Death" around the 'Got Series A' tick (category
# position 2). The span is named by the text below rather than a legend entry:
# the legend is built before the span is added, so a label= here never showed.
ax.axvspan(1.5, 2.5, alpha=0.1, color=COLORS['orange'])
ax.text(2, 50, 'Valley of\nDeath', ha='center', fontsize=10, color=COLORS['orange'], fontweight='bold')

plt.tight_layout()
plt.savefig('figures/sq1_1b_funding_funnel.png', dpi=150, bbox_inches='tight')
plt.show()

## #7 - NL Strengths (Score 21)
**What NL does well** - low failure rate, high operating rate

In [None]:
# CHART 7: NL STRENGTHS - What NL does well

def country_metrics(country_code):
    """Return exit, failure and operating rates (in %) for one country.

    Reads the notebook-level ``df`` and classifies companies by ``status``:
    'acquired' or 'ipo' count as exits, 'closed' as failures, 'operating' as
    operating. Each rate is the share of all of the country's companies.
    """
    subset = df[df['country_code'] == country_code]
    total = len(subset)
    status = subset['status']

    exit_count = status.isin(['acquired', 'ipo']).sum()
    failure_count = (status == 'closed').sum()
    operating_count = (status == 'operating').sum()

    rates_by_metric = {}
    rates_by_metric['Exit Rate'] = (exit_count / total) * 100
    rates_by_metric['Failure Rate'] = failure_count / total * 100
    rates_by_metric['Operating Rate'] = operating_count / total * 100
    return rates_by_metric

countries = ['NLD', 'USA', 'ISR', 'DEU', 'GBR']
labels = ['NL', 'USA', 'Israel', 'Germany', 'UK']
metrics = {}
for code in countries:
    metrics[code] = country_metrics(code)

fig, axes = plt.subplots(1, 3, figsize=(14, 5))

# Colour of the NL bar in each panel: cyan for the two "strength" panels,
# orange for the exit rate. All other countries are drawn in grey.
nl_bar_colour = {
    'Exit Rate': COLORS['orange'],
    'Failure Rate': COLORS['cyan'],
    'Operating Rate': COLORS['cyan'],
}

for i, metric in enumerate(nl_bar_colour):
    ax = axes[i]
    values = [metrics[c][metric] for c in countries]
    colors = [nl_bar_colour[metric] if c == 'NLD' else COLORS['secondary'] for c in countries]

    bars = ax.bar(labels, values, color=colors)
    ax.set_title(metric, fontsize=12, fontweight='bold')
    ax.set_ylabel('%')

    # Value label centred just above each bar
    for bar, val in zip(bars, values):
        bar_centre_x = bar.get_x() + bar.get_width()/2
        ax.text(bar_centre_x, bar.get_height() + 0.5, f'{val:.1f}%',
                ha='center', fontsize=9)

fig.suptitle(
    'What NL Does Well\nLow failure rate, high operating rate (but low exits)',
    fontsize=14, fontweight='bold', y=1.02,
)

plt.tight_layout()
plt.savefig('figures/part0_strengths.png', dpi=150, bbox_inches='tight')
plt.show()

---

# TIER 2: SUPPORTING CHARTS (Score 17-20)

## #11 - Average Rounds (Score 20)
**NL avg 1.46 rounds vs USA 1.86** - fewer chances to grow

In [None]:
# CHART 11: AVERAGE ROUNDS - NL completes fewer rounds

countries = ['NLD', 'DEU', 'GBR', 'FRA', 'USA', 'ISR']
labels = ['Netherlands', 'Germany', 'UK', 'France', 'USA', 'Israel']

# Mean of the funding_rounds column per country
avg_rounds = [
    df.loc[df['country_code'] == code, 'funding_rounds'].mean()
    for code in countries
]

# Rank the countries from fewest to most average rounds
sorted_data = list(zip(labels, avg_rounds, countries))
sorted_data.sort(key=lambda entry: entry[1])
labels_sorted, rounds_sorted, codes_sorted = zip(*sorted_data)

# NL in orange, every other country in grey
colors = [COLORS['orange'] if code == 'NLD' else COLORS['secondary'] for code in codes_sorted]

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.barh(labels_sorted, rounds_sorted, color=colors)

# Write each average just past the end of its bar
for bar, val in zip(bars, rounds_sorted):
    bar_mid_y = bar.get_y() + bar.get_height()/2
    ax.text(val + 0.02, bar_mid_y, f'{val:.2f}',
            va='center', fontsize=11, fontweight='bold')

ax.set_xlabel('Average Number of Funding Rounds', fontsize=12)
ax.set_title(
    'Dutch Companies Complete Fewer Funding Rounds\nFewer chances to grow and scale',
    fontsize=14, fontweight='bold', color=COLORS['primary'],
)

plt.tight_layout()
plt.savefig('figures/sq1_2c_avg_rounds.png', dpi=150, bbox_inches='tight')
plt.show()

## #13 - Round Sizes (Score 19)
**Total funding by country (proxy for Seed/A/B round sizes)** - NL companies raise less overall

In [None]:
# CHART 13: ROUND SIZES - NL gets smaller rounds

# Per-round sizes need round-level data, which may not be in the main dataset,
# so total funding per company is plotted as a proxy.

countries = ['NLD', 'DEU', 'GBR', 'USA', 'ISR']
labels = ['NL', 'Germany', 'UK', 'USA', 'Israel']

# Total funding per company, one Series per country, reported in $M
funding_by_country = [
    df.loc[df['country_code'] == code, 'funding_total_usd'] for code in countries
]
median_funding = [funding.median() / 1e6 for funding in funding_by_country]
mean_funding = [funding.mean() / 1e6 for funding in funding_by_country]

x = np.arange(len(labels))
width = 0.35

# Median and mean side by side for each country
fig, ax = plt.subplots(figsize=(10, 6))
bars1 = ax.bar(x - width/2, median_funding, width, label='Median', color=COLORS['orange'])
bars2 = ax.bar(x + width/2, mean_funding, width, label='Mean', color=COLORS['cyan'])

ax.set_ylabel('Total Funding ($M)', fontsize=12)
ax.set_title(
    'Total Funding by Country\nNL companies raise less overall',
    fontsize=14, fontweight='bold', color=COLORS['primary'],
)
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('figures/sq1_2a_round_sizes.png', dpi=150, bbox_inches='tight')
plt.show()

## #15 - Sector Gap (Score 18)
**NL vs USA exit rate by sector**

In [None]:
# CHART 15: SECTOR GAP - Exit rates by sector

nl = df.loc[df['country_code'] == 'NLD']
usa = df.loc[df['country_code'] == 'USA']

# The eight most common 'market' values among NL companies
top_sectors = nl['market'].value_counts().index[:8].tolist()

def sector_exit_rate(subset, sector):
    """Return the exit rate (in %) of one sector within ``subset``.

    Args:
        subset: DataFrame with 'market' and 'status' columns.
        sector: value matched against ``subset['market']``.

    Returns:
        Share of the sector's companies whose status is 'acquired' or 'ipo',
        as a percentage; NaN when the sector has fewer than 5 companies.
    """
    sector_status = subset.loc[subset['market'] == sector, 'status']
    if len(sector_status) >= 5:
        return sector_status.isin(['acquired', 'ipo']).mean() * 100
    # Too few companies for a rate
    return np.nan

nl_rates = [sector_exit_rate(nl, s) for s in top_sectors]
usa_rates = [sector_exit_rate(usa, s) for s in top_sectors]

# Sectors where either country has too few companies for a rate (NaN).
# They keep a zero-length bar but are labelled 'n/a' below, so a missing
# value is not presented as a measured gap of +0.0pp.
no_data_sectors = {
    s for s, u, n in zip(top_sectors, usa_rates, nl_rates)
    if np.isnan(u) or np.isnan(n)
}
gaps = [u - n if not (np.isnan(u) or np.isnan(n)) else 0 for u, n in zip(usa_rates, nl_rates)]

# Sort by gap
sorted_data = sorted(zip(top_sectors, gaps), key=lambda x: x[1], reverse=True)
sectors_sorted, gaps_sorted = zip(*sorted_data)

# Orange where the USA is ahead, cyan otherwise
colors = [COLORS['orange'] if g > 0 else COLORS['cyan'] for g in gaps_sorted]

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.barh(sectors_sorted, gaps_sorted, color=colors)

ax.axvline(x=0, color=COLORS['primary'], linewidth=0.5)
ax.set_xlabel('USA Exit Rate - NL Exit Rate (pp)', fontsize=12)
ax.set_title('Sector Exit Rate Gap\nPositive = USA outperforms NL', 
             fontsize=14, fontweight='bold', color=COLORS['primary'])

# Value labels sit just beyond the free end of each bar: to the right of
# positive bars, to the left of negative ones (a fixed +0.3 offset would draw
# the label on top of a negative bar).
for bar, sector, gap in zip(bars, sectors_sorted, gaps_sorted):
    label_text = 'n/a' if sector in no_data_sectors else f'{gap:+.1f}pp'
    points_right = gap >= 0
    ax.text(gap + (0.3 if points_right else -0.3),
            bar.get_y() + bar.get_height()/2,
            label_text,
            va='center', ha='left' if points_right else 'right', fontsize=10)

plt.tight_layout()
plt.savefig('figures/sq1_3_sector_gap.png', dpi=150, bbox_inches='tight')
plt.show()

---

# TIER 3: NICE-TO-HAVE (Score 14-16)

## Time Series Trends (Score 17)
**NL deal count, funding over time**

In [None]:
# TIME SERIES: NL trends over time

df['funding_year'] = df['first_funding_at'].dt.year
nl_data = df[df['country_code'] == 'NLD'].copy()

# Per first-funding year: number of named companies, and the sum and mean of
# their total funding
yearly = (
    nl_data.groupby('funding_year')
    .agg(
        deal_count=('name', 'count'),
        total_funding=('funding_total_usd', 'sum'),
        avg_funding=('funding_total_usd', 'mean'),
    )
    .reset_index()
    .rename(columns={'funding_year': 'year'})
)
# Keep 2005-2014 inclusive
yearly = yearly[yearly['year'].between(2005, 2014)]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Left panel: deal count as a line
ax1.plot(
    yearly['year'], yearly['deal_count'],
    marker='o', linewidth=2.5, color=COLORS['orange'], markersize=8,
)
ax1.set_xlabel('Year')
ax1.set_ylabel('Number of Deals')
ax1.set_title('NL Deal Count Over Time', fontweight='bold')
ax1.grid(True, alpha=0.3)

# Right panel: total funding in $M as bars
ax2.bar(yearly['year'], yearly['total_funding'] / 1e6, color=COLORS['orange'], alpha=0.7)
ax2.set_xlabel('Year')
ax2.set_ylabel('Total Funding ($M)')
ax2.set_title('NL Total VC Funding by Year', fontweight='bold')
ax2.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('figures/finding8_time_series_trends.png', dpi=150, bbox_inches='tight')
plt.show()

---

# FINAL SUMMARY

## Your 3 Charts for the 2-Page Report:

| Rank | Chart | Score | Key Stat |
|------|-------|-------|----------|
| **#1** | finding1_graduation_gap.png | 24/25 | NL 6.2% vs Israel 21% |
| **#2** | finding2_rounds_matter.png | 24/25 | 4+ rounds = 3x better |
| **#3** | finding3_tortoise_effect.png | 24/25 | Rushing = 2% vs Patient = 12% |

## Key Numbers:

| Metric | NL | USA | Israel |
|--------|-----|-----|--------|
| Seed→A rate | **6.2%** | 15.8% | 21.4% |
| Exit rate | 4.3% | 8.9% | 8.5% |
| Avg rounds | 1.46 | 1.86 | 1.93 |

---

## Stop Analyzing. Start Writing.