# Exploratory Data Analysis: Distributions

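**Purpose**: Reference distribution plots for the VC investment dataset (companies first funded 2005–2014), covering geography, sectors, funding amounts, timing, company status, and funding types.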

**Note**: This is legacy/supporting documentation. Key findings are in the main analysis.

---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Style
plt.style.use('seaborn-v0_8-whitegrid')
# Hex colours keyed by country / role; every plotting cell reads from this dict.
COLORS = {
    'nl': '#D55E00', 'usa': '#0072B2', 'israel': '#009E73',
    'uk': '#56B4E9', 'germany': '#E69F00', 'france': '#CC79A7',
    'gray': '#999999', 'primary': '#D55E00'
}

# Output directory for every saved figure and the summary CSV
os.makedirs('figures/distributions', exist_ok=True)

# Load data
df = pd.read_csv('../investments_VC.csv', encoding='latin-1')
df.columns = df.columns.str.strip()  # trim whitespace around header names

# Clean: trim surrounding whitespace in the text columns used for grouping
for col in ['country_code', 'market', 'city', 'status']:
    df[col] = df[col].str.strip()

# Dates: values that cannot be parsed become NaT instead of raising
for col in ['founded_at', 'first_funding_at', 'last_funding_at']:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Numeric: remove commas, spaces and dashes, then parse; whatever still fails
# to parse becomes NaN. regex=False keeps each replacement a literal match
# regardless of the pandas version's default.
funding_text = df['funding_total_usd'].astype(str)
for token in (',', ' ', '-'):
    funding_text = funding_text.str.replace(token, '', regex=False)
df['funding_total_usd'] = pd.to_numeric(funding_text, errors='coerce')

numeric_cols = ['seed', 'round_A', 'round_B', 'round_C', 'round_D', 'venture', 'angel',
                'grant', 'debt_financing', 'private_equity', 'funding_rounds']
for col in numeric_cols:
    if col in df.columns:
        # Missing or unparseable amounts are treated as 0
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

# Filter 2005-2014. Rows without a first funding date have a NaN year, which
# between() rejects, so they are dropped as well.
# .copy() detaches the result from the unfiltered frame so the column
# assignments below do not trigger SettingWithCopyWarning.
first_funding_year = df['first_funding_at'].dt.year
df = df[first_funding_year.between(2005, 2014)].copy()

# Derived columns
df['founded_year'] = df['founded_at'].dt.year
df['first_funding_year'] = df['first_funding_at'].dt.year
# Gap between founding and first funding, in (fractional) years
df['years_to_funding'] = (df['first_funding_at'] - df['founded_at']).dt.days / 365.25

print(f"Total companies: {len(df):,}")
print(f"Date range: {df['first_funding_at'].min().year} - {df['first_funding_at'].max().year}")

---
## 1. Geographic Distributions

In [None]:
# Country distribution: ranked bar chart (left) and share-of-total pie (right)
fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(14, 5))

# A single frequency table feeds both panels
counts_by_country = df['country_code'].value_counts()

# Left panel: top 20 countries, with the Netherlands highlighted
country_counts = counts_by_country.head(20)
bar_colors = [
    COLORS['nl'] if code == 'NLD' else COLORS['gray']
    for code in country_counts.index
]
country_counts.plot(kind='barh', ax=bar_ax, color=bar_colors)
bar_ax.set_xlabel('Number of Companies')
bar_ax.set_title('Top 20 Countries by Company Count', fontweight='bold')
bar_ax.invert_yaxis()  # largest country at the top

# Right panel: five largest countries, everything else folded into "Other"
top5 = counts_by_country.head(5)
remainder = counts_by_country.iloc[5:].sum()
pie_data = pd.concat([top5, pd.Series({'Other': remainder})])
pie_data.plot(kind='pie', ax=pie_ax, autopct='%1.1f%%', startangle=90)
pie_ax.set_ylabel('')
pie_ax.set_title('Company Distribution by Country', fontweight='bold')

plt.tight_layout()
plt.savefig('figures/distributions/country_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# NL city distribution
# `nl` is reused by later cells. .copy() gives it its own data so that adding
# columns to it later does not trigger SettingWithCopyWarning.
nl = df[df['country_code'] == 'NLD'].copy()

fig, ax = plt.subplots(figsize=(10, 6))
nl_city_totals = nl['city'].value_counts()
city_counts = nl_city_totals.head(15)  # only the 15 largest cities are plotted
city_counts.plot(kind='barh', ax=ax, color=COLORS['nl'], alpha=0.8)
ax.set_xlabel('Number of Companies')
ax.set_title('Dutch Startups by City', fontweight='bold')
ax.invert_yaxis()  # largest city at the top

plt.tight_layout()
plt.savefig('figures/distributions/nl_city_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

# Look Amsterdam up in the full city table rather than the plotted top 15, so
# the share stays correct even if Amsterdam were not among the 15 largest.
# The denominator is all NL companies, including those with no city recorded.
print(f"\nAmsterdam share: {nl_city_totals.get('Amsterdam', 0) / len(nl) * 100:.1f}%")

---
## 2. Sector/Market Distributions

In [None]:
# Sector distribution: global (left) and Netherlands (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# One tuple per panel: (source frame, sectors shown, bar colour, title)
panels = [
    (df, 20, COLORS['gray'], 'Top 20 Sectors Globally'),
    (nl, 15, COLORS['nl'], 'Top 15 Dutch Sectors'),
]

for panel_ax, (frame, top_n, bar_color, title) in zip(axes, panels):
    leading_sectors = frame['market'].value_counts().head(top_n)
    leading_sectors.plot(kind='barh', ax=panel_ax, color=bar_color, alpha=0.8)
    panel_ax.set_xlabel('Number of Companies')
    panel_ax.set_title(title, fontweight='bold')
    panel_ax.invert_yaxis()  # most common sector at the top

plt.tight_layout()
plt.savefig('figures/distributions/sector_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Sector comparison: NL vs USA vs Israel
countries = ['NLD', 'USA', 'ISR']
country_names = ['Netherlands', 'USA', 'Israel']
panel_colors = [COLORS['nl'], COLORS['usa'], COLORS['israel']]

fig, axes = plt.subplots(1, 3, figsize=(16, 6))

# One panel per country, each showing its ten most common sectors as a
# percentage of that country's companies with a recorded market.
for panel_ax, code, name, bar_color in zip(axes, countries, country_names, panel_colors):
    in_country = df['country_code'] == code
    sector_share = df.loc[in_country, 'market'].value_counts(normalize=True).head(10) * 100
    sector_share.plot(kind='barh', ax=panel_ax, color=bar_color, alpha=0.8)
    panel_ax.set_xlabel('% of Companies')
    panel_ax.set_title(f'{name} Top 10 Sectors', fontweight='bold')
    panel_ax.invert_yaxis()

plt.tight_layout()
plt.savefig('figures/distributions/sector_by_country.png', dpi=150, bbox_inches='tight')
plt.show()

---
## 3. Funding Amount Distributions

In [None]:
# Total funding distribution (log scale)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Only strictly positive totals can be log-transformed
positive_funding = df[df['funding_total_usd'] > 0]

# Global distribution
ax1 = axes[0, 0]
valid_funding = positive_funding['funding_total_usd']
global_median = valid_funding.median()
ax1.hist(np.log10(valid_funding), bins=50, color=COLORS['gray'], alpha=0.7, edgecolor='white')
ax1.set_xlabel('Log10(Total Funding USD)')
ax1.set_ylabel('Frequency')
ax1.set_title('Global: Total Funding Distribution (Log Scale)', fontweight='bold')
ax1.axvline(np.log10(global_median), color='red', linestyle='--', label=f'Median: ${global_median/1e6:.1f}M')
ax1.legend()

# By country boxplot
ax2 = axes[0, 1]
countries = ['NLD', 'USA', 'ISR', 'GBR', 'DEU', 'FRA']
country_labels = ['NL', 'USA', 'ISR', 'UK', 'DE', 'FR']
country_colors = [COLORS['nl'], COLORS['usa'], COLORS['israel'], COLORS['uk'], COLORS['germany'], COLORS['france']]
# Vectorised log10 instead of a per-element .apply
country_data = [
    np.log10(positive_funding.loc[positive_funding['country_code'] == c, 'funding_total_usd'])
    for c in countries
]
bp = ax2.boxplot(country_data, patch_artist=True)
# Tick labels are set explicitly rather than through boxplot(labels=...),
# which Matplotlib 3.9 deprecated in favour of tick_labels; this form works on
# both older and newer releases. Boxes are drawn at positions 1..n.
ax2.set_xticks(range(1, len(country_labels) + 1))
ax2.set_xticklabels(country_labels)
for patch, color in zip(bp['boxes'], country_colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)
ax2.set_ylabel('Log10(Total Funding USD)')
ax2.set_title('Funding Distribution by Country', fontweight='bold')

# NL funding histogram
ax3 = axes[1, 0]
nl_funding = nl.loc[nl['funding_total_usd'] > 0, 'funding_total_usd']
nl_median = nl_funding.median()
ax3.hist(np.log10(nl_funding), bins=30, color=COLORS['nl'], alpha=0.7, edgecolor='white')
ax3.set_xlabel('Log10(Total Funding USD)')
ax3.set_ylabel('Frequency')
ax3.set_title('NL: Total Funding Distribution', fontweight='bold')
ax3.axvline(np.log10(nl_median), color='black', linestyle='--', label=f'Median: ${nl_median/1e6:.1f}M')
ax3.legend()

# Funding by sector (NL): median total funding for the 8 most common sectors
ax4 = axes[1, 1]
top_sectors = nl['market'].value_counts().head(8).index
sector_funding = nl[nl['market'].isin(top_sectors)].groupby('market')['funding_total_usd'].median().sort_values(ascending=True) / 1e6
sector_funding.plot(kind='barh', ax=ax4, color=COLORS['nl'], alpha=0.8)
ax4.set_xlabel('Median Funding ($M)')
ax4.set_title('NL: Median Funding by Sector', fontweight='bold')

plt.tight_layout()
plt.savefig('figures/distributions/funding_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Round-specific funding distributions: one panel per round type, with the
# three comparison countries overlaid as log10 histograms
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

round_cols = [('seed', 'Seed'), ('round_A', 'Series A'), ('round_B', 'Series B'), ('round_C', 'Series C')]
comparison = [('NLD', 'NL', COLORS['nl']), ('USA', 'USA', COLORS['usa']), ('ISR', 'Israel', COLORS['israel'])]

# axes.flat walks the 2x2 grid row by row, matching the order of round_cols
for panel_ax, (col, name) in zip(axes.flat, round_cols):
    for code, cname, color in comparison:
        has_round = (df['country_code'] == code) & (df[col] > 0)
        data = df.loc[has_round, col]
        if len(data) <= 10:
            continue  # skip countries with 10 or fewer positive amounts for this round
        panel_ax.hist(np.log10(data), bins=20, alpha=0.5, label=f'{cname} (n={len(data)})', color=color)

    panel_ax.set_xlabel('Log10(Funding USD)')
    panel_ax.set_ylabel('Frequency')
    panel_ax.set_title(f'{name} Round Size Distribution', fontweight='bold')
    panel_ax.legend()

plt.tight_layout()
plt.savefig('figures/distributions/round_size_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

---
## 4. Timing Distributions

In [None]:
# Time to first funding distribution
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (country code, legend label, colour) for the per-country overlays
country_specs = [('NLD', 'NL', COLORS['nl']), ('USA', 'USA', COLORS['usa']), ('ISR', 'Israel', COLORS['israel'])]

# Restrict to gaps between 0 and 15 years inclusive; rows with a missing
# years_to_funding fail both comparisons and are dropped too.
valid_timing = df[(df['years_to_funding'] >= 0) & (df['years_to_funding'] <= 15)]

# Global distribution
ax1 = axes[0, 0]
global_median = valid_timing['years_to_funding'].median()
ax1.hist(valid_timing['years_to_funding'], bins=30, color=COLORS['gray'], alpha=0.7, edgecolor='white')
ax1.set_xlabel('Years to First Funding')
ax1.set_ylabel('Frequency')
ax1.set_title('Global: Time to First Funding', fontweight='bold')
ax1.axvline(global_median, color='red', linestyle='--',
            label=f'Median: {global_median:.1f}y')
ax1.legend()

# By country
ax2 = axes[0, 1]
for code, name, color in country_specs:
    data = valid_timing[valid_timing['country_code'] == code]['years_to_funding']
    ax2.hist(data, bins=20, alpha=0.5, label=f'{name} (med: {data.median():.1f}y)', color=color)
ax2.set_xlabel('Years to First Funding')
ax2.set_ylabel('Frequency')
ax2.set_title('Time to First Funding by Country', fontweight='bold')
ax2.legend()

# Funding rounds distribution (first 10 distinct round counts, ascending)
ax3 = axes[1, 0]
rounds_dist = df['funding_rounds'].value_counts().sort_index().head(10)
rounds_dist.plot(kind='bar', ax=ax3, color=COLORS['gray'], alpha=0.8)
ax3.set_xlabel('Number of Funding Rounds')
ax3.set_ylabel('Number of Companies')
ax3.set_title('Global: Funding Rounds Distribution', fontweight='bold')

# Funding rounds by country
ax4 = axes[1, 1]
# Half-integer edges 0.5, 1.5, ..., 11.5 give one bin per round count 1..11.
# Integer edges (range(1, 12)) would close the last bin as [10, 11] and lump
# companies with 10 and 11 rounds together.
round_bins = np.arange(0.5, 12.0, 1.0)
for code, name, color in country_specs:
    data = df[df['country_code'] == code]['funding_rounds']
    # The legend average uses all rows for the country, including any that
    # fall outside the plotted 1..11 range.
    ax4.hist(data, bins=round_bins, alpha=0.5, label=f'{name} (avg: {data.mean():.1f})', color=color, density=True)
ax4.set_xlabel('Number of Funding Rounds')
ax4.set_ylabel('Density')
ax4.set_title('Funding Rounds by Country (Normalized)', fontweight='bold')
ax4.legend()

plt.tight_layout()
plt.savefig('figures/distributions/timing_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Founding year and funding year distributions
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (country code, legend label, colour) for the two overlaid histograms
country_specs = [('NLD', 'NL', COLORS['nl']), ('USA', 'USA', COLORS['usa'])]

# Bin edges run one past the final year. A histogram's last bin is closed on
# both sides, so edges ending at 2014 would merge 2013 and 2014 into one bar;
# ending at 2015 gives every year (including 2014) its own bin.
founded_bins = range(1990, 2016)
funding_bins = range(2005, 2016)

# Founding year
ax1 = axes[0]
valid_founded = df[df['founded_year'].between(1990, 2014)]
for code, name, color in country_specs:
    data = valid_founded[valid_founded['country_code'] == code]['founded_year']
    ax1.hist(data, bins=founded_bins, alpha=0.5, label=name, color=color, density=True)
ax1.set_xlabel('Founding Year')
ax1.set_ylabel('Density')
ax1.set_title('Company Founding Year Distribution', fontweight='bold')
ax1.legend()

# First funding year
ax2 = axes[1]
for code, name, color in country_specs:
    data = df[df['country_code'] == code]['first_funding_year']
    ax2.hist(data, bins=funding_bins, alpha=0.5, label=name, color=color, density=True)
ax2.set_xlabel('First Funding Year')
ax2.set_ylabel('Density')
ax2.set_title('First Funding Year Distribution', fontweight='bold')
ax2.legend()

plt.tight_layout()
plt.savefig('figures/distributions/year_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

---
## 5. Status/Outcome Distributions

In [None]:
# Status distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Global
ax1 = axes[0]
status_counts = df['status'].value_counts()
# Statuses without an entry here fall back to gray
status_colors = {'acquired': COLORS['israel'], 'operating': COLORS['gray'], 'closed': COLORS['nl'], 'ipo': COLORS['usa']}
status_counts.plot(kind='bar', ax=ax1, color=[status_colors.get(s, COLORS['gray']) for s in status_counts.index], alpha=0.8)
ax1.set_xlabel('Status')
ax1.set_ylabel('Number of Companies')
ax1.set_title('Global: Company Status Distribution', fontweight='bold')
ax1.tick_params(axis='x', rotation=45)

# By country (normalized)
ax2 = axes[1]
countries = ['NLD', 'USA', 'ISR', 'GBR', 'DEU', 'FRA']
stacked_statuses = ['acquired', 'operating', 'closed']
# Percentage of each country's companies (with a recorded status) per status.
# fill_value=0 turns a status that never occurs in a country into 0% rather
# than NaN.
status_by_country = (
    df[df['country_code'].isin(countries)]
    .groupby('country_code')['status']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    * 100
)
# reindex (instead of [[...]] selection) keeps the plot working when one of
# the three statuses is absent from every selected country: the missing column
# is added as 0% rather than raising KeyError.
status_by_country.reindex(columns=stacked_statuses, fill_value=0).plot(
    kind='bar', ax=ax2, stacked=True,
    color=[COLORS['israel'], COLORS['gray'], COLORS['nl']], alpha=0.8)
ax2.set_xlabel('Country')
ax2.set_ylabel('% of Companies')
ax2.set_title('Status Distribution by Country', fontweight='bold')
ax2.legend(title='Status')
ax2.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.savefig('figures/distributions/status_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Exit rate by sector (NL)
fig, ax = plt.subplots(figsize=(12, 6))

# A company counts as exited when its status is 'acquired' or 'ipo'.
# assign() rebinds `nl` to a new frame that carries the flag, rather than
# writing a column into a filtered slice of `df` (SettingWithCopyWarning).
# Later cells still find nl['exited'], and re-running this cell is harmless.
nl = nl.assign(exited=nl['status'].isin(['acquired', 'ipo']))
sector_exits = nl.groupby('market').agg({'exited': ['mean', 'count']}).round(3)
sector_exits.columns = ['exit_rate', 'n']
# Keep sectors with at least 5 companies, lowest exit rate first
sector_exits = sector_exits[sector_exits['n'] >= 5].sort_values('exit_rate', ascending=True)
sector_exits['exit_rate'] *= 100  # fraction -> percent

# Sectors above the median exit rate are highlighted; the median is computed
# once instead of once per bar.
median_exit_rate = sector_exits['exit_rate'].median()
bar_colors = [COLORS['israel'] if rate > median_exit_rate else COLORS['gray'] for rate in sector_exits['exit_rate']]
ax.barh(sector_exits.index, sector_exits['exit_rate'], color=bar_colors, alpha=0.8)
ax.set_xlabel('Exit Rate (%)')
# Title previously contained a mis-encoded "≥" (rendered as "nâ‰¥5")
ax.set_title('NL: Exit Rate by Sector (n≥5)', fontweight='bold')

# Annotate each bar with its sample size, just right of the bar end
for y_pos, (rate, n) in enumerate(zip(sector_exits['exit_rate'], sector_exits['n'])):
    ax.text(rate + 0.5, y_pos, f"n={n:.0f}", va='center', fontsize=8)

plt.tight_layout()
plt.savefig('figures/distributions/nl_sector_exit_rates.png', dpi=150, bbox_inches='tight')
plt.show()

---
## 6. Funding Type Distributions

In [None]:
# Funding type usage
funding_types = ['seed', 'venture', 'angel', 'grant', 'debt_financing', 'private_equity']
# Only the funding-type columns actually present in the data are analysed
present_types = [ft for ft in funding_types if ft in df.columns]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
usage_ax, country_ax, amount_ax, diversity_ax = axes.flat

# Global funding type presence: % of companies with a positive amount per type
type_usage = pd.Series({
    ft: (df[ft] > 0).sum() / len(df) * 100
    for ft in present_types
}).sort_values(ascending=True)
type_usage.plot(kind='barh', ax=usage_ax, color=COLORS['gray'], alpha=0.8)
usage_ax.set_xlabel('% of Companies')
usage_ax.set_title('Global: Funding Type Usage', fontweight='bold')

# By country: the same percentage, computed within each country
countries = ['NLD', 'USA', 'ISR']
usage_by_country = {}
for code in countries:
    subset = df[df['country_code'] == code]
    usage_by_country[code] = pd.Series({
        ft: (subset[ft] > 0).sum() / len(subset) * 100
        for ft in present_types
    })
country_type_usage = pd.DataFrame(usage_by_country)

country_type_usage.plot(kind='bar', ax=country_ax, color=[COLORS['nl'], COLORS['usa'], COLORS['israel']], alpha=0.8)
country_ax.set_xlabel('Funding Type')
country_ax.set_ylabel('% of Companies')
country_ax.set_title('Funding Type Usage by Country', fontweight='bold')
country_ax.legend(title='Country')
country_ax.tick_params(axis='x', rotation=45)

# Funding type amounts: median over companies with a positive amount, in $M
type_amounts = pd.Series({
    ft: df.loc[df[ft] > 0, ft].median() / 1e6
    for ft in present_types
}).sort_values(ascending=True)
type_amounts.plot(kind='barh', ax=amount_ax, color=COLORS['gray'], alpha=0.8)
amount_ax.set_xlabel('Median Amount ($M)')
amount_ax.set_title('Global: Median Funding by Type', fontweight='bold')

# Funding diversity: number of distinct funding types with a positive amount.
# Stored on df because the correlation cell reads this column.
df['funding_diversity'] = sum((df[ft] > 0).astype(int) for ft in present_types)
diversity_dist = df['funding_diversity'].value_counts().sort_index()
diversity_dist.plot(kind='bar', ax=diversity_ax, color=COLORS['gray'], alpha=0.8)
diversity_ax.set_xlabel('Number of Funding Types')
diversity_ax.set_ylabel('Number of Companies')
diversity_ax.set_title('Funding Type Diversity Distribution', fontweight='bold')

plt.tight_layout()
plt.savefig('figures/distributions/funding_type_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

---
## 7. Correlation Heatmaps

In [None]:
# Correlation heatmap
candidate_cols = ['funding_total_usd', 'funding_rounds', 'seed', 'round_A', 'round_B',
                  'venture', 'angel', 'years_to_funding', 'funding_diversity']

# Outcome indicators as 0/1 integers so they can enter the correlation matrix
df['exited'] = df['status'].isin(['acquired', 'ipo']).astype(int)
df['failed'] = (df['status'] == 'closed').astype(int)

# Funding variables that exist in df, followed by the two outcome columns
corr_cols = [c for c in candidate_cols if c in df.columns] + ['exited', 'failed']

corr_matrix = df[corr_cols].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True, fmt='.2f',
    cmap='RdBu_r', center=0, vmin=-1, vmax=1,
    square=True, ax=ax,
)
ax.set_title('Correlation Heatmap: Funding Variables vs Outcomes', fontweight='bold', pad=20)

plt.tight_layout()
plt.savefig('figures/distributions/correlation_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

# Correlation of every variable with the exit flag, strongest positive first
print("\nKey correlations with EXIT:")
print(corr_matrix['exited'].sort_values(ascending=False))

---
## 8. Summary Statistics Tables

In [None]:
# Summary statistics by country
# Relies on the df['exited'] / df['failed'] columns created in the
# correlation-heatmap cell.
countries = ['NLD', 'USA', 'ISR', 'GBR', 'DEU', 'FRA']

# No rounding at this stage: rounding the exit/failure fractions to three
# decimals before scaling to percent would limit them to 0.1-point resolution.
summary = df[df['country_code'].isin(countries)].groupby('country_code').agg({
    'permalink': 'count',
    'funding_total_usd': ['mean', 'median'],
    'funding_rounds': 'mean',
    'years_to_funding': 'median',
    'exited': 'mean',
    'failed': 'mean'
})

# Flatten the two-level column index; order matches the agg spec above
summary.columns = ['n_companies', 'mean_funding', 'median_funding', 'avg_rounds',
                   'median_years_to_fund', 'exit_rate', 'fail_rate']
# USD -> millions of USD
summary['mean_funding'] /= 1e6
summary['median_funding'] /= 1e6
# Fractions -> percentages
summary['exit_rate'] *= 100
summary['fail_rate'] *= 100

print("SUMMARY STATISTICS BY COUNTRY")
print("="*80)
print(summary.round(2))  # rounding is applied for display only

# Save to CSV (unrounded values)
summary.to_csv('figures/distributions/summary_by_country.csv')

In [None]:
# NL detailed summary
# Outcome flags are derived from `status` here. `nl` was sliced from df before
# df['failed'] was created, so nl['failed'] does not exist and would raise
# KeyError. The definitions match the ones used for df in the correlation
# cell: exit = acquired or ipo, failure = closed.
nl_exited = nl['status'].isin(['acquired', 'ipo'])
nl_failed = nl['status'] == 'closed'

print("\nNETHERLANDS DETAILED STATISTICS")
print("="*50)
print(f"Total companies: {len(nl)}")
print("\nFunding:")
print(f"  Mean total funding: ${nl['funding_total_usd'].mean()/1e6:.2f}M")
print(f"  Median total funding: ${nl['funding_total_usd'].median()/1e6:.2f}M")
print(f"  Avg funding rounds: {nl['funding_rounds'].mean():.2f}")
print("\nTiming:")
# Same 0-15 year window as the timing plots
nl_timing = nl[(nl['years_to_funding'] >= 0) & (nl['years_to_funding'] <= 15)]
print(f"  Median years to first funding: {nl_timing['years_to_funding'].median():.2f}")
print("\nOutcomes:")
print(f"  Exit rate: {nl_exited.mean()*100:.1f}%")
print(f"  Failure rate: {nl_failed.mean()*100:.1f}%")
print("\nTop sectors:")
print(nl['market'].value_counts().head(5))
print("\nTop cities:")
print(nl['city'].value_counts().head(5))

---
## Distribution Plots Generated

| File | Contents |
|------|----------|
| `country_distribution.png` | Company count by country |
| `nl_city_distribution.png` | NL companies by city |
| `sector_distribution.png` | Global and NL sector breakdown |
| `sector_by_country.png` | Sector comparison: NL vs USA vs Israel |
| `funding_distributions.png` | Funding amount distributions |
| `round_size_distributions.png` | Seed/A/B/C round sizes |
| `timing_distributions.png` | Time to funding, rounds distributions |
| `year_distributions.png` | Founding and funding year |
| `status_distributions.png` | Company status by country |
| `nl_sector_exit_rates.png` | NL exit rates by sector |
| `funding_type_distributions.png` | Funding type usage and amounts |
| `correlation_heatmap.png` | Variable correlations |

---