# EDA
Perform initial exploration of the dataset to understand its structure, available features, missing values, and potential target variables for defining "success". This step will help us understand what we're working with and how to approach the problem.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import os
import warnings

# Silence library warnings so the cell outputs stay readable
warnings.filterwarnings('ignore')

# Pandas display settings: show every column, cap the number of printed rows,
# widen the console and render floats with two decimals
for option_name, option_value in {
    'display.max_columns': None,
    'display.max_rows': 100,
    'display.width': 1000,
    'display.float_format': '{:.2f}'.format,
}.items():
    pd.set_option(option_name, option_value)

In [6]:
# Load the dataset
# id,Unnamed: 0.1,entity_type,entity_id,parent_id,name,normalized_name,permalink,category_code,status,founded_at,closed_at,domain,homepage_url,twitter_username,logo_url,logo_width,logo_height,short_description,description,overview,tag_list,country_code,state_code,city,region,first_investment_at,last_investment_at,investment_rounds,invested_companies,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,first_milestone_at,last_milestone_at,milestones,relationships,created_by,created_at,updated_at,lat,lng,ROI
# The CSV location can be overridden with the COMPANIES_CSV environment variable
# so the notebook also runs on other machines; the default is the original path.
companies_csv_path = os.environ.get(
    'COMPANIES_CSV',
    '/Users/aminosaurier/Downloads/spring_2025_startup_survival_large_files/companies.csv',
)
try:
    companies_df = pd.read_csv(companies_csv_path, sep=',', header=0)
except Exception as e:
    print(f"Error loading dataset: {e}")
    # Re-raise: every later cell reads companies_df, so carrying on would either
    # fail with a NameError or silently reuse a frame left over in the kernel.
    raise
else:
    print(f"Dataset loaded successfully with {companies_df.shape[0]} rows and {companies_df.shape[1]} columns.")

Dataset loaded successfully with 196553 rows and 44 columns.


In [4]:
# Preview the first 15 records to get a feel for the columns and their values
print("First 15 rows of the dataset:", companies_df.head(15), sep="\n")

First 15 rows of the dataset:
          id  Unnamed: 0.1 entity_type  entity_id  parent_id                           name                normalized_name                               permalink    category_code     status  founded_at closed_at                domain                     homepage_url twitter_username                                           logo_url  logo_width  logo_height short_description                             description                                           overview                                           tag_list country_code state_code           city       region first_investment_at last_investment_at  investment_rounds  invested_companies first_funding_at last_funding_at  funding_rounds  funding_total_usd first_milestone_at last_milestone_at  milestones  relationships        created_by           created_at           updated_at   lat     lng   ROI
0        c:1             0     Company          1        NaN                       Wetpaint                

In [7]:
# Check basic information about the dataset
# info() writes its report (index range, per-column non-null counts and dtypes)
# directly to stdout, so it is called on its own rather than wrapped in print().
print("\nBasic information about the dataset:")
companies_df.info()


Basic information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196553 entries, 0 to 196552
Data columns (total 44 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   196553 non-null  object 
 1   Unnamed: 0.1         196553 non-null  int64  
 2   entity_type          196553 non-null  object 
 3   entity_id            196553 non-null  int64  
 4   parent_id            0 non-null       float64
 5   name                 196530 non-null  object 
 6   normalized_name      196527 non-null  object 
 7   permalink            196553 non-null  object 
 8   category_code        123186 non-null  object 
 9   status               196553 non-null  object 
 10  founded_at           91227 non-null   object 
 11  closed_at            2620 non-null    object 
 12  domain               126545 non-null  object 
 13  homepage_url         126545 non-null  object 
 14  twitter_username     80591 non

In [8]:
# Check for duplicate rows
# Exact full-row duplicates (same check and message as before).
duplicate_count = companies_df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicate_count}")

# A full-row comparison can never match while some column holds a different
# value on every row (a row identifier, for instance), so repeat the check
# with those columns left out to surface records that are otherwise identical.
row_unique_columns = [col for col in companies_df.columns if companies_df[col].is_unique]
if row_unique_columns and len(row_unique_columns) < companies_df.shape[1]:
    content_duplicate_count = companies_df.drop(columns=row_unique_columns).duplicated().sum()
    print(f"Number of duplicate rows ignoring row-unique columns {row_unique_columns}: "
          f"{content_duplicate_count}")


Number of duplicate rows: 0


In [9]:
# Examine data types and convert date columns to datetime format
print("\nData types before conversion:")
print(companies_df.dtypes)

# Identify date columns
date_columns = ['founded_at', 'closed_at', 'first_investment_at', 'last_investment_at',
                'first_funding_at', 'last_funding_at', 'first_milestone_at',
                'last_milestone_at', 'created_at', 'updated_at']

# Convert date columns to datetime (in place: later cells read these columns
# from companies_df). errors='coerce' turns anything unparseable into NaT, so
# compare the non-null counts before and after and report any loss instead of
# letting values disappear silently.
for col in date_columns:
    if col in companies_df.columns:
        non_null_before = companies_df[col].notna().sum()
        companies_df[col] = pd.to_datetime(companies_df[col], errors='coerce')
        coerced_to_nat = non_null_before - companies_df[col].notna().sum()
        if coerced_to_nat > 0:
            print(f"Warning: {coerced_to_nat} value(s) in '{col}' could not be parsed "
                  f"as dates and were set to NaT")

print("\nData types after date conversion:")
print(companies_df.dtypes)


Data types before conversion:
id                      object
Unnamed: 0.1             int64
entity_type             object
entity_id                int64
parent_id              float64
name                    object
normalized_name         object
permalink               object
category_code           object
status                  object
founded_at              object
closed_at               object
domain                  object
homepage_url            object
twitter_username        object
logo_url                object
logo_width             float64
logo_height            float64
short_description       object
description             object
overview                object
tag_list                object
country_code            object
state_code              object
city                    object
region                  object
first_investment_at     object
last_investment_at      object
investment_rounds      float64
invested_companies     float64
first_funding_at        object
last_fun

In [10]:
# Count the nulls in every column and express them as a share of all rows
missing_values = companies_df.isna().sum()
missing_percentage = (missing_values / len(companies_df)) * 100

# Put both measures side by side in one table
missing_info = pd.concat(
    [missing_values.rename('Missing Values'), missing_percentage.rename('Missing Percentage')],
    axis=1,
)

# Only columns that actually have gaps are shown, worst first
has_missing = missing_info['Missing Values'] > 0
print("\nMissing values analysis:")
print(missing_info.loc[has_missing].sort_values('Missing Percentage', ascending=False))



Missing values analysis:
                     Missing Values  Missing Percentage
parent_id                    196553              100.00
ROI                          195827               99.63
first_investment_at          193970               98.69
last_investment_at           193970               98.69
investment_rounds            193962               98.68
invested_companies           193962               98.68
closed_at                    193933               98.67
short_description            189422               96.37
funding_total_usd            168679               85.82
first_funding_at             165046               83.97
last_funding_at              165046               83.97
funding_rounds               164846               83.87
state_code                   145650               74.10
twitter_username             115962               59.00
tag_list                     115101               58.56
lat                          112701               57.34
lng                   

In [11]:
# Descriptive statistics limited to the numeric columns
numeric_dtypes = [np.number]
numerical_summary = companies_df.describe(include=numeric_dtypes)
print("\nSummary statistics for numerical columns:", numerical_summary, sep="\n")


Summary statistics for numerical columns:
       Unnamed: 0.1  entity_id  parent_id  logo_width  logo_height  investment_rounds  invested_companies  funding_rounds  funding_total_usd  milestones  relationships      lat      lng      ROI
count     196553.00  196553.00       0.00   110110.00    110110.00            2591.00             2591.00        31707.00           27874.00    91699.00      129667.00 83852.00 83852.00   726.00
mean       98276.00  153006.23        NaN      459.13       222.73               2.37                2.20            1.66        14816520.42        1.20           2.85    37.56   -52.12    45.75
std        56740.11   90209.25        NaN      594.98       333.09              12.17               11.44            1.20        67759366.56        0.54           9.10    15.48    70.05   572.04
min            0.00       1.00        NaN        1.00         1.00               1.00                1.00            1.00             291.00        1.00           1.00   -50.94 

In [12]:
# Analyze categorical columns
categorical_columns = companies_df.select_dtypes(include=['object']).columns

# Walk each column once: value_counts() and isna().sum() are full passes over
# the data, so compute them a single time per column and reuse the results.
unique_values = []
most_common_values = []
most_common_counts = []
missing_counts = []
for col in categorical_columns:
    counts = companies_df[col].value_counts()  # NaN excluded, most frequent first
    unique_values.append(companies_df[col].nunique())
    # An empty result means the column holds no non-null value at all
    most_common_values.append(counts.index[0] if len(counts) > 0 else 'N/A')
    most_common_counts.append(counts.iloc[0] if len(counts) > 0 else 0)
    missing_counts.append(companies_df[col].isna().sum())

categorical_summary = pd.DataFrame({
    'Column': categorical_columns,
    'Unique Values': unique_values,
    'Most Common Value': most_common_values,
    'Most Common Count': most_common_counts,
    'Missing Values': missing_counts,
    'Missing Percentage': [n_missing / len(companies_df) * 100 for n_missing in missing_counts]
})

print("\nSummary of categorical columns:")
print(categorical_summary)


Summary of categorical columns:
               Column  Unique Values                                  Most Common Value  Most Common Count  Missing Values  Missing Percentage
0                  id         196553                                             c:9998                  1               0                0.00
1         entity_type              1                                            Company             196553               0                0.00
2                name         196347                                                  L                  8              23                0.01
3     normalized_name         195451                                                  l                  9              26                0.01
4           permalink         196512                                 /company/bboescape                  3               0                0.00
5       category_code             42                                           software              17922   

In [5]:
# Status breakdown: absolute counts first, then each status as a share of the total
if 'status' in companies_df.columns:
    status_counts = companies_df['status'].value_counts()
    status_percentage = (status_counts / status_counts.sum() * 100).round(2)

    for heading, table in (
        ("\nDistribution of company statuses:", status_counts),
        ("\nPercentage distribution of company statuses:", status_percentage),
    ):
        print(heading)
        print(table)


Distribution of company statuses:
status
operating    183441
acquired       9394
closed         2584
ipo            1134
Name: count, dtype: int64

Percentage distribution of company statuses:
status
operating   93.33
acquired     4.78
closed       1.31
ipo          0.58
Name: count, dtype: float64


In [14]:
# Industry mix: the 20 most frequent category codes
if 'category_code' in companies_df.columns:
    category_counts = companies_df['category_code'].value_counts().iloc[:20]
    print("\nTop 20 industry categories:", category_counts, sep="\n")


Top 20 industry categories:
category_code
software            17922
web                 15118
other               13617
ecommerce            9065
games_video          7520
mobile               6862
advertising          6098
consulting           5006
enterprise           4441
biotech              4430
hardware             2951
education            2901
public_relations     2846
network_hosting      2350
search               2182
cleantech            1940
health               1698
finance              1386
social               1310
security             1171
Name: count, dtype: int64


In [15]:
# Descriptive statistics for each funding column that exists in the frame
funding_columns = ['funding_rounds', 'funding_total_usd']
for col in funding_columns:
    if col not in companies_df.columns:
        continue
    column_stats = companies_df[col].describe()
    print(f"\nSummary statistics for {col}:")
    print(column_stats)



Summary statistics for funding_rounds:
count   31707.00
mean        1.66
std         1.20
min         1.00
25%         1.00
50%         1.00
75%         2.00
max        15.00
Name: funding_rounds, dtype: float64

Summary statistics for funding_total_usd:
count        27874.00
mean      14816520.42
std       67759366.56
min            291.00
25%         500000.00
50%        2564500.00
75%       11000000.00
max     5700000000.00
Name: funding_total_usd, dtype: float64


In [16]:
# ROI as a candidate target: its distribution and how many companies report it
if 'ROI' in companies_df.columns:
    roi_series = companies_df['ROI']
    print("\nSummary statistics for ROI:")
    print(roi_series.describe())

    # Coverage: rows with a non-null ROI
    roi_count = roi_series.notna().sum()
    print(f"\nNumber of companies with ROI data: {roi_count} ({roi_count/len(companies_df)*100:.2f}%)")


Summary statistics for ROI:
count     726.00
mean       45.75
std       572.04
min         0.01
25%         2.65
50%         6.50
75%        13.55
max     13333.33
Name: ROI, dtype: float64

Number of companies with ROI data: 726 (0.37%)


In [17]:
# Ten most frequent values for every geographic column that has data
geo_columns = ['country_code', 'state_code', 'city', 'region']
for col in geo_columns:
    if col not in companies_df.columns:
        continue
    if not companies_df[col].notna().any():
        # Column exists but is entirely null: nothing to rank
        continue
    top_locations = companies_df[col].value_counts().head(10)
    print(f"\nTop 10 {col} values:")
    print(top_locations)



Top 10 country_code values:
country_code
USA    51637
GBR     7372
IND     3924
CAN     3728
DEU     1921
FRA     1652
AUS     1455
ESP     1100
IRL     1079
ISR     1042
Name: count, dtype: int64

Top 10 state_code values:
state_code
CA    16489
NY     5732
MA     2937
TX     2811
FL     2154
WA     1895
IL     1742
PA     1322
NJ     1182
CO     1173
Name: count, dtype: int64

Top 10 city values:
city
New York         3816
San Francisco    3613
London           3030
Los Angeles      1068
Chicago          1026
Seattle           938
Austin            905
San Diego         810
Palo Alto         788
Toronto           739
Name: count, dtype: int64

Top 10 region values:
region
unknown          109866
SF Bay            10173
New York           5174
London             4065
Los Angeles        4057
Boston             2770
Washington DC      1802
Seattle            1620
Chicago            1611
San Diego          1196
Name: count, dtype: int64


In [18]:
# Create a comprehensive summary DataFrame for export
# One record per column: dtype, missingness, cardinality, plus min/max/mean for
# numeric columns or the most frequent value for everything else.
column_summary = []

for column in companies_df.columns:
    series = companies_df[column]
    missing_count = series.isna().sum()
    missing_pct = (missing_count / len(companies_df)) * 100
    unique_count = series.nunique()

    # Defaults; each branch fills in only the statistics that apply to it
    min_val = max_val = mean_val = None
    most_common = most_common_count = None

    if pd.api.types.is_numeric_dtype(series):
        # Each aggregate is a full pass over the column, so compute it once;
        # a NaN result (column with no values) is stored as None.
        col_min, col_max, col_mean = series.min(), series.max(), series.mean()
        min_val = None if pd.isna(col_min) else col_min
        max_val = None if pd.isna(col_max) else col_max
        mean_val = None if pd.isna(col_mean) else col_mean
    else:
        # value_counts() drops nulls and sorts by frequency; an empty result
        # means the column has no non-null value.
        counts = series.value_counts()
        if len(counts) > 0:
            most_common = str(counts.index[0])
            most_common_count = counts.iloc[0]

    column_summary.append({
        'Column': column,
        'Data Type': str(series.dtype),
        'Missing Values': missing_count,
        'Missing Percentage': missing_pct,
        'Unique Values': unique_count,
        'Min': min_val,
        'Max': max_val,
        'Mean': mean_val,
        'Most Common Value': most_common,
        'Most Common Count': most_common_count
    })

summary_df = pd.DataFrame(column_summary)
print("\nComprehensive column summary created successfully.")


Comprehensive column summary created successfully.


In [19]:
# Persist the per-column summary table, without the positional index
summary_df.to_csv(path_or_buf='eda_summary.csv', index=False)
print("Summary statistics saved to 'eda_summary.csv'")

Summary statistics saved to 'eda_summary.csv'


In [20]:
# Create visualizations for the PDF report
# Progress message shown before the report function below is defined and called.
print("\nGenerating visualizations for the PDF report...")

# Function to create visualizations
def create_visualizations(df, pdf_path):
    """Render the EDA charts for ``df`` into a multi-page PDF at ``pdf_path``.

    Pages are written in a fixed order; a chart is skipped when the column(s)
    it needs are absent or hold no usable values: title page, status counts
    (bar and pie), top 15 categories, funding rounds, total funding (log10),
    ROI (histogram and boxplot), top 10 countries, founding years, missing
    value percentages, correlation heatmap of numeric columns, funding vs.
    ROI scatter, and status by top 10 categories.

    Every figure is created only once its data is known to be non-empty and
    is closed right after being saved, so no open figure is left behind.

    Args:
        df: pandas DataFrame to visualise.
        pdf_path: Destination of the PDF; passed to ``PdfPages``.

    Side effects:
        Writes the PDF, prints the number of pages written, and switches the
        global matplotlib style to 'seaborn-v0_8-whitegrid'.
    """
    with PdfPages(pdf_path) as pdf:
        # Set the style for all plots
        plt.style.use('seaborn-v0_8-whitegrid')

        def save_page(fig, tight=True):
            """Append ``fig`` to the PDF as one page, then close it."""
            if tight:
                fig.tight_layout()
            pdf.savefig(fig)
            plt.close(fig)

        def bar_page(counts, figsize, color, title, xlabel, ylabel, rotation):
            """Write one bar-chart page for the Series ``counts``."""
            fig, ax = plt.subplots(figsize=figsize)
            counts.plot(kind='bar', color=color, ax=ax)
            ax.set_title(title)
            ax.set_xlabel(xlabel)
            ax.set_ylabel(ylabel)
            ax.tick_params(axis='x', labelrotation=rotation)
            save_page(fig)

        # 1. Title page
        fig, ax = plt.subplots(figsize=(8.5, 11))
        ax.text(0.5, 0.5, 'Exploratory Data Analysis\nCrunchbase Companies Dataset',
                horizontalalignment='center', verticalalignment='center', fontsize=24)
        ax.text(0.5, 0.4, f'Dataset contains {len(df):,} companies with {df.shape[1]} features',
                horizontalalignment='center', verticalalignment='center', fontsize=16)
        ax.axis('off')
        save_page(fig, tight=False)

        # 2. Company status distribution
        if 'status' in df.columns:
            status_counts = df['status'].value_counts()
            if len(status_counts) > 0:
                bar_page(status_counts, (10, 6), 'skyblue',
                         'Distribution of Company Statuses', 'Status', 'Count', 45)

                # Pie chart of status
                fig, ax = plt.subplots(figsize=(10, 6))
                status_counts.plot(kind='pie', autopct='%1.1f%%', startangle=90, ax=ax)
                ax.set_title('Percentage Distribution of Company Statuses')
                ax.set_ylabel('')
                save_page(fig)

        # 3. Category distribution (top 15)
        if 'category_code' in df.columns:
            top_categories = df['category_code'].value_counts().head(15)
            if len(top_categories) > 0:
                bar_page(top_categories, (12, 6), 'lightgreen',
                         'Top 15 Industry Categories', 'Category', 'Count', 45)

        # 4. Funding rounds distribution
        if 'funding_rounds' in df.columns:
            # Filter out NaN values
            funding_data = df['funding_rounds'].dropna()
            if len(funding_data) > 0:
                fig, ax = plt.subplots(figsize=(10, 6))
                sns.histplot(funding_data, bins=10, kde=True, ax=ax)
                ax.set_title('Distribution of Funding Rounds')
                ax.set_xlabel('Number of Funding Rounds')
                ax.set_ylabel('Count')
                save_page(fig)

        # 5. Total funding distribution (log scale)
        if 'funding_total_usd' in df.columns:
            # Filter out NaN and zero values for log scale
            funding_total = df['funding_total_usd'].dropna()
            funding_total = funding_total[funding_total > 0]

            if len(funding_total) > 0:
                fig, ax = plt.subplots(figsize=(10, 6))
                ax.hist(np.log10(funding_total), bins=20, color='salmon')
                ax.set_title('Distribution of Total Funding (Log Scale)')
                ax.set_xlabel('Log10(Total Funding in USD)')
                ax.set_ylabel('Count')
                save_page(fig)

        # 6. ROI distribution
        if 'ROI' in df.columns:
            roi_data = df['ROI'].dropna()

            if len(roi_data) > 0:
                fig, ax = plt.subplots(figsize=(10, 6))
                sns.histplot(roi_data, bins=20, kde=True, ax=ax)
                ax.set_title('Distribution of ROI')
                ax.set_xlabel('ROI')
                ax.set_ylabel('Count')
                save_page(fig)

                # ROI boxplot
                fig, ax = plt.subplots(figsize=(10, 6))
                sns.boxplot(x=roi_data, ax=ax)
                ax.set_title('Boxplot of ROI')
                ax.set_xlabel('ROI')
                save_page(fig)

        # 7. Geographic distribution - Countries
        if 'country_code' in df.columns:
            top_countries = df['country_code'].value_counts().head(10)
            if len(top_countries) > 0:
                bar_page(top_countries, (12, 6), 'lightblue',
                         'Top 10 Countries', 'Country Code', 'Count', 45)

        # 8. Company founding years distribution
        if 'founded_at' in df.columns:
            # Coerce here as well so the function also works on a frame whose
            # date columns are still strings; already-datetime data is unchanged.
            founded_years = pd.to_datetime(df['founded_at'], errors='coerce').dropna().dt.year

            if len(founded_years) > 0:
                fig, ax = plt.subplots(figsize=(12, 6))
                ax.hist(founded_years, bins=30, color='lightgreen')
                ax.set_title('Distribution of Company Founding Years')
                ax.set_xlabel('Year')
                ax.set_ylabel('Count')
                save_page(fig)

        # 9. Missing values visualization
        missing_data = df.isnull().sum() / len(df) * 100
        missing_data = missing_data[missing_data > 0].sort_values(ascending=False)

        if len(missing_data) > 0:
            bar_page(missing_data, (14, 8), 'coral',
                     'Percentage of Missing Values by Column', 'Columns',
                     'Percentage Missing', 90)

        # 10. Correlation heatmap for numerical columns
        # Drop columns with all NaN values
        numerical_df = df.select_dtypes(include=[np.number]).dropna(axis=1, how='all')

        if numerical_df.shape[1] > 1:  # Only create heatmap if we have at least 2 numerical columns
            correlation = numerical_df.corr()
            # Hide the upper triangle (and diagonal): it mirrors the lower one
            mask = np.triu(np.ones_like(correlation, dtype=bool))
            fig, ax = plt.subplots(figsize=(14, 10))
            sns.heatmap(correlation, mask=mask, annot=True, fmt=".2f", cmap='coolwarm',
                        linewidths=0.5, cbar_kws={"shrink": .8}, ax=ax)
            ax.set_title('Correlation Heatmap of Numerical Features')
            save_page(fig)

        # 11. Relationship between funding and ROI (if both exist)
        if 'funding_total_usd' in df.columns and 'ROI' in df.columns:
            # Filter for non-null values in both columns
            funding_roi_df = df[['funding_total_usd', 'ROI']].dropna()

            if len(funding_roi_df) > 0:
                # For better visualization, filter out extreme values
                q_low = funding_roi_df['ROI'].quantile(0.01)
                q_high = funding_roi_df['ROI'].quantile(0.99)
                filtered_df = funding_roi_df[(funding_roi_df['ROI'] >= q_low) &
                                             (funding_roi_df['ROI'] <= q_high) &
                                             (funding_roi_df['funding_total_usd'] > 0)]

                if len(filtered_df) > 0:
                    fig, ax = plt.subplots(figsize=(10, 6))
                    ax.scatter(np.log10(filtered_df['funding_total_usd']), filtered_df['ROI'], alpha=0.5)
                    ax.set_title('Relationship Between Total Funding and ROI')
                    ax.set_xlabel('Log10(Total Funding in USD)')
                    ax.set_ylabel('ROI')
                    save_page(fig)

        # 12. Status by category (top 10 categories)
        if 'category_code' in df.columns and 'status' in df.columns:
            # Get top 10 categories
            top_cats = df['category_code'].value_counts().head(10).index

            # Filter for those categories
            cat_status_df = df[df['category_code'].isin(top_cats)]

            if len(cat_status_df) > 0:
                # Create a crosstab
                cat_status = pd.crosstab(cat_status_df['category_code'], cat_status_df['status'])

                # Plot stacked bar chart on an explicit axes: DataFrame.plot
                # without ax= opens its own figure, which previously left an
                # empty pre-created figure open after the function returned.
                fig, ax = plt.subplots(figsize=(14, 8))
                cat_status.plot(kind='bar', stacked=True, ax=ax)
                ax.set_title('Company Status by Top 10 Categories')
                ax.set_xlabel('Category')
                ax.set_ylabel('Count')
                ax.tick_params(axis='x', labelrotation=45)
                ax.legend(title='Status')
                save_page(fig)

        print(f"PDF report with {pdf.get_pagecount()} pages created successfully.")

# Build the PDF report from the full companies frame
create_visualizations(df=companies_df, pdf_path='eda_visualizations.pdf')
print("Visualizations saved to 'eda_visualizations.pdf'")


Generating visualizations for the PDF report...
PDF report with 14 pages created successfully.
Visualizations saved to 'eda_visualizations.pdf'


<Figure size 1400x800 with 0 Axes>

In [21]:
# Final summary of findings
print("\n=== SUMMARY OF INITIAL DATA EXPLORATION ===")
print(f"Total number of companies in the dataset: {companies_df.shape[0]}")
print(f"Total number of features: {companies_df.shape[1]}")

# Count companies by status
if 'status' in companies_df.columns:
    print("\nCompany Status Distribution:")
    for status, count in companies_df['status'].value_counts().items():
        print(f"  - {status}: {count} companies ({count/len(companies_df)*100:.2f}%)")

# Funding summary (the average covers only companies with a funding value)
if 'funding_total_usd' in companies_df.columns:
    funded_companies = companies_df['funding_total_usd'].notna().sum()
    avg_funding = companies_df['funding_total_usd'].mean()
    print("\nFunding Summary:")
    print(f"  - Companies with funding data: {funded_companies} ({funded_companies/len(companies_df)*100:.2f}%)")
    print(f"  - Average funding amount: ${avg_funding:,.2f}")

# ROI summary (the average covers only companies with an ROI value)
if 'ROI' in companies_df.columns:
    roi_companies = companies_df['ROI'].notna().sum()
    avg_roi = companies_df['ROI'].mean()
    print("\nROI Summary:")
    print(f"  - Companies with ROI data: {roi_companies} ({roi_companies/len(companies_df)*100:.2f}%)")
    print(f"  - Average ROI: {avg_roi:.2f}")

# Geographic summary
if 'country_code' in companies_df.columns:
    countries = companies_df['country_code'].notna().sum()
    # Count once and reuse; value_counts() is sorted so the first entry is the top country
    country_counts = companies_df['country_code'].value_counts()
    if country_counts.shape[0] > 0:
        top_country = country_counts.index[0]
        top_country_count = country_counts.iloc[0]
    else:
        top_country = "Unknown"
        top_country_count = 0
    print("\nGeographic Summary:")
    print(f"  - Companies with country data: {countries} ({countries/len(companies_df)*100:.2f}%)")
    print(f"  - Most common country: {top_country} with {top_country_count} companies")

print("\nPotential target variables for defining 'success':")
print("  1. Status (e.g., 'acquired', 'ipo' as success indicators)")
print("  2. ROI (Return on Investment)")
print("  3. Funding amount (total funding raised)")
print("  4. Longevity (time between founding and current/closing date)")
print("  5. Combination of the above factors")

print("\nEDA completed successfully. Results saved to 'eda_summary.csv' and 'eda_visualizations.pdf'")


=== SUMMARY OF INITIAL DATA EXPLORATION ===
Total number of companies in the dataset: 196553
Total number of features: 44

Company Status Distribution:
  - operating: 183441 companies (93.33%)
  - acquired: 9394 companies (4.78%)
  - closed: 2584 companies (1.31%)
  - ipo: 1134 companies (0.58%)

Funding Summary:
  - Companies with funding data: 27874 (14.18%)
  - Average funding amount: $14,816,520.42

ROI Summary:
  - Companies with ROI data: 726 (0.37%)
  - Average ROI: 45.75

Geographic Summary:
  - Companies with country data: 87990 (44.77%)
  - Most common country: USA with 51637 companies

Potential target variables for defining 'success':
  1. Status (e.g., 'acquired', 'ipo' as success indicators)
  2. ROI (Return on Investment)
  3. Funding amount (total funding raised)
  4. Longevity (time between founding and current/closing date)
  5. Combination of the above factors

EDA completed successfully. Results saved to 'eda_summary.csv' and 'eda_visualizations.pdf'
