# Data Cleaning - all companies
Clean the dataset by handling missing values, removing duplicates, correcting data types. This step ensures the data is ready for analysis and modeling. Missing values in the resulting dataframe were handled by replacing with 0.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime
import os

# Set display options for better readability
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:.2f}'.format)

In [5]:
companies_df = pd.read_csv('/Users/aminosaurier/Downloads/spring_2025_startup_survival_large_files/companies.csv', sep=',', header=0)

# Check for missing values
missing_values = companies_df.isnull().sum()
missing_percentage = (missing_values / len(companies_df)) * 100

# Create a DataFrame to display missing values information
missing_info = pd.DataFrame({
    'Missing Values': missing_values,
    'Missing Percentage': missing_percentage
})

print("\nMissing values analysis:")
print(missing_info[missing_info['Missing Values'] > 0].sort_values('Missing Percentage', ascending=False))



Missing values analysis:
                     Missing Values  Missing Percentage
parent_id                    196553              100.00
ROI                          195827               99.63
first_investment_at          193970               98.69
last_investment_at           193970               98.69
investment_rounds            193962               98.68
invested_companies           193962               98.68
closed_at                    193933               98.67
short_description            189422               96.37
funding_total_usd            168679               85.82
first_funding_at             165046               83.97
last_funding_at              165046               83.97
funding_rounds               164846               83.87
state_code                   145650               74.10
twitter_username             115962               59.00
tag_list                     115101               58.56
lat                          112701               57.34
lng                   

In [7]:
# Identify columns to drop based on high missing percentage or redundancy
print("\nIdentifying columns to drop...")

# Columns with very high missing values (>95%) except 'closed_at'
high_missing_cols = missing_info[missing_info['Missing Percentage'] > 95].index.tolist()
print(f"Columns with >95% missing values: {high_missing_cols}")

# Redundant identifier columns
redundant_cols = ['Unnamed: 0.1']  # This appears to be just a row index
print(f"Redundant columns: {redundant_cols}")

# Columns with limited analytical value and categorical cols
limited_value_cols = ['entity_id', 'id', 'created_by', 'lat', 'domain', 'twitter_username', 'homepage_url', 'permalink', 'normalized_name', 'lng', 'tag_list', 'overview', 'description', 'short_description', 'logo_url', 'logo_width', 'logo_height', 'parent_id']
print(f"Columns with limited analytical value: {limited_value_cols}")

# Combine all columns to drop
columns_to_drop = high_missing_cols + redundant_cols + limited_value_cols
print(f"\nTotal columns to drop: {len(columns_to_drop)}")
print(columns_to_drop)

# Drop the identified columns
df = pd.read_csv('/Users/aminosaurier/Downloads/spring_2025_startup_survival_large_files/companies.csv', sep=',', header=0)
df_cleaned = df.drop(columns=columns_to_drop, errors='ignore')
print(f"\nShape after dropping columns: {df_cleaned.shape}")


Identifying columns to drop...
Columns with >95% missing values: ['parent_id', 'short_description', 'first_investment_at', 'last_investment_at', 'investment_rounds', 'invested_companies', 'ROI']
Redundant columns: ['Unnamed: 0.1']
Columns with limited analytical value: ['entity_id', 'id', 'created_by', 'lat', 'domain', 'twitter_username', 'homepage_url', 'permalink', 'normalized_name', 'lng', 'tag_list', 'overview', 'description', 'short_description', 'logo_url', 'logo_width', 'logo_height', 'parent_id']

Total columns to drop: 26
['parent_id', 'short_description', 'first_investment_at', 'last_investment_at', 'investment_rounds', 'invested_companies', 'ROI', 'Unnamed: 0.1', 'entity_id', 'id', 'created_by', 'lat', 'domain', 'twitter_username', 'homepage_url', 'permalink', 'normalized_name', 'lng', 'tag_list', 'overview', 'description', 'short_description', 'logo_url', 'logo_width', 'logo_height', 'parent_id']

Shape after dropping columns: (196553, 20)


In [8]:
# Check for outliers in numeric columns
print("\nChecking for outliers in numeric columns...")

# Function to detect outliers using IQR method
def detect_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)][column]
    return outliers.count()

# Check outliers in key numeric columns
numeric_cols_for_outliers = ['funding_total_usd', 'funding_rounds', 'relationships', 'age_years']
for col in numeric_cols_for_outliers:
    if col in df_cleaned.columns and df_cleaned[col].notnull().sum() > 0:
        outlier_count = detect_outliers(df_cleaned, col)
        print(f"Found {outlier_count} outliers in {col}")

# For funding_total_usd, cap extreme values at 99.9 percentile
if 'funding_total_usd' in df_cleaned.columns:
    cap_value = df_cleaned['funding_total_usd'].quantile(0.999)
    extreme_values = df_cleaned['funding_total_usd'] > cap_value
    if extreme_values.sum() > 0:
        print(f"Capping {extreme_values.sum()} extreme funding values at {cap_value}")
        df_cleaned.loc[extreme_values, 'funding_total_usd'] = cap_value


Checking for outliers in numeric columns...
Found 3468 outliers in funding_total_usd
Found 2369 outliers in funding_rounds
Found 9753 outliers in relationships
Capping 28 extreme funding values at 687250706.522175


In [9]:
# Calculate company age and add as a new column 'age'
# Define time_now as the end of 2014
time_now = pd.to_datetime('2014-12-31') # the last founding date in the dataset is 2014-09-30

# Convert date columns to datetime
df_cleaned['founded_at'] = pd.to_datetime(df_cleaned['founded_at'], errors='coerce')
df_cleaned['closed_at'] = pd.to_datetime(df_cleaned['closed_at'], errors='coerce')

# Create age column
df_cleaned['age'] = np.nan

# Calculate age for companies that have closed
closed_mask = ~df_cleaned['closed_at'].isna() & ~df_cleaned['founded_at'].isna()
df_cleaned.loc[closed_mask, 'age'] = (df_cleaned.loc[closed_mask, 'closed_at'] - 
                                     df_cleaned.loc[closed_mask, 'founded_at']).dt.days / 365.25

# Calculate age for companies still operating
operating_mask = df_cleaned['closed_at'].isna() & ~df_cleaned['founded_at'].isna()
df_cleaned.loc[operating_mask, 'age'] = (time_now - 
                                        df_cleaned.loc[operating_mask, 'founded_at']).dt.days / 365.25

# Calculate median age for companies with valid age
median_age = df_cleaned['age'].median()

# Handle case where median might be NaN
if pd.isna(median_age):
    median_age = 5.0  # Default reasonable value
    print(f"No valid ages found. Using default median age of {median_age:.2f} years")
else:
    print(f"Using median age of {median_age:.2f} years for companies with missing founding dates")

# Assign median age to companies with missing founded_at
missing_founded_mask = df_cleaned['founded_at'].isna()
df_cleaned.loc[missing_founded_mask, 'age'] = median_age

# Round age to 2 decimal places for readability
df_cleaned['age'] = df_cleaned['age'].round(2)

# Display age statistics
print("\nCompany age statistics:")
print(df_cleaned['age'].describe())

# Count of companies by age calculation method
print("\nAge calculation breakdown:")
print(f"Companies with closed date: {closed_mask.sum()}")
print(f"Active companies with founding date: {operating_mask.sum()}")
print(f"Companies using median age: {missing_founded_mask.sum()}")

Using median age of 6.00 years for companies with missing founding dates

Company age statistics:
count   196553.00
mean         7.37
std          6.91
min        -40.31
25%          6.00
50%          6.00
75%          6.00
max        114.00
Name: age, dtype: float64

Age calculation breakdown:
Companies with closed date: 2011
Active companies with founding date: 89216
Companies using median age: 105326


In [10]:
# Find all rows with negative age values
negative_age_rows = df_cleaned[df_cleaned['age'] < 0]

# Display the count
print(f"Found {len(negative_age_rows)} rows with negative age values")

# Display the negative age rows
print("\nRows with negative age values:")
print(negative_age_rows)

# Optional: Display just the key date columns to understand the issue
if not negative_age_rows.empty:
    print("\nKey date information for negative age rows:")
    for idx, row in negative_age_rows.iterrows():
        print(f"\nRow index: {idx}")
        print(f"Company name: {row.get('name', 'N/A')}")
        print(f"Age: {row['age']}")
        print(f"Founded at: {row['founded_at']}")
        print(f"Closed at: {row['closed_at']}")

Found 44 rows with negative age values

Rows with negative age values:
       entity_type                          name    category_code    status founded_at  closed_at country_code state_code           city              region first_funding_at last_funding_at  funding_rounds  funding_total_usd first_milestone_at last_milestone_at  milestones  relationships           created_at           updated_at    age
936        Company                         Peers              web    closed 2011-09-01 2011-01-01          NaN        NaN            NaN             unknown              NaN             NaN             NaN                NaN                NaN               NaN         NaN            NaN  2011-09-14 18:25:19  2013-08-06 21:44:09  -0.67
3624       Company                        Zooomr              web    closed 2006-03-01 2006-01-01          USA         CA  San Francisco              SF Bay       2006-02-01      2006-02-01            1.00           50000.00                NaN          

In [11]:
# Removing companies with negative age values
# Find rows with negative age values
negative_age_rows = df_cleaned[df_cleaned['age'] < 0]
print(f"Found {len(negative_age_rows)} rows with negative age values")

# Store the original length of the dataset
original_length = len(df_cleaned)

# Remove rows with negative age values
df_cleaned = df_cleaned[df_cleaned['age'] >= 0]

# Confirm removal
print(f"Removed {original_length - len(df_cleaned)} rows with negative ages")
print(f"Dataset now has {len(df_cleaned)} rows")
print(f"New minimum age: {df_cleaned['age'].min()}")

# Display age statistics after removal
print("\nCompany age statistics after removing negative values:")
print(df_cleaned['age'].describe())

Found 44 rows with negative age values
Removed 44 rows with negative ages
Dataset now has 196509 rows
New minimum age: 0.0

Company age statistics after removing negative values:
count   196509.00
mean         7.37
std          6.91
min          0.00
25%          6.00
50%          6.00
75%          6.00
max        114.00
Name: age, dtype: float64


In [12]:
# Select only the specified columns
columns_to_keep = ['status', 'funding_rounds', 'funding_total_usd', 'milestones', 'relationships', 'age']

# Create a new dataframe with only these columns
df_cleaned = df_cleaned[columns_to_keep]

# Display information about the resulting dataframe
print(f"DataFrame now has {len(df_cleaned)} rows and {len(df_cleaned.columns)} columns")
print("\nColumns in the dataframe:")
print(df_cleaned.columns.tolist())

# Display first few rows of the resulting dataframe
print("\nFirst 5 rows of the dataframe:")
print(df_cleaned.head())

# Display basic statistics
print("\nSummary statistics:")
print(df_cleaned.describe())

DataFrame now has 196509 rows and 6 columns

Columns in the dataframe:
['status', 'funding_rounds', 'funding_total_usd', 'milestones', 'relationships', 'age']

First 5 rows of the dataframe:
      status  funding_rounds  funding_total_usd  milestones  relationships  age
0  operating            3.00        39750000.00        5.00          17.00 9.20
1   acquired             NaN                NaN         NaN           6.00 6.00
2   acquired             NaN                NaN        4.00          12.00 6.00
3  operating             NaN                NaN         NaN            NaN 6.43
4  operating             NaN                NaN         NaN            NaN 6.43

Summary statistics:
       funding_rounds  funding_total_usd  milestones  relationships       age
count        31678.00           27852.00    91674.00      129637.00 196509.00
mean             1.66        14054633.06        1.20           2.85      7.37
std              1.20        42210824.21        0.54           9.10      6

In [13]:
# Save the cleaned dataset
print("\nSaving cleaned dataset...")
df_cleaned.to_csv('all_cleaned_companies.csv', index=False)
print("Saved cleaned dataset to 'all_cleaned_companies.csv'")


Saving cleaned dataset...
Saved cleaned dataset to 'all_cleaned_companies.csv'


Printing the missing percentages from the targeted columns

In [14]:
# Check for missing values
missing_values = df_cleaned.isnull().sum()
missing_percentage = (missing_values / len(df_cleaned)) * 100

# Create a DataFrame to display missing values information
missing_info = pd.DataFrame({
    'Missing Values': missing_values,
    'Missing Percentage': missing_percentage
})

print("\nMissing values analysis:")
print(missing_info[missing_info['Missing Values'] > 0].sort_values('Missing Percentage', ascending=False))



Missing values analysis:
                   Missing Values  Missing Percentage
funding_total_usd          168657               85.83
funding_rounds             164831               83.88
milestones                 104835               53.35
relationships               66872               34.03


In [15]:
df_cleaned.head(5)

Unnamed: 0,status,funding_rounds,funding_total_usd,milestones,relationships,age
0,operating,3.0,39750000.0,5.0,17.0,9.2
1,acquired,,,,6.0,6.0
2,acquired,,,4.0,12.0,6.0
3,operating,,,,,6.43
4,operating,,,,,6.43


In [16]:
df_cleaned['funding_total_usd'] = df_cleaned['funding_total_usd'].fillna(0)
df_cleaned['funding_rounds'] = df_cleaned['funding_rounds'].fillna(0)

In [17]:
df_cleaned.head(10)

Unnamed: 0,status,funding_rounds,funding_total_usd,milestones,relationships,age
0,operating,3.0,39750000.0,5.0,17.0,9.2
1,acquired,0.0,0.0,,6.0,6.0
2,acquired,0.0,0.0,4.0,12.0,6.0
3,operating,0.0,0.0,,,6.43
4,operating,0.0,0.0,,,6.43
5,operating,0.0,0.0,,2.0,7.51
6,operating,0.0,0.0,,,6.36
7,operating,0.0,0.0,1.0,1.0,6.0
8,operating,0.0,0.0,1.0,2.0,6.0
9,operating,0.0,0.0,1.0,2.0,3.42


In [18]:
# Check for missing values
missing_values = df_cleaned.isnull().sum()
missing_percentage = (missing_values / len(df_cleaned)) * 100

# Create a DataFrame to display missing values information
missing_info = pd.DataFrame({
    'Missing Values': missing_values,
    'Missing Percentage': missing_percentage
})

print("\nMissing values analysis:")
print(missing_info[missing_info['Missing Values'] > 0].sort_values('Missing Percentage', ascending=False))



Missing values analysis:
               Missing Values  Missing Percentage
milestones             104835               53.35
relationships           66872               34.03


In [19]:
df_cleaned['milestones'] = df_cleaned['milestones'].fillna(0)

In [20]:
# Display information about the resulting dataframe
print(f"DataFrame now has {len(df_cleaned)} rows and {len(df_cleaned.columns)} columns")
print("\nColumns in the dataframe:")
print(df_cleaned.columns.tolist())

# Display first few rows of the resulting dataframe
print("\nFirst 5 rows of the dataframe:")
print(df_cleaned.head())

# Display basic statistics
print("\nSummary statistics:")
print(df_cleaned.describe())

DataFrame now has 196509 rows and 6 columns

Columns in the dataframe:
['status', 'funding_rounds', 'funding_total_usd', 'milestones', 'relationships', 'age']

First 5 rows of the dataframe:
      status  funding_rounds  funding_total_usd  milestones  relationships  age
0  operating            3.00        39750000.00        5.00          17.00 9.20
1   acquired            0.00               0.00        0.00           6.00 6.00
2   acquired            0.00               0.00        4.00          12.00 6.00
3  operating            0.00               0.00        0.00            NaN 6.43
4  operating            0.00               0.00        0.00            NaN 6.43

Summary statistics:
       funding_rounds  funding_total_usd  milestones  relationships       age
count       196509.00          196509.00   196509.00      129637.00 196509.00
mean             0.27         1992018.89        0.56           2.85      7.37
std              0.78        16629982.32        0.70           9.10      6

In [21]:
df_cleaned['relationships'] = df_cleaned['relationships'].fillna(0)

In [22]:
# Check for missing values
missing_values = df_cleaned.isnull().sum()
missing_percentage = (missing_values / len(df_cleaned)) * 100

# Create a DataFrame to display missing values information
missing_info = pd.DataFrame({
    'Missing Values': missing_values,
    'Missing Percentage': missing_percentage
})

print("\nMissing values analysis:")
print(missing_info[missing_info['Missing Values'] > 0].sort_values('Missing Percentage', ascending=False))



Missing values analysis:
Empty DataFrame
Columns: [Missing Values, Missing Percentage]
Index: []


In [23]:
# Display information about the resulting dataframe
print(f"DataFrame now has {len(df_cleaned)} rows and {len(df_cleaned.columns)} columns")
print("\nColumns in the dataframe:")
print(df_cleaned.columns.tolist())

# Display first few rows of the resulting dataframe
print("\nFirst 5 rows of the dataframe:")
print(df_cleaned.head())

# Display basic statistics
print("\nSummary statistics:")
print(df_cleaned.describe())

DataFrame now has 196509 rows and 6 columns

Columns in the dataframe:
['status', 'funding_rounds', 'funding_total_usd', 'milestones', 'relationships', 'age']

First 5 rows of the dataframe:
      status  funding_rounds  funding_total_usd  milestones  relationships  age
0  operating            3.00        39750000.00        5.00          17.00 9.20
1   acquired            0.00               0.00        0.00           6.00 6.00
2   acquired            0.00               0.00        4.00          12.00 6.00
3  operating            0.00               0.00        0.00           0.00 6.43
4  operating            0.00               0.00        0.00           0.00 6.43

Summary statistics:
       funding_rounds  funding_total_usd  milestones  relationships       age
count       196509.00          196509.00   196509.00      196509.00 196509.00
mean             0.27         1992018.89        0.56           1.88      7.37
std              0.78        16629982.32        0.70           7.51      6

In [24]:
# Display final dataset information
print("\nFinal dataset information:")
print(df_cleaned.info())

# Display sample of cleaned data
print("\nSample of cleaned data:")
print(df_cleaned.head())

# Display success metrics distribution
print("\nSuccess metrics distribution:")
if 'success_binary' in df_cleaned.columns:
    print(f"Success binary distribution:\n{df_cleaned['success_binary'].value_counts(dropna=False)}")
if 'success_class' in df_cleaned.columns:
    print(f"Success class distribution:\n{df_cleaned['success_class'].value_counts(dropna=False)}")


Final dataset information:
<class 'pandas.core.frame.DataFrame'>
Index: 196509 entries, 0 to 196552
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   status             196509 non-null  object 
 1   funding_rounds     196509 non-null  float64
 2   funding_total_usd  196509 non-null  float64
 3   milestones         196509 non-null  float64
 4   relationships      196509 non-null  float64
 5   age                196509 non-null  float64
dtypes: float64(5), object(1)
memory usage: 10.5+ MB
None

Sample of cleaned data:
      status  funding_rounds  funding_total_usd  milestones  relationships  age
0  operating            3.00        39750000.00        5.00          17.00 9.20
1   acquired            0.00               0.00        0.00           6.00 6.00
2   acquired            0.00               0.00        4.00          12.00 6.00
3  operating            0.00               0.00        0.00           0.

In [25]:
# Save the final dataset
print("\nSaving final dataset...")
df_cleaned.to_csv('all_final_companies.csv', index=False)
print("Saved final dataset to 'all_final_companies.csv'")


Saving final dataset...
Saved final dataset to 'all_final_companies.csv'
