# Data Cleaning - USA companies
Filter the datatset on the 'country_code' column to retain only USA companies. Clean the data, handle missing values.

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime
import os

# Set display options for better readability
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
# filter the companies.csv file to retain only companies with country_code = 'USA'
companies_df = pd.read_csv('/Users/aminosaurier/Downloads/spring_2025_startup_survival_large_files/companies.csv', sep=',', header=0)
def filter_companies_by_country(companies_df, country_code='USA'):
    # Filter the DataFrame for the specified country code
    filtered_companies = companies_df[companies_df['country_code'] == country_code]
    return filtered_companies
# save the filtered DataFrame to a file usa_companies.csv
def save_filtered_companies_to_csv(filename='usa_companies.csv'):
    # Save the filtered DataFrame to a CSV file
    filetered_companies = filter_companies_by_country(companies_df, country_code='USA')
    filetered_companies.to_csv(filename, index=False)

save_filtered_companies_to_csv()

In [4]:
pd.read_csv('usa_companies.csv').head()

Unnamed: 0,id,Unnamed: 0.1,entity_type,entity_id,parent_id,name,normalized_name,permalink,category_code,status,founded_at,closed_at,domain,homepage_url,twitter_username,logo_url,logo_width,logo_height,short_description,description,overview,tag_list,country_code,state_code,city,region,first_investment_at,last_investment_at,investment_rounds,invested_companies,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,first_milestone_at,last_milestone_at,milestones,relationships,created_by,created_at,updated_at,lat,lng,ROI
0,c:1,0,Company,1,,Wetpaint,wetpaint,/company/wetpaint,web,operating,2005-10-17,,wetpaint-inc.com,http://wetpaint-inc.com,BachelrWetpaint,http://s3.amazonaws.com/crunchbase_prod_assets...,401.0,54.0,,Technology Platform Company,Wetpaint is a technology platform company that...,"wiki, seattle, elowitz, media-industry, media-...",USA,WA,Seattle,Seattle,,,,,2005-10-01,2008-05-19,3.0,39750000.0,2010-09-05,2013-09-18,5.0,17.0,initial-importer,2007-05-25 06:51:27,2013-04-13 03:29:00,47.61,-122.33,15.5
1,c:10,1,Company,10,,Flektor,flektor,/company/flektor,games_video,acquired,,,flektor.com,http://www.flektor.com,,http://s3.amazonaws.com/crunchbase_prod_assets...,186.0,85.0,,,Flektor is a rich-media mash-up platform that ...,"flektor, photo, video",USA,CA,Culver City,Los Angeles,,,,,,,,,,,,6.0,initial-importer,2007-05-31 21:11:51,2008-05-23 23:23:14,34.02,-118.4,
2,c:100,2,Company,100,,There,there,/company/there,games_video,acquired,,,there.com,http://www.there.com,,http://s3.amazonaws.com/crunchbase_prod_assets...,107.0,34.0,,,There.com is an online virtual world where any...,"virtualworld, there, teens",USA,CA,San Mateo,SF Bay,,,,,,,,,2003-02-01,2011-09-23,4.0,12.0,initial-importer,2007-08-06 23:52:45,2013-11-04 02:09:48,37.56,-122.33,
3,c:10005,8,Company,10005,,Thomas Publishing,thomas publishing,/company/thomas-publishing,advertising,operating,,,thomaspublishing.com,http://www.thomaspublishing.com,,http://s3.amazonaws.com/crunchbase_prod_assets...,276.0,47.0,,Online Media,"For more than a century, Thomas Publishing has...",,USA,NY,New York,New York,,,,,,,,,1999-06-01,1999-06-01,1.0,2.0,,2008-08-24 20:21:21,2009-11-19 17:21:00,40.71,-74.01,
4,c:10009,12,Company,10009,,dimension5 labs,dimension5 labs,/company/dimension5-labs,advertising,operating,2008-08-01,,d5labs.com,http://d5labs.com,,http://s3.amazonaws.com/crunchbase_prod_assets...,300.0,85.0,,,dimension5 labs is a full service digital adve...,"virtual-worlds, advertising-agency, complex-pr...",USA,NM,Santa Fe,Santa Fe,,,,,,,,,2008-08-22,2008-08-22,1.0,2.0,,2008-08-24 21:54:55,2008-12-21 17:21:53,35.69,-105.94,


In [5]:
companies_df = pd.read_csv('usa_companies.csv', sep=',', header=0)

# Check for missing values
missing_values = companies_df.isnull().sum()
missing_percentage = (missing_values / len(companies_df)) * 100

# Create a DataFrame to display missing values information
missing_info = pd.DataFrame({
    'Missing Values': missing_values,
    'Missing Percentage': missing_percentage
})

print("\nMissing values analysis:")
print(missing_info[missing_info['Missing Values'] > 0].sort_values('Missing Percentage', ascending=False))



Missing values analysis:
                     Missing Values  Missing Percentage
parent_id                     51637              100.00
ROI                           51057               98.88
first_investment_at           50886               98.55
last_investment_at            50886               98.55
invested_companies            50884               98.54
investment_rounds             50884               98.54
closed_at                     50258               97.33
short_description             46808               90.65
funding_total_usd             33495               64.87
last_funding_at               32004               61.98
first_funding_at              32004               61.98
funding_rounds                31905               61.79
last_milestone_at             24967               48.35
first_milestone_at            24967               48.35
milestones                    24967               48.35
tag_list                      22782               44.12
twitter_username      

In [8]:
# Identify columns to drop based on high missing percentage or redundancy
print("\nIdentifying columns to drop...")

# Columns with very high missing values (>95%) except 'closed_at' which we will keep for age calculation
high_missing_cols = missing_info[missing_info['Missing Percentage'] > 95].index.tolist()
missing_info = missing_info.drop('closed_at', errors='ignore')
print(f"Columns with >95% missing values: {high_missing_cols}")

# Redundant identifier columns
redundant_cols = ['Unnamed: 0.1']  # This appears to be just a row index
print(f"Redundant columns: {redundant_cols}")

# Columns with limited analytical value and categorical cols
limited_value_cols = ['entity_id', 'id', 'created_by', 'lat', 'domain', 'twitter_username', 'homepage_url', 'permalink', 'normalized_name', 'lng', 'tag_list', 'overview', 'description', 'short_description', 'logo_url', 'logo_width', 'logo_height', 'parent_id']
print(f"Columns with limited analytical value: {limited_value_cols}")

# Combine all columns to drop
columns_to_drop = high_missing_cols + redundant_cols + limited_value_cols
print(f"\nTotal columns to drop: {len(columns_to_drop)}")
print(columns_to_drop)

# Drop the identified columns
df = pd.read_csv('usa_companies.csv', sep=',', header=0)
df_cleaned = df.drop(columns=columns_to_drop, errors='ignore')
print(f"\nShape after dropping columns: {df_cleaned.shape}")


Identifying columns to drop...
Columns with >95% missing values: ['parent_id', 'first_investment_at', 'last_investment_at', 'investment_rounds', 'invested_companies', 'ROI']
Redundant columns: ['Unnamed: 0.1']
Columns with limited analytical value: ['entity_id', 'id', 'created_by', 'lat', 'domain', 'twitter_username', 'homepage_url', 'permalink', 'normalized_name', 'lng', 'tag_list', 'overview', 'description', 'short_description', 'logo_url', 'logo_width', 'logo_height', 'parent_id']

Total columns to drop: 25
['parent_id', 'first_investment_at', 'last_investment_at', 'investment_rounds', 'invested_companies', 'ROI', 'Unnamed: 0.1', 'entity_id', 'id', 'created_by', 'lat', 'domain', 'twitter_username', 'homepage_url', 'permalink', 'normalized_name', 'lng', 'tag_list', 'overview', 'description', 'short_description', 'logo_url', 'logo_width', 'logo_height', 'parent_id']

Shape after dropping columns: (51637, 20)


In [9]:
# Check for outliers in numeric columns
print("\nChecking for outliers in numeric columns...")

# Function to detect outliers using IQR method
def detect_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)][column]
    return outliers.count()

# Check outliers in key numeric columns
numeric_cols_for_outliers = ['funding_total_usd', 'funding_rounds', 'relationships', 'age_years']
for col in numeric_cols_for_outliers:
    if col in df_cleaned.columns and df_cleaned[col].notnull().sum() > 0:
        outlier_count = detect_outliers(df_cleaned, col)
        print(f"Found {outlier_count} outliers in {col}")

# For funding_total_usd, cap extreme values at 99.9 percentile
if 'funding_total_usd' in df_cleaned.columns:
    cap_value = df_cleaned['funding_total_usd'].quantile(0.999)
    extreme_values = df_cleaned['funding_total_usd'] > cap_value
    if extreme_values.sum() > 0:
        print(f"Capping {extreme_values.sum()} extreme funding values at {cap_value}")
        df_cleaned.loc[extreme_values, 'funding_total_usd'] = cap_value


Checking for outliers in numeric columns...
Found 2137 outliers in funding_total_usd
Found 2024 outliers in funding_rounds
Found 3273 outliers in relationships
Capping 19 extreme funding values at 833952250.0000048


In [10]:
# Calculate company age and add as a new column 'age'
# Define time_now as the end of 2014
time_now = pd.to_datetime('2014-12-31') # the last founding date in the dataset is 2014-09-30

# Convert date columns to datetime
df_cleaned['founded_at'] = pd.to_datetime(df_cleaned['founded_at'], errors='coerce')
df_cleaned['closed_at'] = pd.to_datetime(df_cleaned['closed_at'], errors='coerce')

# Create age column
df_cleaned['age'] = np.nan

# Calculate age for companies that have closed
closed_mask = ~df_cleaned['closed_at'].isna() & ~df_cleaned['founded_at'].isna()
df_cleaned.loc[closed_mask, 'age'] = (df_cleaned.loc[closed_mask, 'closed_at'] - 
                                     df_cleaned.loc[closed_mask, 'founded_at']).dt.days / 365.25

# Calculate age for companies still operating
operating_mask = df_cleaned['closed_at'].isna() & ~df_cleaned['founded_at'].isna()
df_cleaned.loc[operating_mask, 'age'] = (time_now - 
                                        df_cleaned.loc[operating_mask, 'founded_at']).dt.days / 365.25

# Calculate median age for companies with valid age
median_age = df_cleaned['age'].median()

# Handle case where median might be NaN
if pd.isna(median_age):
    median_age = 5.0  # Default reasonable value
    print(f"No valid ages found. Using default median age of {median_age:.2f} years")
else:
    print(f"Using median age of {median_age:.2f} years for companies with missing founding dates")

# Assign median age to companies with missing founded_at
missing_founded_mask = df_cleaned['founded_at'].isna()
df_cleaned.loc[missing_founded_mask, 'age'] = median_age

# Round age to 2 decimal places for readability
df_cleaned['age'] = df_cleaned['age'].round(2)

# Display age statistics
print("\nCompany age statistics:")
print(df_cleaned['age'].describe())

# Count of companies by age calculation method
print("\nAge calculation breakdown:")
print(f"Companies with closed date: {closed_mask.sum()}")
print(f"Active companies with founding date: {operating_mask.sum()}")
print(f"Companies using median age: {missing_founded_mask.sum()}")

Using median age of 7.00 years for companies with missing founding dates

Company age statistics:
count   51637.00
mean        9.39
std         9.43
min       -40.31
25%         5.00
50%         7.00
75%        10.00
max       114.00
Name: age, dtype: float64

Age calculation breakdown:
Companies with closed date: 1089
Active companies with founding date: 37536
Companies using median age: 13012


In [11]:
# Find all rows with negative age values
negative_age_rows = df_cleaned[df_cleaned['age'] < 0]

# Display the count
print(f"Found {len(negative_age_rows)} rows with negative age values")

# Display the negative age rows
print("\nRows with negative age values:")
print(negative_age_rows)

# Optional: Display just the key date columns to understand the issue
if not negative_age_rows.empty:
    print("\nKey date information for negative age rows:")
    for idx, row in negative_age_rows.iterrows():
        print(f"\nRow index: {idx}")
        print(f"Company name: {row.get('name', 'N/A')}")
        print(f"Age: {row['age']}")
        print(f"Founded at: {row['founded_at']}")
        print(f"Closed at: {row['closed_at']}")

Found 24 rows with negative age values

Rows with negative age values:
      entity_type                          name category_code    status founded_at  closed_at country_code state_code           city              region first_funding_at last_funding_at  funding_rounds  funding_total_usd first_milestone_at last_milestone_at  milestones  relationships           created_at           updated_at    age
1438      Company                        Zooomr           web    closed 2006-03-01 2006-01-01          USA         CA  San Francisco              SF Bay       2006-02-01      2006-02-01            1.00           50000.00                NaN               NaN         NaN           2.00  2007-07-11 02:02:21  2013-12-04 00:45:01  -0.16
2609      Company                  Bluebox Now!   games_video    closed 2011-08-08 2011-01-01          USA         WA        Seattle             Seattle       2011-11-01      2011-11-01            1.00                NaN         2011-06-12        2011-06-12    

In [12]:
# Removing companies with negative age values
# Find rows with negative age values
negative_age_rows = df_cleaned[df_cleaned['age'] < 0]
print(f"Found {len(negative_age_rows)} rows with negative age values")

# Store the original length of the dataset
original_length = len(df_cleaned)

# Remove rows with negative age values
df_cleaned = df_cleaned[df_cleaned['age'] >= 0]

# Confirm removal
print(f"Removed {original_length - len(df_cleaned)} rows with negative ages")
print(f"Dataset now has {len(df_cleaned)} rows")
print(f"New minimum age: {df_cleaned['age'].min()}")

# Display age statistics after removal
print("\nCompany age statistics after removing negative values:")
print(df_cleaned['age'].describe())

Found 24 rows with negative age values
Removed 24 rows with negative ages
Dataset now has 51613 rows
New minimum age: 0.0

Company age statistics after removing negative values:
count   51613.00
mean        9.40
std         9.42
min         0.00
25%         5.00
50%         7.00
75%        10.00
max       114.00
Name: age, dtype: float64


In [13]:
# Select only the specified columns
columns_to_keep = ['status', 'funding_rounds', 'funding_total_usd', 'milestones', 'relationships', 'age']

# Create a new dataframe with only these columns
df_cleaned = df_cleaned[columns_to_keep]

# Display information about the resulting dataframe
print(f"DataFrame now has {len(df_cleaned)} rows and {len(df_cleaned.columns)} columns")
print("\nColumns in the dataframe:")
print(df_cleaned.columns.tolist())

# Display first few rows of the resulting dataframe
print("\nFirst 5 rows of the dataframe:")
print(df_cleaned.head())

# Display basic statistics
print("\nSummary statistics:")
print(df_cleaned.describe())

DataFrame now has 51613 rows and 6 columns

Columns in the dataframe:
['status', 'funding_rounds', 'funding_total_usd', 'milestones', 'relationships', 'age']

First 5 rows of the dataframe:
      status  funding_rounds  funding_total_usd  milestones  relationships  age
0  operating            3.00        39750000.00        5.00          17.00 9.20
1   acquired             NaN                NaN         NaN           6.00 7.00
2   acquired             NaN                NaN        4.00          12.00 7.00
3  operating             NaN                NaN        1.00           2.00 7.00
4  operating             NaN                NaN        1.00           2.00 6.41

Summary statistics:
       funding_rounds  funding_total_usd  milestones  relationships      age
count        19715.00           18131.00    26656.00       38143.00 51613.00
mean             1.83        16212183.93        1.46           4.95     9.40
std              1.35        46910582.38        0.77          15.68     9.42
m

In [14]:
# Save the cleaned dataset
print("\nSaving cleaned dataset...")
df_cleaned.to_csv('usa_cleaned_companies.csv', index=False)
print("Saved cleaned dataset to 'usa_cleaned_companies.csv'")


Saving cleaned dataset...
Saved cleaned dataset to 'usa_cleaned_companies.csv'


Printing the missing percentages from the targeted columns

In [15]:
# Check for missing values
missing_values = df_cleaned.isnull().sum()
missing_percentage = (missing_values / len(df_cleaned)) * 100

# Create a DataFrame to display missing values information
missing_info = pd.DataFrame({
    'Missing Values': missing_values,
    'Missing Percentage': missing_percentage
})

print("\nMissing values analysis:")
print(missing_info[missing_info['Missing Values'] > 0].sort_values('Missing Percentage', ascending=False))



Missing values analysis:
                   Missing Values  Missing Percentage
funding_total_usd           33482               64.87
funding_rounds              31898               61.80
milestones                  24957               48.35
relationships               13470               26.10


In [None]:
# handling missing values
df_cleaned['funding_total_usd'] = df_cleaned['funding_total_usd'].fillna(0)
df_cleaned['funding_rounds'] = df_cleaned['funding_rounds'].fillna(0)
df_cleaned['milestones'] = df_cleaned['milestones'].fillna(0)
df_cleaned['relationships'] = df_cleaned['relationships'].fillna(0)

In [17]:
# Display information about the resulting dataframe
print(f"DataFrame now has {len(df_cleaned)} rows and {len(df_cleaned.columns)} columns")
print("\nColumns in the dataframe:")
print(df_cleaned.columns.tolist())

# Display first few rows of the resulting dataframe
print("\nFirst 5 rows of the dataframe:")
print(df_cleaned.head())

# Display basic statistics
print("\nSummary statistics:")
print(df_cleaned.describe())

DataFrame now has 51613 rows and 6 columns

Columns in the dataframe:
['status', 'funding_rounds', 'funding_total_usd', 'milestones', 'relationships', 'age']

First 5 rows of the dataframe:
      status  funding_rounds  funding_total_usd  milestones  relationships  age
0  operating            3.00        39750000.00        5.00          17.00 9.20
1   acquired            0.00               0.00        0.00           6.00 7.00
2   acquired            0.00               0.00        4.00          12.00 7.00
3  operating            0.00               0.00        1.00           2.00 7.00
4  operating            0.00               0.00        1.00           2.00 6.41

Summary statistics:
       funding_rounds  funding_total_usd  milestones  relationships      age
count        51613.00           51613.00    51613.00       51613.00 51613.00
mean             0.70         5695137.02        0.75           3.66     9.40
std              1.22        28860244.02        0.92          13.65     9.42
m

In [18]:
# Check for missing values
missing_values = df_cleaned.isnull().sum()
missing_percentage = (missing_values / len(df_cleaned)) * 100

# Create a DataFrame to display missing values information
missing_info = pd.DataFrame({
    'Missing Values': missing_values,
    'Missing Percentage': missing_percentage
})

print("\nMissing values analysis:")
print(missing_info[missing_info['Missing Values'] > 0].sort_values('Missing Percentage', ascending=False))



Missing values analysis:
Empty DataFrame
Columns: [Missing Values, Missing Percentage]
Index: []


In [19]:
# Display information about the resulting dataframe
print(f"DataFrame now has {len(df_cleaned)} rows and {len(df_cleaned.columns)} columns")
print("\nColumns in the dataframe:")
print(df_cleaned.columns.tolist())

# Display first few rows of the resulting dataframe
print("\nFirst 5 rows of the dataframe:")
print(df_cleaned.head())

# Display basic statistics
print("\nSummary statistics:")
print(df_cleaned.describe())

DataFrame now has 51613 rows and 6 columns

Columns in the dataframe:
['status', 'funding_rounds', 'funding_total_usd', 'milestones', 'relationships', 'age']

First 5 rows of the dataframe:
      status  funding_rounds  funding_total_usd  milestones  relationships  age
0  operating            3.00        39750000.00        5.00          17.00 9.20
1   acquired            0.00               0.00        0.00           6.00 7.00
2   acquired            0.00               0.00        4.00          12.00 7.00
3  operating            0.00               0.00        1.00           2.00 7.00
4  operating            0.00               0.00        1.00           2.00 6.41

Summary statistics:
       funding_rounds  funding_total_usd  milestones  relationships      age
count        51613.00           51613.00    51613.00       51613.00 51613.00
mean             0.70         5695137.02        0.75           3.66     9.40
std              1.22        28860244.02        0.92          13.65     9.42
m

In [20]:
# Display final dataset information
print("\nFinal dataset information:")
print(df_cleaned.info())

# Display sample of cleaned data
print("\nSample of cleaned data:")
print(df_cleaned.head())

# Display success metrics distribution
print("\nSuccess metrics distribution:")
if 'success_binary' in df_cleaned.columns:
    print(f"Success binary distribution:\n{df_cleaned['success_binary'].value_counts(dropna=False)}")
if 'success_class' in df_cleaned.columns:
    print(f"Success class distribution:\n{df_cleaned['success_class'].value_counts(dropna=False)}")


Final dataset information:
<class 'pandas.core.frame.DataFrame'>
Index: 51613 entries, 0 to 51636
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   status             51613 non-null  object 
 1   funding_rounds     51613 non-null  float64
 2   funding_total_usd  51613 non-null  float64
 3   milestones         51613 non-null  float64
 4   relationships      51613 non-null  float64
 5   age                51613 non-null  float64
dtypes: float64(5), object(1)
memory usage: 2.8+ MB
None

Sample of cleaned data:
      status  funding_rounds  funding_total_usd  milestones  relationships  age
0  operating            3.00        39750000.00        5.00          17.00 9.20
1   acquired            0.00               0.00        0.00           6.00 7.00
2   acquired            0.00               0.00        4.00          12.00 7.00
3  operating            0.00               0.00        1.00           2.00 7.00
4  

In [21]:
# Save the final dataset
print("\nSaving final dataset...")
df_cleaned.to_csv('usa_final_companies.csv', index=False)
print("Saved final dataset to 'usa_final_companies.csv'")


Saving final dataset...
Saved final dataset to 'usa_final_companies.csv'
