# Data Cleaning and Preprocessing
Clean the dataset by handling missing values, removing duplicates, correcting data types, and standardizing formats. This step ensures the data is ready for analysis and modeling.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime
import os

# Set display options for better readability
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
# Load the dataset with proper data types
print("Loading dataset...")
df = pd.read_csv('companies_with_success_labels.csv', 
                parse_dates=['founded_at', 'closed_at', 'first_investment_at', 
                            'last_investment_at', 'first_funding_at', 'last_funding_at',
                            'first_milestone_at', 'last_milestone_at', 'created_at', 'updated_at'],
                low_memory=False)

# Display basic information about the dataset
print(f"Dataset shape: {df.shape}")
print(f"Number of companies: {df.shape[0]}")
print(f"Number of features: {df.shape[1]}")

Loading dataset...
Dataset shape: (196553, 52)
Number of companies: 196553
Number of features: 52


In [3]:
# Check for duplicate rows
print("\nChecking for duplicates...")
duplicate_rows = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_rows}")

# Check for duplicate company identifiers
duplicate_ids = df['id'].duplicated().sum()
print(f"Number of duplicate IDs: {duplicate_ids}")

# Remove duplicate rows if any
if duplicate_rows > 0:
    df = df.drop_duplicates()
    print(f"Removed {duplicate_rows} duplicate rows. New shape: {df.shape}")


Checking for duplicates...
Number of duplicate rows: 0
Number of duplicate IDs: 0


In [4]:
# Check missing values
print("\nChecking missing values...")
missing_values = df.isnull().sum()
missing_percentage = (df.isnull().sum() / len(df)) * 100
missing_info = pd.DataFrame({
    'Missing Values': missing_values,
    'Missing Percentage': missing_percentage
})
print(missing_info.sort_values('Missing Percentage', ascending=False).head(20))


Checking missing values...
                     Missing Values  Missing Percentage
parent_id                    196553              100.00
ROI                          195827               99.63
first_investment_at          193970               98.69
last_investment_at           193970               98.69
invested_companies           193962               98.68
investment_rounds            193962               98.68
closed_at                    193933               98.67
short_description            189422               96.37
funding_total_usd            168679               85.82
last_funding_at              165046               83.97
first_funding_at             165046               83.97
funding_rounds               164846               83.87
state_code                   145650               74.10
twitter_username             115962               59.00
tag_list                     115101               58.56
lat                          112701               57.34
lng                 

In [5]:
# Identify columns to drop based on high missing percentage or redundancy
print("\nIdentifying columns to drop...")

# Columns with very high missing values (>95%) that aren't critical
high_missing_cols = missing_info[missing_info['Missing Percentage'] > 95].index.tolist()
print(f"Columns with >95% missing values: {high_missing_cols}")

# Redundant identifier columns
redundant_cols = ['Unnamed: 0.1']  # This appears to be just a row index
print(f"Redundant columns: {redundant_cols}")

# Columns with limited analytical value
limited_value_cols = ['logo_url', 'logo_width', 'logo_height', 'parent_id']
print(f"Columns with limited analytical value: {limited_value_cols}")

# Combine all columns to drop
columns_to_drop = high_missing_cols + redundant_cols + limited_value_cols
print(f"\nTotal columns to drop: {len(columns_to_drop)}")
print(columns_to_drop)

# Drop the identified columns
df_cleaned = df.drop(columns=columns_to_drop, errors='ignore')
print(f"\nShape after dropping columns: {df_cleaned.shape}")


Identifying columns to drop...
Columns with >95% missing values: ['parent_id', 'closed_at', 'short_description', 'first_investment_at', 'last_investment_at', 'investment_rounds', 'invested_companies', 'ROI']
Redundant columns: ['Unnamed: 0.1']
Columns with limited analytical value: ['logo_url', 'logo_width', 'logo_height', 'parent_id']

Total columns to drop: 13
['parent_id', 'closed_at', 'short_description', 'first_investment_at', 'last_investment_at', 'investment_rounds', 'invested_companies', 'ROI', 'Unnamed: 0.1', 'logo_url', 'logo_width', 'logo_height', 'parent_id']

Shape after dropping columns: (196553, 40)


In [6]:
# Handle missing values for important columns
print("\nHandling missing values for important columns...")

# For categorical columns, fill with 'unknown'
categorical_cols = ['category_code', 'country_code', 'state_code', 'city', 'region']
for col in categorical_cols:
    if col in df_cleaned.columns:
        missing_before = df_cleaned[col].isnull().sum()
        df_cleaned[col] = df_cleaned[col].fillna('unknown')
        print(f"Filled {missing_before} missing values in {col} with 'unknown'")

# For text columns, fill with empty string
text_cols = ['name', 'normalized_name', 'domain', 'homepage_url', 'twitter_username', 
             'short_description', 'description', 'overview', 'tag_list']
for col in text_cols:
    if col in df_cleaned.columns:
        missing_before = df_cleaned[col].isnull().sum()
        df_cleaned[col] = df_cleaned[col].fillna('')
        print(f"Filled {missing_before} missing values in {col} with empty string")

# For numeric columns related to funding, fill with 0
funding_cols = ['funding_rounds', 'funding_total_usd', 'investment_rounds', 'invested_companies', 'milestones', 'relationships']
for col in funding_cols:
    if col in df_cleaned.columns:
        missing_before = df_cleaned[col].isnull().sum()
        df_cleaned[col] = df_cleaned[col].fillna(0)
        print(f"Filled {missing_before} missing values in {col} with 0")

# For date columns, we'll leave as NaN as imputing dates could introduce bias
date_cols = ['founded_at', 'closed_at', 'first_investment_at', 'last_investment_at', 
             'first_funding_at', 'last_funding_at', 'first_milestone_at', 'last_milestone_at']
for col in date_cols:
    if col in df_cleaned.columns:
        print(f"Keeping {df_cleaned[col].isnull().sum()} missing values in {col} as NaN")


Handling missing values for important columns...
Filled 73367 missing values in category_code with 'unknown'
Filled 108563 missing values in country_code with 'unknown'
Filled 145650 missing values in state_code with 'unknown'
Filled 112663 missing values in city with 'unknown'
Filled 0 missing values in region with 'unknown'
Filled 23 missing values in name with empty string
Filled 26 missing values in normalized_name with empty string
Filled 70008 missing values in domain with empty string
Filled 70008 missing values in homepage_url with empty string
Filled 115962 missing values in twitter_username with empty string
Filled 104505 missing values in description with empty string
Filled 69582 missing values in overview with empty string
Filled 115101 missing values in tag_list with empty string
Filled 164846 missing values in funding_rounds with 0
Filled 168679 missing values in funding_total_usd with 0
Filled 104854 missing values in milestones with 0
Filled 66886 missing values in re

In [7]:
# Standardize text fields
print("\nStandardizing text fields...")

# Function to clean text fields
def clean_text(text):
    if pd.isna(text) or text == '':
        return ''
    # Convert to string in case it's not
    text = str(text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply to relevant text columns
text_cols_to_clean = ['name', 'normalized_name', 'short_description', 'description', 'overview']
for col in text_cols_to_clean:
    if col in df_cleaned.columns:
        df_cleaned[col] = df_cleaned[col].apply(clean_text)
        print(f"Cleaned text in {col}")

# Standardize URLs
url_cols = ['homepage_url', 'domain']
for col in url_cols:
    if col in df_cleaned.columns:
        # Convert to lowercase
        df_cleaned[col] = df_cleaned[col].str.lower()
        print(f"Standardized URLs in {col}")


Standardizing text fields...
Cleaned text in name
Cleaned text in normalized_name
Cleaned text in description
Cleaned text in overview
Standardized URLs in homepage_url
Standardized URLs in domain


In [8]:
# Standardize and validate geographic data
print("\nStandardizing geographic data...")

# Convert country codes to uppercase
if 'country_code' in df_cleaned.columns:
    df_cleaned['country_code'] = df_cleaned['country_code'].str.upper()
    print("Converted country codes to uppercase")

# Convert state codes to uppercase
if 'state_code' in df_cleaned.columns:
    df_cleaned['state_code'] = df_cleaned['state_code'].str.upper()
    print("Converted state codes to uppercase")

# Check for invalid latitude/longitude values
if 'lat' in df_cleaned.columns and 'lng' in df_cleaned.columns:
    # Valid latitude range: -90 to 90
    invalid_lat = ((df_cleaned['lat'] < -90) | (df_cleaned['lat'] > 90)) & df_cleaned['lat'].notnull()
    # Valid longitude range: -180 to 180
    invalid_lng = ((df_cleaned['lng'] < -180) | (df_cleaned['lng'] > 180)) & df_cleaned['lng'].notnull()

    print(f"Found {invalid_lat.sum()} invalid latitude values")
    print(f"Found {invalid_lng.sum()} invalid longitude values")

    # Set invalid coordinates to NaN
    df_cleaned.loc[invalid_lat, 'lat'] = np.nan
    df_cleaned.loc[invalid_lng, 'lng'] = np.nan
    print("Set invalid coordinates to NaN")


Standardizing geographic data...
Converted country codes to uppercase
Converted state codes to uppercase
Found 0 invalid latitude values
Found 0 invalid longitude values
Set invalid coordinates to NaN


In [9]:
# Correct data types
print("\nCorrecting data types...")

# Ensure numeric columns are properly typed
numeric_cols = ['entity_id', 'funding_rounds', 'funding_total_usd', 'investment_rounds', 
                'invested_companies', 'milestones', 'relationships', 'ROI', 
                'success_status', 'success_funding', 'success_roi', 'success_age', 
                'success_score', 'success_binary']

for col in numeric_cols:
    if col in df_cleaned.columns:
        # Check if column contains any non-numeric values
        non_numeric = pd.to_numeric(df_cleaned[col], errors='coerce').isnull() & df_cleaned[col].notnull()
        if non_numeric.sum() > 0:
            print(f"Found {non_numeric.sum()} non-numeric values in {col}")
            # Convert to numeric, coercing errors to NaN
            df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')
        else:
            # Convert to appropriate numeric type
            if col in ['entity_id', 'funding_rounds', 'investment_rounds', 
                      'invested_companies', 'milestones', 'relationships',
                      'success_status', 'success_funding', 'success_roi', 
                      'success_age', 'success_score', 'success_binary']:
                df_cleaned[col] = df_cleaned[col].astype('Int64')  # nullable integer type
            else:
                df_cleaned[col] = pd.to_numeric(df_cleaned[col])
        print(f"Converted {col} to numeric type")

# Ensure categorical columns are properly typed
categorical_cols = ['entity_type', 'status', 'category_code', 'country_code', 
                   'state_code', 'region', 'success_class']
for col in categorical_cols:
    if col in df_cleaned.columns:
        df_cleaned[col] = df_cleaned[col].astype('category')
        print(f"Converted {col} to category type")


Correcting data types...
Converted entity_id to numeric type
Converted funding_rounds to numeric type
Converted funding_total_usd to numeric type
Converted milestones to numeric type
Converted relationships to numeric type
Converted success_status to numeric type
Converted success_funding to numeric type
Converted success_roi to numeric type
Converted success_age to numeric type
Converted success_score to numeric type
Converted success_binary to numeric type
Converted entity_type to category type
Converted status to category type
Converted category_code to category type
Converted country_code to category type
Converted state_code to category type
Converted region to category type
Converted success_class to category type


In [10]:
# Create additional cleaning flags for tracking data quality
print("\nCreating data quality flags...")

# Flag for companies with complete core information
core_cols = ['name', 'status', 'category_code']
df_cleaned['has_complete_core_info'] = df_cleaned[core_cols].notnull().all(axis=1)
print(f"Companies with complete core information: {df_cleaned['has_complete_core_info'].sum()}")

# Flag for companies with location information
location_cols = ['country_code', 'city']
df_cleaned['has_location_info'] = df_cleaned[location_cols].notnull().all(axis=1)
print(f"Companies with location information: {df_cleaned['has_location_info'].sum()}")

# Flag for companies with funding information
funding_cols = ['funding_rounds', 'funding_total_usd']
df_cleaned['has_funding_info'] = df_cleaned[funding_cols].notnull().all(axis=1)
print(f"Companies with funding information: {df_cleaned['has_funding_info'].sum()}")

# Flag for companies with founding date
df_cleaned['has_founding_date'] = df_cleaned['founded_at'].notnull()
print(f"Companies with founding date: {df_cleaned['has_founding_date'].sum()}")


Creating data quality flags...
Companies with complete core information: 196553
Companies with location information: 196553
Companies with funding information: 196553
Companies with founding date: 91227


In [11]:
# Check for outliers in numeric columns
print("\nChecking for outliers in numeric columns...")

# Function to detect outliers using IQR method
def detect_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)][column]
    return outliers.count()

# Check outliers in key numeric columns
numeric_cols_for_outliers = ['funding_total_usd', 'funding_rounds', 'relationships', 'age_years']
for col in numeric_cols_for_outliers:
    if col in df_cleaned.columns and df_cleaned[col].notnull().sum() > 0:
        outlier_count = detect_outliers(df_cleaned, col)
        print(f"Found {outlier_count} outliers in {col}")

# For funding_total_usd, cap extreme values at 99.9 percentile
if 'funding_total_usd' in df_cleaned.columns:
    cap_value = df_cleaned['funding_total_usd'].quantile(0.999)
    extreme_values = df_cleaned['funding_total_usd'] > cap_value
    if extreme_values.sum() > 0:
        print(f"Capping {extreme_values.sum()} extreme funding values at {cap_value}")
        df_cleaned.loc[extreme_values, 'funding_total_usd'] = cap_value


Checking for outliers in numeric columns...
Found 27874 outliers in funding_total_usd
Found 31707 outliers in funding_rounds
Found 12366 outliers in relationships
Found 6009 outliers in age_years
Capping 197 extreme funding values at 223068000.00011572


In [12]:
# Create a summary of the cleaning process
print("\nCreating cleaning summary...")

# Original shape
original_shape = df.shape
# Cleaned shape
cleaned_shape = df_cleaned.shape

# Calculate changes
rows_removed = original_shape[0] - cleaned_shape[0]

# Get unique columns dropped (remove duplicates)
unique_columns_dropped = list(dict.fromkeys(columns_to_drop))

# Check which columns from columns_to_drop actually existed in the original dataframe
columns_actually_dropped = [col for col in unique_columns_dropped if col in df.columns]
print(f"Columns in drop list that existed in original dataframe: {len(columns_actually_dropped)} out of {len(unique_columns_dropped)}")

# Calculate the actual number of columns removed
columns_removed = original_shape[1] - cleaned_shape[1]
print(f"Difference in column count between original and cleaned dataframes: {columns_removed}")

# Check for new columns that might have been added during cleaning
original_cols_set = set(df.columns)
cleaned_cols_set = set(df_cleaned.columns)
new_columns = cleaned_cols_set - original_cols_set
if new_columns:
    print(f"New columns added during cleaning: {new_columns}")

# Create a summary dictionary
cleaning_summary = {
    'Original Rows': original_shape[0],
    'Original Columns': original_shape[1],
    'Cleaned Rows': cleaned_shape[0],
    'Cleaned Columns': cleaned_shape[1],
    'Rows Removed': rows_removed,
    'Columns Removed': columns_removed,
    'Duplicate Rows Found': duplicate_rows,
    'Columns Dropped': columns_actually_dropped,  # Use the columns that actually existed
    'Companies with Complete Core Info': df_cleaned['has_complete_core_info'].sum(),
    'Companies with Location Info': df_cleaned['has_location_info'].sum(),
    'Companies with Funding Info': df_cleaned['has_funding_info'].sum(),
    'Companies with Founding Date': df_cleaned['has_founding_date'].sum()
}

# Print summary
for key, value in cleaning_summary.items():
    print(f"{key}: {value}")

# Add a verification check to ensure our counts match
print("\nVerification:")
print(f"Reported columns removed: {columns_removed}")
print(f"Actual columns dropped: {len(columns_actually_dropped)}")

# Detailed analysis if there's a mismatch
if columns_removed != len(columns_actually_dropped):
    print("Warning: Mismatch between reported columns removed and actual columns dropped!")
    
    # Find columns that were in the original but not in the cleaned dataframe
    missing_cols = original_cols_set - cleaned_cols_set
    print(f"Columns in original but not in cleaned dataframe: {missing_cols}")
    
    # Check which of these missing columns were in our drop list
    expected_drops = [col for col in missing_cols if col in columns_to_drop]
    print(f"Columns correctly dropped (in drop list): {expected_drops}")
    
    # Find columns that were dropped but not in our drop list
    unexpected_drops = [col for col in missing_cols if col not in columns_to_drop]
    if unexpected_drops:
        print(f"Columns dropped but not in our drop list: {unexpected_drops}")
    
    # Find columns that were in columns_to_drop but not actually dropped
    not_dropped = [col for col in columns_actually_dropped if col in cleaned_cols_set]
    if not_dropped:
        print(f"Columns in drop list but not actually dropped: {not_dropped}")
    
    # Find columns that were in columns_to_drop but not in the original dataframe
    nonexistent_cols = [col for col in columns_to_drop if col not in df.columns]
    if nonexistent_cols:
        print(f"Columns in drop list that didn't exist in original dataframe: {nonexistent_cols}")
    
    # Calculate the expected number of columns after dropping
    expected_cols_after_drop = original_shape[1] - len(columns_actually_dropped) + len(new_columns)
    print(f"Expected columns after dropping: {expected_cols_after_drop}")
    print(f"Actual columns after dropping: {cleaned_shape[1]}")
    
    # Reconcile the difference
    if len(new_columns) > 0:
        print(f"Difference explained by {len(new_columns)} new columns added during cleaning: {new_columns}")
    
    if len(not_dropped) > 0:
        print(f"Difference explained by {len(not_dropped)} columns in drop list not actually dropped: {not_dropped}")


Creating cleaning summary...
Columns in drop list that existed in original dataframe: 12 out of 12
Difference in column count between original and cleaned dataframes: 8
New columns added during cleaning: {'has_founding_date', 'has_funding_info', 'has_location_info', 'has_complete_core_info'}
Original Rows: 196553
Original Columns: 52
Cleaned Rows: 196553
Cleaned Columns: 44
Rows Removed: 0
Columns Removed: 8
Duplicate Rows Found: 0
Columns Dropped: ['parent_id', 'closed_at', 'short_description', 'first_investment_at', 'last_investment_at', 'investment_rounds', 'invested_companies', 'ROI', 'Unnamed: 0.1', 'logo_url', 'logo_width', 'logo_height']
Companies with Complete Core Info: 196553
Companies with Location Info: 196553
Companies with Funding Info: 196553
Companies with Founding Date: 91227

Verification:
Reported columns removed: 8
Actual columns dropped: 12
Columns in original but not in cleaned dataframe: {'first_investment_at', 'closed_at', 'invested_companies', 'parent_id', 'RO

In [13]:
# Save the cleaned dataset
print("\nSaving cleaned dataset...")
df_cleaned.to_csv('cleaned_companies.csv', index=False)
print("Saved cleaned dataset to 'cleaned_companies.csv'")


Saving cleaned dataset...
Saved cleaned dataset to 'cleaned_companies.csv'


In [14]:
# Generate cleaning report markdown file
print("\nGenerating cleaning report...")

report_content = f"""# Data Cleaning Report for Crunchbase Companies Dataset

## Overview
This report documents the data cleaning process applied to the Crunchbase companies dataset for the startup success prediction project.

## Dataset Information
- **Original Dataset**: {original_shape[0]} rows, {original_shape[1]} columns
- **Cleaned Dataset**: {cleaned_shape[0]} rows, {cleaned_shape[1]} columns
- **Rows Removed**: {rows_removed}
- **Columns Removed**: {columns_removed}

## Cleaning Steps Performed

### 1. Duplicate Handling
- Checked for duplicate rows: {duplicate_rows} found and removed
- Checked for duplicate company IDs: {duplicate_ids} found

### 2. Column Removal
The following columns were removed:
- **High Missing Value Columns** (>95% missing): {', '.join(high_missing_cols)}
- **Redundant Columns**: {', '.join(redundant_cols)}
- **Limited Analytical Value Columns**: {', '.join(limited_value_cols)}

### 3. Missing Value Treatment
- **Categorical Columns**: Filled with 'unknown' ({', '.join(categorical_cols)})
- **Text Columns**: Filled with empty string ({', '.join(text_cols)})
- **Numeric Funding Columns**: Filled with 0 ({', '.join(funding_cols)})
- **Date Columns**: Kept as NaN to avoid introducing bias ({', '.join(date_cols)})

### 4. Text Standardization
- Cleaned and standardized text in: {', '.join(text_cols_to_clean)}
- Standardized URLs in: {', '.join(url_cols)}

### 5. Geographic Data Standardization
- Converted country codes to uppercase
- Converted state codes to uppercase
- Validated latitude/longitude values

### 6. Data Type Correction
- Converted numeric columns to appropriate numeric types
- Converted categorical columns to category type

### 7. Data Quality Flags
- **Companies with Complete Core Info**: {df_cleaned['has_complete_core_info'].sum()} ({df_cleaned['has_complete_core_info'].mean()*100:.2f}%)
- **Companies with Location Info**: {df_cleaned['has_location_info'].sum()} ({df_cleaned['has_location_info'].mean()*100:.2f}%)
- **Companies with Funding Info**: {df_cleaned['has_funding_info'].sum()} ({df_cleaned['has_funding_info'].mean()*100:.2f}%)
- **Companies with Founding Date**: {df_cleaned['has_founding_date'].sum()} ({df_cleaned['has_founding_date'].mean()*100:.2f}%)

### 8. Outlier Handling
"""

# Add outlier information to report
for col in numeric_cols_for_outliers:
    if col in df_cleaned.columns and df_cleaned[col].notnull().sum() > 0:
        outlier_count = detect_outliers(df_cleaned, col)
        report_content += f"- **{col}**: {outlier_count} outliers detected\n"

if 'funding_total_usd' in df_cleaned.columns:
    cap_value = df_cleaned['funding_total_usd'].quantile(0.999)
    extreme_values = df_cleaned['funding_total_usd'] > cap_value
    report_content += f"- Capped {extreme_values.sum()} extreme funding values at {cap_value:.2f}\n"

report_content += """
## Impact on Analysis
The cleaning process has:
1. Removed redundant and low-value columns to focus the analysis
2. Handled missing values appropriately based on column context
3. Standardized text and geographic data for consistency
4. Added data quality flags to help filter companies for analysis
5. Addressed outliers in key numeric columns

## Next Steps
The cleaned dataset is now ready for feature engineering, where we will:
1. Create derived features from existing data
2. Extract insights from text fields
3. Develop time-based metrics
4. Prepare the data for modeling
"""

# Write report to file
with open('cleaning_report.md', 'w') as f:
    f.write(report_content)

print("Generated cleaning report: 'cleaning_report.md'")


Generating cleaning report...
Generated cleaning report: 'cleaning_report.md'


In [15]:
# Display final dataset information
print("\nFinal dataset information:")
print(df_cleaned.info())

# Display sample of cleaned data
print("\nSample of cleaned data:")
print(df_cleaned.head())

# Display success metrics distribution
print("\nSuccess metrics distribution:")
if 'success_binary' in df_cleaned.columns:
    print(f"Success binary distribution:\n{df_cleaned['success_binary'].value_counts(dropna=False)}")
if 'success_class' in df_cleaned.columns:
    print(f"Success class distribution:\n{df_cleaned['success_class'].value_counts(dropna=False)}")


Final dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196553 entries, 0 to 196552
Data columns (total 44 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   id                      196553 non-null  object        
 1   entity_type             196553 non-null  category      
 2   entity_id               196553 non-null  Int64         
 3   name                    196553 non-null  object        
 4   normalized_name         196553 non-null  object        
 5   permalink               196553 non-null  object        
 6   category_code           196553 non-null  category      
 7   status                  196553 non-null  category      
 8   founded_at              91227 non-null   datetime64[ns]
 9   domain                  196553 non-null  object        
 10  homepage_url            196553 non-null  object        
 11  twitter_username        196553 non-null  object        
 12  de