# Feature Engineering
Create new features that might be predictive of success, such as company age, time to first funding, funding per year, geographic features, and text-based features from descriptions. This step enhances the predictive power of our models.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
from collections import Counter

# Try to import nltk, and install it if not available
# NOTE(review): the fallback installs whatever nltk version pip resolves at run
# time (no version pin), so a fresh environment may not reproduce the captured
# output exactly — consider pinning or moving this to an environment file.
try:
    import nltk
    from nltk.corpus import stopwords
except ModuleNotFoundError:
    print("NLTK not found. Installing NLTK...")
    import sys
    import subprocess
    # Use the kernel's own interpreter so the package lands in this environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "nltk"])
    import nltk
    from nltk.corpus import stopwords
    print("NLTK installed successfully.")

import warnings
# Suppresses every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation/dtype warnings raised by
# later cells — confirm that is intended.
warnings.filterwarnings('ignore')

# Set display options for better readability
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)

print("Starting feature engineering process...")

NLTK not found. Installing NLTK...
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp312-cp312-macosx_10_13_x86_64.whl.metadata (40 kB)
Collecting tqdm (from nltk)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading regex-2024.11.6-cp312-cp312-macosx_10_13_x86_64.whl (288 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, click, nltk
Successfully installed click-8.1.8 nltk-3.9.1 regex-2024.11.6 tqdm-4.67.1
NLTK installed successfully.
Starting feature engineering process...


In [2]:
# Load the cleaned companies dataset
# The path is relative, so the CSV must sit in the kernel's working directory.
df = pd.read_csv('cleaned_companies.csv')

# Display basic information about the dataset
print(f"Dataset shape: {df.shape}")
print(f"Number of companies: {df.shape[0]}")
print(f"Number of features: {df.shape[1]}")

# Check for missing values
# Per-column null counts; only the ten columns with the most nulls are printed.
missing_values = df.isnull().sum()
print("\nTop 10 columns with most missing values:")
print(missing_values.sort_values(ascending=False).head(10))

Dataset shape: (196553, 44)
Number of companies: 196553
Number of features: 44

Top 10 columns with most missing values:
last_funding_at       165046
first_funding_at      165046
twitter_username      115962
tag_list              115101
lng                   112701
lat                   112701
age_years             105326
founded_at            105326
first_milestone_at    104854
last_milestone_at     104854
dtype: int64


In [3]:
# Parse every known date column that is actually present in the frame.
# Values that cannot be parsed become NaT (errors='coerce') instead of raising.
date_columns = [
    'founded_at',
    'first_funding_at',
    'last_funding_at',
    'first_milestone_at',
    'last_milestone_at',
    'created_at',
    'updated_at',
]

for col in (name for name in date_columns if name in df.columns):
    df[col] = pd.to_datetime(df[col], errors='coerce')

print("Date columns converted to datetime format")

Date columns converted to datetime format


In [4]:
# Create a reference date for calculating time-based features
# Using the latest date in the dataset as reference
# (the maximum of `updated_at`; NaT entries are skipped by `max()`).
reference_date = df['updated_at'].max()
print(f"Reference date for time calculations: {reference_date}")

Reference date for time calculations: 2013-12-12 14:28:31


In [5]:
# 1. Temporal Features
print("\nCreating temporal features...")

# Time to first funding (days) - from founding date to first funding.
# NOTE(review): this is negative whenever first_funding_at precedes founded_at
# (the summary statistics at the end of the notebook show a negative minimum);
# nothing here clips or flags those rows.
df['days_to_first_funding'] = (df['first_funding_at'] - df['founded_at']).dt.days

# Funding duration (days) - from first to last funding
df['funding_duration_days'] = (df['last_funding_at'] - df['first_funding_at']).dt.days

# Company age at first funding (years)
df['age_at_first_funding_years'] = df['days_to_first_funding'] / 365.25

# Time between creation in database and first funding (might indicate visibility)
df['days_from_creation_to_funding'] = (df['first_funding_at'] - df['created_at']).dt.days

# Milestone frequency (milestones per year of existence).
# A zero age is mapped to NaN so the division cannot produce +/-inf.
df['milestone_frequency'] = df['milestones'] / df['age_years'].replace(0, np.nan)

# Average time between consecutive milestones.
# n milestones span n - 1 gaps, so the first-to-last span is divided by
# (milestones - 1). Rows with fewer than two milestones have no gap and get NaN.
milestone_gaps = (df['milestones'] - 1).where(df['milestones'] > 1)
df['days_between_milestones'] = (
    (df['last_milestone_at'] - df['first_milestone_at']).dt.days / milestone_gaps
)

# Days from founding to the reference date.
# Missing founding dates (NaT) propagate to NaN through the subtraction, so no
# explicit null check is needed.
# NOTE(review): every company is measured up to reference_date; no closing date
# is used here.
df['lifespan_days'] = (reference_date - df['founded_at']).dt.days

print("Temporal features created")


Creating temporal features...
Temporal features created


In [6]:
# 2. Financial Features
print("\nCreating financial features...")

# Zero denominators are mapped to NaN so the ratios below yield NaN, not inf.
age_years_nonzero = df['age_years'].replace(0, np.nan)
funding_rounds_nonzero = df['funding_rounds'].replace(0, np.nan)

# Funding per year (funding velocity)
df['funding_per_year'] = df['funding_total_usd'] / age_years_nonzero

# Average funding per round
df['avg_funding_per_round'] = df['funding_total_usd'] / funding_rounds_nonzero

# Funding rounds per year
df['funding_rounds_per_year'] = df['funding_rounds'] / age_years_nonzero

# Funding efficiency (relationships per $1M funding)
df['relationships_per_million'] = df['relationships'] / (df['funding_total_usd'] / 1000000).replace(0, np.nan)

# Binary indicator for having received funding (NaN funding counts as 0)
df['has_received_funding'] = (df['funding_total_usd'] > 0).astype(int)

# Funding size categories.
# pd.cut builds right-closed intervals, so the edges below give:
#   (-inf, 0]    -> 'No Funding'
#   (0, 100K]    -> 'Seed (<100K)'
#   (100K, 1M]   -> 'Early (<1M)'
#   (1M, 10M]    -> 'Growth (<10M)'
#   (10M, inf)   -> 'Established (>10M)'
# The previous edges started at 0, which left zero-funding rows outside every
# bin (NaN) and shifted each label one bucket upward.
funding_bins = [-np.inf, 0, 100_000, 1_000_000, 10_000_000, np.inf]
funding_labels = ['No Funding', 'Seed (<100K)', 'Early (<1M)', 'Growth (<10M)', 'Established (>10M)']
df['funding_category'] = pd.cut(df['funding_total_usd'], bins=funding_bins, labels=funding_labels)

print("Financial features created")


Creating financial features...
Financial features created


In [7]:
# 3. Geographic Features
print("\nCreating geographic features...")

# Names treated as major tech hubs, matched exactly against `city` / `region`.
tech_hub_cities = [
    'San Francisco', 'New York', 'Boston', 'Seattle', 'Austin', 'Los Angeles',
    'Chicago', 'London', 'Berlin', 'Tel Aviv', 'Beijing', 'Shanghai', 'Tokyo',
    'Bangalore', 'Singapore',
]
tech_hub_regions = [
    'SF Bay', 'NYC', 'Boston', 'Seattle', 'Austin', 'LA',
    'Chicago', 'London', 'Berlin', 'Tel Aviv', 'Beijing', 'Shanghai', 'Tokyo',
    'Bangalore', 'Singapore',
]

# One 0/1 flag per location column.
for hub_flag, location_column, hub_names in (
    ('is_in_tech_hub_city', 'city', tech_hub_cities),
    ('is_in_tech_hub_region', 'region', tech_hub_regions),
):
    df[hub_flag] = df[location_column].isin(hub_names).astype(int)

# US-based companies
is_usa = df['country_code'].eq('USA')
df['is_us_based'] = is_usa.astype(int)

# US companies located in one of the listed tech states
tech_states = ['CA', 'NY', 'MA', 'WA', 'TX']
df['is_in_tech_state'] = (is_usa & df['state_code'].isin(tech_states)).astype(int)

# Create region groups based on continent/major economic regions.
# Each country code belongs to exactly one group. 'EGY' used to be listed under
# both 'Middle East & North Africa' and 'Africa'; only the first listing was
# ever reachable, so the dead duplicate has been dropped.
_REGION_GROUP_MEMBERS = {
    'North America': ['USA', 'CAN'],
    'Western Europe': ['GBR', 'DEU', 'FRA', 'ESP', 'ITA', 'NLD', 'CHE', 'SWE', 'DNK',
                       'NOR', 'FIN', 'BEL', 'AUT', 'IRL', 'PRT', 'GRC'],
    'Eastern Europe': ['RUS', 'POL', 'CZE', 'HUN', 'UKR', 'ROU', 'BGR', 'SVK', 'HRV',
                       'SRB', 'LTU', 'LVA', 'EST'],
    'East Asia': ['CHN', 'JPN', 'KOR', 'TWN', 'HKG', 'SGP'],
    'South Asia': ['IND', 'PAK', 'BGD', 'LKA', 'NPL'],
    'Oceania': ['AUS', 'NZL'],
    'Latin America': ['BRA', 'MEX', 'ARG', 'COL', 'CHL', 'PER', 'VEN'],
    'Middle East & North Africa': ['ISR', 'SAU', 'ARE', 'TUR', 'EGY', 'IRN', 'IRQ',
                                   'QAT', 'KWT', 'JOR', 'LBN'],
    'Africa': ['ZAF', 'NGA', 'KEN', 'GHA', 'TZA', 'UGA', 'ETH', 'MAR', 'TUN'],
}

# Flat country-code -> group lookup derived from the table above.
_COUNTRY_TO_REGION_GROUP = {
    code: group
    for group, codes in _REGION_GROUP_MEMBERS.items()
    for code in codes
}


def assign_region_group(country):
    """Map a country code to a coarse geographic/economic region label.

    Parameters
    ----------
    country : str or missing value
        Country code as stored in ``country_code`` (three-letter codes such as
        'USA' are the ones recognised).

    Returns
    -------
    str
        'Unknown' for missing values or the literal 'UNKNOWN', the matching
        region group for a recognised code, and 'Other' for anything else.
    """
    if pd.isna(country) or country == 'UNKNOWN':
        return 'Unknown'
    return _COUNTRY_TO_REGION_GROUP.get(country, 'Other')

# Label every row with its region group; missing codes are passed to the
# function as well (it returns 'Unknown' for them).
df['region_group'] = df['country_code'].map(assign_region_group)

print("Geographic features created")


Creating geographic features...
Geographic features created


In [8]:
# 4. Category and Industry Features
print("\nCreating category and industry features...")

# High-level industry groups, each a list of `category_code` values.
tech_categories = [
    'web', 'software', 'mobile', 'enterprise', 'analytics', 'security',
    'search', 'messaging', 'semiconductor', 'hardware', 'network_hosting',
]
media_categories = ['games_video', 'video', 'music', 'entertainment', 'photo_video', 'news', 'sports']
commerce_categories = ['ecommerce', 'advertising', 'fashion', 'travel', 'real_estate', 'local', 'hospitality']
finance_categories = ['finance', 'payments', 'bitcoin']
health_categories = ['health', 'medical', 'biotech']

# B2B vs B2C groupings
b2b_categories = [
    'enterprise', 'analytics', 'security', 'network_hosting', 'semiconductor',
    'advertising', 'payments', 'medical',
]
b2c_categories = [
    'games_video', 'music', 'entertainment', 'ecommerce', 'fashion', 'travel',
    'hospitality', 'sports',
]

# Flag column -> category codes that switch it on. Insertion order is the order
# in which the columns are added to the frame.
industry_groups = {
    'is_tech': tech_categories,
    'is_media': media_categories,
    'is_commerce': commerce_categories,
    'is_finance': finance_categories,
    'is_health': health_categories,
}
business_model_groups = {
    'is_b2b': b2b_categories,
    'is_b2c': b2c_categories,
}

for flag_name, member_codes in {**industry_groups, **business_model_groups}.items():
    df[flag_name] = df['category_code'].isin(member_codes).astype(int)

# Number of industry groups a company falls into.
# NOTE(review): the five industry lists share no codes and each row carries a
# single `category_code`, so this sum can only be 0 or 1.
category_count = df[list(industry_groups)].sum(axis=1)
df['category_diversity'] = category_count

print("Category and industry features created")


Creating category and industry features...
Category and industry features created


In [9]:
# 5. Text-based Features
print("\nCreating text-based features...")

# Download NLTK resources if not already available
# `nltk.data.find` raises LookupError when the corpus is not installed locally;
# only then is a download attempted.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Set of English stop words, read as a global by `count_keywords` below.
stop_words = set(stopwords.words('english'))

# Keyword counter used for the text-based features
def count_keywords(text, keyword_list):
    """Count the word tokens of ``text`` that appear in ``keyword_list``.

    The text is lower-cased and tokenised on word boundaries; tokens found in
    the module-level ``stop_words`` set are ignored. Every occurrence of a
    keyword counts, so repeated words are counted repeatedly.

    Parameters
    ----------
    text : str or missing value
        Free text to scan. Missing values yield 0.
    keyword_list : collection of str
        Lower-case keywords to look for.

    Returns
    -------
    int
        Number of matching tokens.
    """
    if pd.isna(text):
        return 0

    tokens = re.findall(r'\b\w+\b', text.lower())
    return sum(
        1
        for token in tokens
        if token not in stop_words and token in keyword_list
    )

# Keywords that might indicate innovation or technical sophistication
tech_keywords = ['ai', 'artificial', 'intelligence', 'machine', 'learning', 'blockchain',
                'algorithm', 'cloud', 'platform', 'saas', 'api', 'automation', 'data',
                'analytics', 'predictive', 'innovative', 'technology', 'solution']

# Keywords that might indicate market focus
market_keywords = ['market', 'customer', 'user', 'client', 'consumer', 'audience',
                  'segment', 'target', 'demographic', 'need', 'demand', 'solution']

# Keywords that might indicate business model clarity
business_keywords = ['revenue', 'monetize', 'business', 'model', 'profit', 'subscription',
                    'freemium', 'premium', 'enterprise', 'license', 'service', 'product']

# Apply keyword counting to the overview text (the description column is not
# scanned; it only contributes its length below).
print("Counting keywords in text fields...")
df['tech_keyword_count'] = df['overview'].apply(lambda x: count_keywords(x, tech_keywords))
df['market_keyword_count'] = df['overview'].apply(lambda x: count_keywords(x, market_keywords))
df['business_keyword_count'] = df['overview'].apply(lambda x: count_keywords(x, business_keywords))

# Character length of each text field (missing text counts as length 0)
df['description_length'] = df['description'].fillna('').apply(len)
df['overview_length'] = df['overview'].fillna('').apply(len)

# Number of tags per company.
# Tags are taken to be comma-separated, as in the original expression. Counting
# the non-blank fragments means a single tag (no comma at all) is counted as 1
# rather than 0, and empty fragments such as a trailing comma are not counted.
df['tag_count'] = df['tag_list'].fillna('').apply(
    lambda tags: sum(1 for tag in tags.split(',') if tag.strip())
)

print("Text-based features created")


Creating text-based features...
Counting keywords in text fields...


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aminosaurier/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Text-based features created


In [10]:
# 6. Network and Social Features
print("\nCreating network and social features...")

# Presence flags: 1 when the field is populated, 0 when it is missing.
df['has_twitter'] = df['twitter_username'].notnull().astype(int)
df['has_website'] = df['homepage_url'].notnull().astype(int)

# Relationships per year of existence; a zero age becomes NaN so the division
# cannot produce inf.
years_active = df['age_years'].replace(0, np.nan)
df['relationship_density'] = df['relationships'].div(years_active)

# Flag companies whose relationship count is strictly above the 75th percentile.
relationship_threshold = df['relationships'].quantile(0.75)
df['has_strong_network'] = df['relationships'].gt(relationship_threshold).astype(int)

print("Network and social features created")


Creating network and social features...
Network and social features created


In [11]:
# 7. Composite and Interaction Features
print("\nCreating composite and interaction features...")

total_funding = df['funding_total_usd']

# Funding relative to age; the +1 offset keeps the denominator away from zero.
df['funding_age_ratio'] = total_funding.div(df['age_years'].add(1))

# 1 only for tech-category companies located in a tech hub city.
df['tech_in_hub'] = df['is_tech'].mul(df['is_in_tech_hub_city'])

# Funding relative to round count; the +0.1 offset avoids division by zero.
df['funding_efficiency'] = total_funding.div(df['funding_rounds'].add(0.1))

# Milestones relative to age; the +0.1 offset avoids division by zero.
df['milestone_to_age_ratio'] = df['milestones'].div(df['age_years'].add(0.1))

# Online presence: 0, 1 or 2 depending on Twitter / website availability.
df['online_presence_score'] = df['has_twitter'].add(df['has_website'])

# Text quality: one point for each text signal that is greater than zero
# (both length columns and the three keyword counters), giving a 0-5 score.
text_signal_columns = [
    'description_length',
    'overview_length',
    'tech_keyword_count',
    'market_keyword_count',
    'business_keyword_count',
]
df['text_quality_score'] = df[text_signal_columns].gt(0).astype(int).sum(axis=1)

print("Composite and interaction features created")


Creating composite and interaction features...
Composite and interaction features created


In [12]:
# 8. Handle missing values in the newly created features
print("\nHandling missing values in new features...")

# Imputation runs in three passes: zero-fill, median-fill, mode-fill.
# NOTE(review): `numeric_features` and `categorical_features` are selected by
# dtype, so they cover every matching column in the frame (original columns
# included), not only the engineered ones named in the header above.
numeric_features = df.select_dtypes(include=['float64', 'int64']).columns

# Features where 0 makes sense as a default (absence of something)
zero_default_features = [
    'days_to_first_funding', 'funding_duration_days', 'age_at_first_funding_years',
    'days_from_creation_to_funding', 'milestone_frequency', 'days_between_milestones',
    'funding_per_year', 'avg_funding_per_round', 'funding_rounds_per_year',
    'relationships_per_million', 'tech_keyword_count', 'market_keyword_count',
    'business_keyword_count', 'tag_count'
]

# Pass 1: fill appropriate features with 0
for feature in zero_default_features:
    if feature in df.columns:
        df[feature] = df[feature].fillna(0)

# Pass 2: median for the remaining numeric columns. Geographic coordinates are
# left untouched, and a column with no observed values (NaN median) is skipped.
median_excluded = set(zero_default_features) | {'lat', 'lng'}
for feature in numeric_features:
    if feature in median_excluded:
        continue
    column_median = df[feature].median()
    if pd.notna(column_median):
        df[feature] = df[feature].fillna(column_median)

# Pass 3: most common value for object columns, except identifier-like ones.
# An entirely-null column has no mode; indexing `mode()[0]` on it would raise,
# so such columns are left as they are.
# NOTE(review): any other object column that still has nulls (for example free
# text or URL-like fields, if stored as object) also receives its modal value —
# confirm that is intended.
categorical_features = df.select_dtypes(include=['object']).columns
identifier_columns = {'id', 'name', 'normalized_name', 'permalink'}
for feature in categorical_features:
    if feature in identifier_columns:
        continue
    modal_values = df[feature].mode()
    if modal_values.empty:
        continue
    df[feature] = df[feature].fillna(modal_values.iloc[0])

print("Missing values handled")


Handling missing values in new features...
Missing values handled


In [13]:
# 9. Check for and handle extreme values/outliers
print("\nHandling extreme values...")

# Columns whose upper tail is capped
numeric_features_to_check = [
    'funding_total_usd',
    'funding_per_year',
    'avg_funding_per_round',
    'days_to_first_funding',
    'funding_duration_days',
    'relationships_per_million',
]

# Clip each present column at its own 99th percentile. Only the upper tail is
# capped; low or negative values pass through unchanged.
columns_to_cap = [name for name in numeric_features_to_check if name in df.columns]
for column_name in columns_to_cap:
    values = df[column_name]
    df[column_name] = values.clip(upper=values.quantile(0.99))

print("Extreme values handled")


Handling extreme values...
Extreme values handled


In [14]:
# 10. Save the engineered features dataset
print("\nSaving engineered features dataset...")

# Save to CSV
# Written to the kernel's working directory; the row index is not included.
df.to_csv('engineered_features.csv', index=False)
print("Dataset with engineered features saved to 'engineered_features.csv'")


Saving engineered features dataset...
Dataset with engineered features saved to 'engineered_features.csv'


In [15]:
# 11. Create a feature engineering report
print("\nCreating feature engineering report...")

# Original vs. engineered columns. Only the header row of the source CSV is
# needed for the original column names, so no data rows are parsed (nrows=0).
original_features = pd.read_csv('cleaned_companies.csv', nrows=0).columns.tolist()
all_features = df.columns.tolist()
new_features = [f for f in all_features if f not in original_features]
new_feature_set = set(new_features)  # constant-time membership checks below

# Per-section description tables: feature name -> text shown in the report.
# Dict order is the order in which the bullets are written.
temporal_descriptions = {
    'days_to_first_funding': "Number of days between company founding and first funding round. Indicates how quickly a company can attract investment.",
    'funding_duration_days': "Duration between first and last funding rounds in days. Indicates the length of the fundraising journey.",
    'age_at_first_funding_years': "Company age (in years) when it received its first funding. Indicates how long it took to become investment-ready.",
    'days_from_creation_to_funding': "Days between company's creation in the database and first funding. May indicate visibility or market awareness.",
    'milestone_frequency': "Number of milestones achieved per year of existence. Indicates pace of notable achievements.",
    'days_between_milestones': "Average days between consecutive milestones. Indicates consistency of progress.",
    # Wording fixed: the feature is measured up to the reference date only.
    'lifespan_days': "Days from the founding date to the dataset reference date. Indicates longevity.",
}
financial_descriptions = {
    'funding_per_year': "Total funding divided by company age. Indicates funding velocity or capital efficiency.",
    'avg_funding_per_round': "Average amount raised per funding round. Indicates typical round size.",
    'funding_rounds_per_year': "Number of funding rounds per year of existence. Indicates fundraising frequency.",
    'relationships_per_million': "Number of relationships per million dollars raised. Indicates network efficiency relative to funding.",
    'has_received_funding': "Binary indicator of whether the company has received any funding.",
    'funding_category': "Categorical grouping of funding amounts (No Funding, Seed, Early, Growth, Established).",
}
geographic_descriptions = {
    'is_in_tech_hub_city': "Binary indicator of whether the company is located in a major tech hub city.",
    'is_in_tech_hub_region': "Binary indicator of whether the company is located in a major tech hub region.",
    'is_us_based': "Binary indicator of whether the company is based in the United States.",
    'is_in_tech_state': "Binary indicator of whether the company is in a major US tech state (CA, NY, MA, WA, TX).",
    'region_group': "Categorical grouping of countries into major economic/geographic regions.",
}
category_descriptions = {
    'is_tech': "Binary indicator of whether the company is in a technology category.",
    'is_media': "Binary indicator of whether the company is in a media-related category.",
    'is_commerce': "Binary indicator of whether the company is in a commerce-related category.",
    'is_finance': "Binary indicator of whether the company is in a finance-related category.",
    'is_health': "Binary indicator of whether the company is in a health-related category.",
    'is_b2b': "Binary indicator of whether the company has a business-to-business model.",
    'is_b2c': "Binary indicator of whether the company has a business-to-consumer model.",
    'category_diversity': "Count of different category types the company spans. Indicates diversification.",
}
text_descriptions = {
    'tech_keyword_count': "Count of technology-related keywords in the company overview. Indicates technical focus.",
    'market_keyword_count': "Count of market-related keywords in the company overview. Indicates market focus.",
    'business_keyword_count': "Count of business model-related keywords in the company overview. Indicates business model clarity.",
    'description_length': "Length of the company description. May indicate thoroughness of information.",
    'overview_length': "Length of the company overview. May indicate thoroughness of information.",
    'tag_count': "Number of tags associated with the company. Indicates breadth of categorization.",
}
network_descriptions = {
    'has_twitter': "Binary indicator of whether the company has a Twitter account.",
    'has_website': "Binary indicator of whether the company has a website.",
    'relationship_density': "Number of relationships per year of existence. Indicates networking pace.",
    # Wording fixed: the threshold is the 75th percentile, not the average.
    'has_strong_network': "Binary indicator of whether the company's relationship count is above the 75th percentile.",
}
composite_descriptions = {
    'funding_age_ratio': "Ratio of total funding to company age. Indicates funding efficiency over time.",
    'tech_in_hub': "Interaction between being a tech company and being in a tech hub. Captures potential synergies.",
    'funding_efficiency': "Amount of funding raised per funding round. Indicates fundraising efficiency.",
    'milestone_to_age_ratio': "Ratio of milestones to company age. Indicates achievement pace.",
    'online_presence_score': "Composite score of online presence indicators (Twitter, website). Indicates digital footprint.",
    'text_quality_score': "Composite score based on text length and keyword presence. Indicates quality of company description.",
}

# Plain name lists, kept under their original variable names.
temporal_features = list(temporal_descriptions)
financial_features = list(financial_descriptions)
geographic_features = list(geographic_descriptions)
category_features = list(category_descriptions)
text_features = list(text_descriptions)
network_features = list(network_descriptions)
composite_features = list(composite_descriptions)

# (section heading, introductory sentence, description table) in report order.
feature_sections = [
    ("1. Temporal Features",
     "These features capture time-related aspects of a company's development:",
     temporal_descriptions),
    ("2. Financial Features",
     "These features capture financial aspects and funding patterns:",
     financial_descriptions),
    ("3. Geographic Features",
     "These features capture location-based aspects that might influence success:",
     geographic_descriptions),
    ("4. Category and Industry Features",
     "These features capture industry and business model characteristics:",
     category_descriptions),
    ("5. Text-based Features",
     "These features are derived from textual data and may indicate quality of company description or focus:",
     text_descriptions),
    ("6. Network and Social Features",
     "These features capture the company's online presence and network:",
     network_descriptions),
    ("7. Composite and Interaction Features",
     "These features combine multiple aspects to capture complex patterns:",
     composite_descriptions),
]

# Create a report
report = "# Feature Engineering Report\n\n"
report += "## Overview\n\n"
report += f"* Original dataset: {len(original_features)} features\n"
report += f"* Engineered dataset: {len(all_features)} features\n"
report += f"* New features created: {len(new_features)}\n\n"

report += "## Categories of Engineered Features\n\n"

# One markdown block per section: heading, intro, then one bullet for every
# listed feature that was actually added to the frame.
section_blocks = []
for section_title, section_intro, descriptions in feature_sections:
    block = f"### {section_title}\n\n{section_intro}\n\n"
    for feature, description in descriptions.items():
        if feature in new_feature_set:
            block += f"* **{feature}**: {description}\n"
    section_blocks.append(block)

# Sections are separated by a single blank line.
report += "\n".join(section_blocks)

report += "\n## Rationale and Expected Impact\n\n"
report += "The engineered features aim to capture various dimensions that might predict startup success:\n\n"
report += "1. **Speed and Efficiency**: Features like days_to_first_funding, funding_per_year, and milestone_frequency capture how quickly a company can achieve important milestones.\n\n"
report += "2. **Location Advantage**: Geographic features identify companies in established tech ecosystems that might benefit from network effects, talent pools, and investor proximity.\n\n"
report += "3. **Industry Trends**: Category-based features help identify which sectors have higher success rates.\n\n"
report += "4. **Network Effects**: Relationship-based features capture the company's connections, which can be crucial for partnerships, hiring, and fundraising.\n\n"
report += "5. **Communication Quality**: Text-based features may indicate how well a company can articulate its value proposition.\n\n"
report += "6. **Funding Patterns**: Financial features capture not just the amount of funding but patterns that might indicate investor confidence.\n\n"
report += "7. **Composite Indicators**: Interaction features capture complex relationships between multiple factors that might collectively influence success.\n\n"

report += "These engineered features significantly expand the predictive power of the dataset by transforming raw data into meaningful indicators that align with business intuition about startup success factors."

# Write the report to a file (explicit encoding so the result does not depend
# on the platform default).
with open('feature_engineering_report.md', 'w', encoding='utf-8') as f:
    f.write(report)

print("Feature engineering report created and saved to 'feature_engineering_report.md'")


Creating feature engineering report...
Feature engineering report created and saved to 'feature_engineering_report.md'


In [16]:
# 12. Display summary statistics for key engineered features
print("\nSummary statistics for key engineered features:")

# A small cross-section of the engineered columns used for the final check.
key_features = [
    'funding_per_year',
    'days_to_first_funding',
    'milestone_frequency',
    'funding_efficiency',
    'relationship_density',
    'text_quality_score',
]

# count / mean / std / min / quartiles / max per selected column
summary_stats = df.loc[:, key_features].describe()
print(summary_stats)

print("\nFeature engineering process completed successfully!")


Summary statistics for key engineered features:
       funding_per_year  days_to_first_funding  milestone_frequency  funding_efficiency  relationship_density  text_quality_score
count      1.965530e+05          196553.000000        196553.000000        1.965530e+05         196553.000000       196553.000000
mean       5.426132e+04             129.329087             0.020917        9.000434e+05              0.117813            1.764013
std        2.690541e+05             554.831897             0.045755        6.357153e+06              0.250928            1.564535
min        0.000000e+00          -17536.000000             0.000000        0.000000e+00              0.000000            0.000000
25%        0.000000e+00               0.000000             0.000000        0.000000e+00              0.080825            0.000000
50%        0.000000e+00               0.000000             0.000000        0.000000e+00              0.080825            2.000000
75%        0.000000e+00               0.0