In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle
import os
import matplotlib.pyplot as plt

# Set random seed for reproducibility
np.random.seed(42)

print("Starting Train-Test Split and Feature Scaling step...")

Starting Train-Test Split and Feature Scaling step...


In [None]:
# Load the preprocessed data
print("Loading preprocessed data...")
try:
    # First try to load with headers, as this is most common
    data = pd.read_csv('/Users/aminosaurier/Downloads/spring_2025_startup_survival/new-analysis/usa_companies_with_success_labels.csv')
    print("Data loaded with headers.")
    print(f"Data shape: {data.shape}")
    print("First few rows:")
    print(data.head())

except Exception as e:
    # Handle other potential errors
    print(f"Error loading data: {e}")
    raise

# Display basic information about the data
print("\nData information:")
print(f"Number of samples: {data.shape[0]}")
print(f"Number of features: {data.shape[1]}")
print(f"Column names: {list(data.columns)}")

Loading preprocessed data...
Data loaded with headers.
Data shape: (51637, 26)
First few rows:
  entity_type               name category_code     status  founded_at  \
0     Company           Wetpaint           web  operating  2005-10-17   
1     Company            Flektor   games_video   acquired         NaN   
2     Company              There   games_video   acquired         NaN   
3     Company  Thomas Publishing   advertising  operating         NaN   
4     Company    dimension5 labs   advertising  operating  2008-08-01   

  country_code state_code         city       region first_funding_at  ...  \
0          USA         WA      Seattle      Seattle       2005-10-01  ...   
1          USA         CA  Culver City  Los Angeles              NaN  ...   
2          USA         CA    San Mateo       SF Bay              NaN  ...   
3          USA         NY     New York     New York              NaN  ...   
4          USA         NM     Santa Fe     Santa Fe              NaN  ...   

  r

In [6]:
# Check for missing values
print("\nChecking for missing values...")
missing_values = data.isnull().sum()
print(missing_values[missing_values > 0])

# Display basic statistics
print("\nBasic statistics of the data:")
print(data.describe())


Checking for missing values...
name                      2
category_code          4313
founded_at            13012
state_code              975
city                   1453
first_funding_at      32004
last_funding_at       32004
funding_rounds        31905
funding_total_usd     33495
first_milestone_at    24967
last_milestone_at     24967
milestones            24967
relationships         13475
age_years             13012
dtype: int64

Basic statistics of the data:
       funding_rounds  funding_total_usd    milestones  relationships  \
count    19732.000000       1.814200e+04  26670.000000   38162.000000   
mean         1.833773       1.711986e+07      1.456655       4.948561   
std          1.347464       7.802633e+07      0.774161      15.674637   
min          1.000000       2.910000e+02      1.000000       1.000000   
25%          1.000000       7.500000e+05      1.000000       1.000000   
50%          1.000000       3.500000e+06      1.000000       3.000000   
75%          2.000000

In [7]:
# Separate features and target variable
# Assuming the last column is the target variable (success_binary)
print("\nSeparating features and target variable...")

# Check if the data has column names
if isinstance(data.columns[0], str) and not data.columns[0].isdigit():
    # Data has column names
    print("Using column names to identify target variable...")

    # Look for common target column names
    target_columns = ['success_binary', 'target', 'label', 'class', 'y']
    target_col = None

    for col in target_columns:
        if col in data.columns:
            target_col = col
            break

    if target_col:
        print(f"Found target column: {target_col}")
        X = data.drop(columns=[target_col])
        y = data[target_col]
    else:
        print("No standard target column found. Assuming last column is target.")
        X = data.iloc[:, :-1]
        y = data.iloc[:, -1]
else:
    # Data doesn't have column names
    print("No column names found. Assuming last column is target.")
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target value counts:\n{y.value_counts()}")


Separating features and target variable...
Using column names to identify target variable...
Found target column: success_binary
Features shape: (51637, 25)
Target shape: (51637,)
Target value counts:
success_binary
0    35404
1    16233
Name: count, dtype: int64


In [8]:
# Split the data into training and testing sets
print("\nSplitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")
print(f"Training target distribution:\n{y_train.value_counts(normalize=True)}")
print(f"Testing target distribution:\n{y_test.value_counts(normalize=True)}")


Splitting data into training and testing sets...
Training set shape: (41309, 25)
Testing set shape: (10328, 25)
Training target distribution:
success_binary
0    0.685638
1    0.314362
Name: proportion, dtype: float64
Testing target distribution:
success_binary
0    0.685612
1    0.314388
Name: proportion, dtype: float64


In [None]:
# # Scale the features
# print("\nScaling features...")
# scaler = StandardScaler()

# # Fit the scaler on the training data and transform both training and testing data
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# # Convert back to DataFrame to preserve column names if they exist
# if isinstance(X_train, pd.DataFrame) and X_train.columns.dtype == 'object':
#     X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
#     X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)
# else:
#     X_train_scaled = pd.DataFrame(X_train_scaled)
#     X_test_scaled = pd.DataFrame(X_test_scaled)

# # Check the scaled data
# print("First few rows of scaled training data:")
# print(X_train_scaled.head())


Scaling features...


ValueError: could not convert string to float: 'Company'

In [10]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Scale the features
print("\nScaling numerical features only...")

# Identify numerical columns
numeric_columns = X_train.select_dtypes(include=['int64', 'float64']).columns

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on training data's numeric columns
X_train_scaled_numeric = scaler.fit_transform(X_train[numeric_columns])

# Apply the same transformation to test data
X_test_scaled_numeric = scaler.transform(X_test[numeric_columns])

# Copy original DataFrames to preserve non-numeric columns
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Replace numeric columns with their scaled versions
X_train_scaled[numeric_columns] = X_train_scaled_numeric
X_test_scaled[numeric_columns] = X_test_scaled_numeric

# Check the scaled data
print("First few rows of scaled training data:")
print(X_train_scaled.head())



Scaling numerical features only...
First few rows of scaled training data:
      entity_type                 name category_code     status  founded_at  \
26227     Company          Culture Jam           web  operating  2008-07-01   
8193      Company              Namella   advertising  operating  2011-10-01   
48112     Company      Unique Contacts   advertising  operating  2008-11-10   
44574     Company    EVida Power, Inc.     cleantech  operating  2008-01-01   
8804      Company  Elusys Therapeutics       biotech  operating         NaN   

      country_code state_code           city          region first_funding_at  \
26227          USA         CA    Los Angeles     Los Angeles       2008-07-01   
8193           USA         CA     Costa Mesa     Los Angeles              NaN   
48112          USA         NV            NaN  Nevada - Other              NaN   
44574          USA         CA  San Francisco          SF Bay              NaN   
8804           USA         NJ     Pine Brook

In [15]:
# Save the scaled training and testing sets
print("\nSaving scaled data and target variables...")

# Save X_train_scaled
X_train_scaled.to_csv('/Users/aminosaurier/Downloads/spring_2025_startup_survival/new-analysis/X_train.csv', index=False)
print("Saved X_train.csv")

# Save X_test_scaled
X_test_scaled.to_csv('/Users/aminosaurier/Downloads/spring_2025_startup_survival/new-analysis/X_test.csv', index=False)
print("Saved X_test.csv")

# Save y_train
if isinstance(y_train, pd.Series):
    y_train.to_csv('/Users/aminosaurier/Downloads/spring_2025_startup_survival/new-analysis/y_train.csv', index=False, header=True)
else:
    pd.DataFrame(y_train).to_csv('/Users/aminosaurier/Downloads/spring_2025_startup_survival/new-analysis/y_train.csv', index=False, header=False)
print("Saved y_train.csv")

# Save y_test
if isinstance(y_test, pd.Series):
    y_test.to_csv('/Users/aminosaurier/Downloads/spring_2025_startup_survival/new-analysis/y_test.csv', index=False, header=True)
else:
    pd.DataFrame(y_test).to_csv('/Users/aminosaurier/Downloads/spring_2025_startup_survival/new-analysis/y_test.csv', index=False, header=False)
print("Saved y_test.csv")

# Save the scaler
with open('/Users/aminosaurier/Downloads/spring_2025_startup_survival/new-analysis/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
print("Saved scaler.pkl")

print("\nTrain-Test Split and Feature Scaling step completed successfully!")


Saving scaled data and target variables...
Saved X_train.csv
Saved X_test.csv
Saved y_train.csv
Saved y_test.csv
Saved scaler.pkl

Train-Test Split and Feature Scaling step completed successfully!


In [16]:
# Print summary of the outputs
print("\nSummary of outputs:")
print(f"X_train.csv: {X_train_scaled.shape[0]} samples, {X_train_scaled.shape[1]} features")
print(f"X_test.csv: {X_test_scaled.shape[0]} samples, {X_test_scaled.shape[1]} features")
print(f"y_train.csv: {len(y_train)} labels")
print(f"y_test.csv: {len(y_test)} labels")
print("scaler.pkl: StandardScaler object")
print("scaling_effect.png: Visualization of scaling effect on features")

# Check if the files were created successfully
for filename in ['X_train.csv', 'X_test.csv', 'y_train.csv', 'y_test.csv', 'scaler.pkl']:
    if os.path.exists(filename):
        print(f"{filename}: ✓ (File exists)")
    else:
        print(f"{filename}: ✗ (File does not exist)")


Summary of outputs:
X_train.csv: 41309 samples, 25 features
X_test.csv: 10328 samples, 25 features
y_train.csv: 41309 labels
y_test.csv: 10328 labels
scaler.pkl: StandardScaler object
scaling_effect.png: Visualization of scaling effect on features
X_train.csv: ✗ (File does not exist)
X_test.csv: ✗ (File does not exist)
y_train.csv: ✗ (File does not exist)
y_test.csv: ✗ (File does not exist)
scaler.pkl: ✗ (File does not exist)
