In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle
import os
import matplotlib.pyplot as plt

# Seed NumPy's global random number generator so np.random-based results
# repeat from run to run.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

print("Starting Train-Test Split and Feature Scaling step...")

Starting Train-Test Split and Feature Scaling step...


In [2]:
# Read the labelled company dataset from the current working directory.
print("Loading preprocessed data...")
try:
    # pandas' default header handling is used: the first row becomes the column names.
    data = pd.read_csv('usa_final_companies_with_success_labels.csv')
    print("Data loaded with headers.")
    print(f"Data shape: {data.shape}")
    print("First few rows:")
    print(data.head())

except Exception as load_error:
    # Surface the failure in the cell output, then re-raise so execution stops here.
    print(f"Error loading data: {load_error}")
    raise

# Summarise the size and column layout of the loaded frame.
# Note: the "features" count is simply the number of columns, so it includes
# the target column as well.
n_rows, n_cols = data.shape
print("\nData information:")
print(f"Number of samples: {n_rows}")
print(f"Number of features: {n_cols}")
print(f"Column names: {list(data.columns)}")

Loading preprocessed data...
Data loaded with headers.
Data shape: (51613, 6)
First few rows:
   funding_rounds  funding_total_usd  milestones  relationships   age  \
0             3.0         39750000.0         5.0           17.0  9.20   
1             0.0                0.0         0.0            6.0  7.00   
2             0.0                0.0         4.0           12.0  7.00   
3             0.0                0.0         1.0            2.0  7.00   
4             0.0                0.0         1.0            2.0  6.41   

   success_binary  
0               0  
1               1  
2               1  
3               0  
4               0  

Data information:
Number of samples: 51613
Number of features: 6
Column names: ['funding_rounds', 'funding_total_usd', 'milestones', 'relationships', 'age', 'success_binary']


In [3]:
# Count nulls per column and show only the columns that have any
# (an empty Series in the output means nothing is missing).
print("\nChecking for missing values...")
missing_values = data.isna().sum()
has_missing = missing_values > 0
print(missing_values[has_missing])

# Summary statistics as produced by DataFrame.describe()
print("\nBasic statistics of the data:")
summary_stats = data.describe()
print(summary_stats)


Checking for missing values...
Series([], dtype: int64)

Basic statistics of the data:
       funding_rounds  funding_total_usd   milestones  relationships  \
count    51613.000000       5.161300e+04  51613.00000   51613.000000   
mean         0.700657       5.695137e+06      0.75233       3.918548   
std          1.219888       2.886024e+07      0.91625      13.589127   
min          0.000000       0.000000e+00      0.00000       1.000000   
25%          0.000000       0.000000e+00      0.00000       1.000000   
50%          0.000000       0.000000e+00      1.00000       2.000000   
75%          1.000000       1.000000e+06      1.00000       4.000000   
max         14.000000       8.339523e+08      9.00000    1189.000000   

                age  success_binary  
count  51613.000000    51613.000000  
mean       9.399257        0.324201  
std        9.423326        0.468080  
min        0.000000        0.000000  
25%        5.000000        0.000000  
50%        7.000000        0.000000

In [4]:
# Split the frame into a feature matrix X and a target vector y.
# The target is looked up by name first; the last column is only used as a
# fallback when no known target name is present or the columns are unnamed.
print("\nSeparating features and target variable...")

# Treat the frame as "named" when its first column label is a non-numeric string.
first_label = data.columns[0]
has_named_columns = isinstance(first_label, str) and not first_label.isdigit()

if not has_named_columns:
    # Positional fallback for frames without usable column names
    print("No column names found. Assuming last column is target.")
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
else:
    print("Using column names to identify target variable...")

    # Candidate target names, checked in priority order; the first match wins.
    target_columns = ['success_binary', 'target', 'label', 'class', 'y']
    target_col = next(
        (name for name in target_columns if name in data.columns),
        None,
    )

    if target_col is None:
        print("No standard target column found. Assuming last column is target.")
        X = data.iloc[:, :-1]
        y = data.iloc[:, -1]
    else:
        print(f"Found target column: {target_col}")
        X = data.drop(columns=[target_col])
        y = data[target_col]

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target value counts:\n{y.value_counts()}")


Separating features and target variable...
Using column names to identify target variable...
Found target column: success_binary
Features shape: (51613, 5)
Target shape: (51613,)
Target value counts:
success_binary
0    34880
1    16733
Name: count, dtype: int64


In [5]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed random_state makes the
# split repeatable.
print("\nSplitting data into training and testing sets...")
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

# Class proportions per split, to confirm the stratification worked
train_distribution = y_train.value_counts(normalize=True)
test_distribution = y_test.value_counts(normalize=True)
print(f"Training target distribution:\n{train_distribution}")
print(f"Testing target distribution:\n{test_distribution}")


Splitting data into training and testing sets...
Training set shape: (41290, 5)
Testing set shape: (10323, 5)
Training target distribution:
success_binary
0    0.675805
1    0.324195
Name: proportion, dtype: float64
Testing target distribution:
success_binary
0    0.675773
1    0.324227
Name: proportion, dtype: float64


In [6]:
# Standardise the features with scikit-learn's StandardScaler.
print("\nScaling features...")
scaler = StandardScaler()

# Fit on the training split only, then reuse the fitted statistics for the test
# split so nothing about the test rows influences the scaling parameters.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaler output back into DataFrames.
# Column labels are carried over whenever the inputs are DataFrames. Gating this
# on `columns.dtype == 'object'` silently dropped the names when pandas stores
# string labels with its dedicated string dtype rather than object.
# NOTE: the new frames get a fresh 0..n-1 index, so they line up with
# y_train / y_test by row position, not by index label.
if isinstance(X_train, pd.DataFrame):
    X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
    X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)
else:
    X_train_scaled = pd.DataFrame(X_train_scaled)
    X_test_scaled = pd.DataFrame(X_test_scaled)

# Check the scaled data
print("First few rows of scaled training data:")
print(X_train_scaled.head())


Scaling features...
First few rows of scaled training data:
   funding_rounds  funding_total_usd  milestones  relationships       age
0        1.061713           0.203911    2.440358       0.002626  0.694242
1       -0.573306          -0.199304   -0.821018      -0.201348 -0.255226
2       -0.573306          -0.199304   -0.821018      -0.065365  0.061264
3        1.061713           4.647531    0.266107       1.294464  2.698674
4        1.061713          -0.160714   -0.821018      -0.065365 -0.571715


In [7]:
# Persist the scaled feature splits, the target splits, and the fitted scaler.
print("\nSaving scaled data and target variables...")

# Single place to configure where artifacts are written. It defaults to the
# current working directory (the same place the input CSV is read from) instead
# of a user-specific absolute path, so the notebook runs on any machine.
OUTPUT_DIR = "."
os.makedirs(OUTPUT_DIR, exist_ok=True)


def _save_target(target, path):
    """Write a target vector to ``path`` as CSV without the index.

    A pandas Series is written with its name as the header row; any other
    array-like is wrapped in a DataFrame and written without a header.
    """
    if isinstance(target, pd.Series):
        target.to_csv(path, index=False, header=True)
    else:
        pd.DataFrame(target).to_csv(path, index=False, header=False)


# Save X_train_scaled
X_train_scaled.to_csv(os.path.join(OUTPUT_DIR, 'X_train.csv'), index=False)
print("Saved X_train.csv")

# Save X_test_scaled
X_test_scaled.to_csv(os.path.join(OUTPUT_DIR, 'X_test.csv'), index=False)
print("Saved X_test.csv")

# Save y_train
_save_target(y_train, os.path.join(OUTPUT_DIR, 'y_train.csv'))
print("Saved y_train.csv")

# Save y_test
_save_target(y_test, os.path.join(OUTPUT_DIR, 'y_test.csv'))
print("Saved y_test.csv")

# Save the fitted scaler so the same transformation can be re-applied later
with open(os.path.join(OUTPUT_DIR, 'scaler.pkl'), 'wb') as f:
    pickle.dump(scaler, f)
print("Saved scaler.pkl")

print("\nTrain-Test Split and Feature Scaling step completed successfully!")


Saving scaled data and target variables...
Saved X_train.csv
Saved X_test.csv
Saved y_train.csv
Saved y_test.csv
Saved scaler.pkl

Train-Test Split and Feature Scaling step completed successfully!


In [8]:
# Print a summary of the artifacts produced by this notebook.
# Only files that are actually written are listed; the earlier
# "scaling_effect.png" entry was dropped because no such file is created.
print("\nSummary of outputs:")
print(f"X_train.csv: {X_train_scaled.shape[0]} samples, {X_train_scaled.shape[1]} features")
print(f"X_test.csv: {X_test_scaled.shape[0]} samples, {X_test_scaled.shape[1]} features")
print(f"y_train.csv: {len(y_train)} labels")
print(f"y_test.csv: {len(y_test)} labels")
print("scaler.pkl: StandardScaler object")

# Confirm each artifact exists. The names are relative, so they are resolved
# against the current working directory.
for filename in ['X_train.csv', 'X_test.csv', 'y_train.csv', 'y_test.csv', 'scaler.pkl']:
    status = "✓ (File exists)" if os.path.exists(filename) else "✗ (File does not exist)"
    print(f"{filename}: {status}")


Summary of outputs:
X_train.csv: 41290 samples, 5 features
X_test.csv: 10323 samples, 5 features
y_train.csv: 41290 labels
y_test.csv: 10323 labels
scaler.pkl: StandardScaler object
scaling_effect.png: Visualization of scaling effect on features
X_train.csv: ✓ (File exists)
X_test.csv: ✓ (File exists)
y_train.csv: ✓ (File exists)
y_test.csv: ✓ (File exists)
scaler.pkl: ✓ (File exists)
