In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
import pickle
import os
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Seed NumPy's global random generator so NumPy-driven randomness is repeatable
np.random.seed(seed=42)

print("Starting Stratified K-Fold Cross Validation and Feature Scaling...")

Starting Stratified K-Fold Cross Validation and Feature Scaling...


In [None]:
# Read the preprocessed dataset (path is relative to the notebook's working directory)
print("Loading preprocessed data...")
csv_path = 'usa_final_companies_with_success_labels.csv'
try:
    # pandas uses the first row of the file as column headers by default
    data = pd.read_csv(csv_path)
    print("Data loaded with headers.")
    print(f"Data shape: {data.shape}")
    print("First few rows:", data.head(), sep="\n")

except Exception as exc:
    # Report the failure, then re-raise so execution stops at this cell
    print(f"Error loading data: {exc}")
    raise

# Summarise the loaded table. `data` still contains the target column at this
# point (it is split off further down), so its second dimension is a column
# count rather than a feature count.
print("\nData information:")
print(f"Number of samples: {data.shape[0]}")
print(f"Number of columns (features + target): {data.shape[1]}")
print(f"Column names: {list(data.columns)}")

# Count missing values per column and report only the columns that have any
print("\nChecking for missing values...")
missing_values = data.isnull().sum()
columns_with_missing = missing_values[missing_values > 0]
if columns_with_missing.empty:
    # An empty Series prints as "Series([], dtype: int64)", which is easy to misread
    print("No missing values found.")
else:
    print(columns_with_missing)

# Split the table into the feature matrix X and the label vector y
print("\nSeparating features and target variable...")

# A first header that is a non-numeric string is taken to mean real column names
first_header = data.columns[0]
if isinstance(first_header, str) and not first_header.isdigit():
    print("Using column names to identify target variable...")

    # Candidate label columns in priority order; the first one present wins
    target_columns = ['success_binary', 'target', 'label', 'class', 'y']
    target_col = next((name for name in target_columns if name in data.columns), None)

    if target_col:
        print(f"Found target column: {target_col}")
        y = data[target_col]
        X = data.drop(columns=[target_col])
    else:
        # None of the known names matched: fall back to position
        print("No standard target column found. Assuming last column is target.")
        X, y = data.iloc[:, :-1], data.iloc[:, -1]
else:
    # Headers are not usable names: treat the last column as the label
    print("No column names found. Assuming last column is target.")
    X, y = data.iloc[:, :-1], data.iloc[:, -1]

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target value counts:\n{y.value_counts()}")

Loading preprocessed data...
Data loaded with headers.
Data shape: (51613, 5)
First few rows:
   funding_rounds  funding_total_usd  milestones  relationships  \
0             3.0         39750000.0         5.0           17.0   
1             0.0                0.0         0.0            6.0   
2             0.0                0.0         4.0           12.0   
3             0.0                0.0         1.0            2.0   
4             0.0                0.0         1.0            2.0   

   success_binary  
0               0  
1               1  
2               1  
3               0  
4               0  

Data information:
Number of samples: 51613
Number of features: 5
Column names: ['funding_rounds', 'funding_total_usd', 'milestones', 'relationships', 'success_binary']

Checking for missing values...
Series([], dtype: int64)

Separating features and target variable...
Using column names to identify target variable...
Found target column: success_binary
Features shape: (51613, 4)


In [5]:
# Build stratified K-fold splits of (X, y) and standardise the features of each fold
print("\nImplementing Stratified K-Fold Cross Validation...")

# Number of folds; shuffling with a fixed random_state keeps the splits reproducible
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Per-fold metric containers. Nothing in this cell fills them; they are kept
# because later model-evaluation code may append to them (TODO confirm).
fold_accuracies = []
fold_precisions = []
fold_recalls = []
fold_f1s = []

# Scaled features, labels and fitted scaler of every fold, in fold order
X_train_all_folds = []
X_test_all_folds = []
y_train_all_folds = []
y_test_all_folds = []
scalers = []

print(f"\nPerforming {n_folds}-fold cross-validation...")

for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    print(f"\nProcessing fold {fold+1}/{n_folds}...")

    # Positional selection of this fold's rows
    X_train_fold, X_test_fold = X.iloc[train_idx], X.iloc[test_idx]
    y_train_fold, y_test_fold = y.iloc[train_idx], y.iloc[test_idx]

    # Fit the scaler on the training split only and reuse it for the test split,
    # so test-set statistics never influence the scaling parameters
    print(f"Scaling features for fold {fold+1}...")
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_fold)
    X_test_scaled = scaler.transform(X_test_fold)

    # Wrap the arrays back into DataFrames. Passing the original index keeps
    # every scaled row label-aligned with y_train_fold / y_test_fold; without it
    # the features would get a fresh 0..n-1 index that no longer matches the
    # labels in any index-aligned operation (concat, boolean masks, joins).
    X_train_scaled = pd.DataFrame(
        X_train_scaled, columns=X_train_fold.columns, index=X_train_fold.index
    )
    X_test_scaled = pd.DataFrame(
        X_test_scaled, columns=X_test_fold.columns, index=X_test_fold.index
    )

    # Keep this fold's artefacts for saving and later inspection
    X_train_all_folds.append(X_train_scaled)
    X_test_all_folds.append(X_test_scaled)
    y_train_all_folds.append(y_train_fold)
    y_test_all_folds.append(y_test_fold)
    scalers.append(scaler)

    # Report split sizes and class balance so stratification can be eyeballed
    print(f"Fold {fold+1} training set shape: {X_train_scaled.shape}")
    print(f"Fold {fold+1} testing set shape: {X_test_scaled.shape}")
    print(f"Fold {fold+1} training target distribution:\n{y_train_fold.value_counts(normalize=True)}")
    print(f"Fold {fold+1} testing target distribution:\n{y_test_fold.value_counts(normalize=True)}")

# Persist every fold's splits and fitted scaler to disk.
# The destination can be overridden through the FOLD_OUTPUT_DIR environment
# variable; the previous hard-coded location stays the default so existing
# runs write to the same place as before.
output_dir = os.path.join(
    os.environ.get(
        'FOLD_OUTPUT_DIR',
        '/Users/aminosaurier/Downloads/spring_2025_startup_survival/new-analysis-relationships-0/',
    ),
    '',  # guarantees a trailing separator for code that appends file names directly
)


def _save_labels(labels, path):
    """Write one fold's labels to `path` as CSV without the index.

    A pandas Series is written with its name as the header row; any other
    object is wrapped in a DataFrame and written without a header.
    """
    if isinstance(labels, pd.Series):
        labels.to_csv(path, index=False, header=True)
    else:
        pd.DataFrame(labels).to_csv(path, index=False, header=False)


# Create the directory up front so the first write cannot fail on a missing folder
os.makedirs(output_dir, exist_ok=True)

print("\nSaving fold data...")
for fold in range(n_folds):
    fold_tag = f'fold{fold+1}'

    # Scaled feature matrices (index dropped, header kept)
    X_train_all_folds[fold].to_csv(os.path.join(output_dir, f'X_train_{fold_tag}.csv'), index=False)
    X_test_all_folds[fold].to_csv(os.path.join(output_dir, f'X_test_{fold_tag}.csv'), index=False)

    # Matching label vectors
    _save_labels(y_train_all_folds[fold], os.path.join(output_dir, f'y_train_{fold_tag}.csv'))
    _save_labels(y_test_all_folds[fold], os.path.join(output_dir, f'y_test_{fold_tag}.csv'))

    # Fitted scaler, so the same transformation can be re-applied later.
    # NOTE: pickle files must only ever be loaded from trusted locations.
    with open(os.path.join(output_dir, f'scaler_{fold_tag}.pkl'), 'wb') as f:
        pickle.dump(scalers[fold], f)

    print(f"Saved fold {fold+1} data")

print("\nStratified K-Fold Cross Validation and Feature Scaling completed successfully!")


Implementing Stratified K-Fold Cross Validation...

Performing 5-fold cross-validation...

Processing fold 1/5...
Scaling features for fold 1...
Fold 1 training set shape: (41290, 4)
Fold 1 testing set shape: (10323, 4)
Fold 1 training target distribution:
success_binary
0    0.677428
1    0.322572
Name: proportion, dtype: float64
Fold 1 testing target distribution:
success_binary
0    0.677419
1    0.322581
Name: proportion, dtype: float64

Processing fold 2/5...
Scaling features for fold 2...
Fold 2 training set shape: (41290, 4)
Fold 2 testing set shape: (10323, 4)
Fold 2 training target distribution:
success_binary
0    0.677428
1    0.322572
Name: proportion, dtype: float64
Fold 2 testing target distribution:
success_binary
0    0.677419
1    0.322581
Name: proportion, dtype: float64

Processing fold 3/5...
Scaling features for fold 3...
Fold 3 training set shape: (41290, 4)
Fold 3 testing set shape: (10323, 4)
Fold 3 training target distribution:
success_binary
0    0.677428
1  

In [6]:
# Recap, fold by fold, the size of every artefact produced above
print("\nSummary of outputs:")
for fold in range(n_folds):
    summary_lines = [
        f"\nFold {fold+1}:",
        f"X_train_fold{fold+1}.csv: {X_train_all_folds[fold].shape[0]} samples, {X_train_all_folds[fold].shape[1]} features",
        f"X_test_fold{fold+1}.csv: {X_test_all_folds[fold].shape[0]} samples, {X_test_all_folds[fold].shape[1]} features",
        f"y_train_fold{fold+1}.csv: {len(y_train_all_folds[fold])} labels",
        f"y_test_fold{fold+1}.csv: {len(y_test_all_folds[fold])} labels",
        f"scaler_fold{fold+1}.pkl: StandardScaler object",
    ]
    # One print per fold; joining with newlines yields the same text as separate prints
    print("\n".join(summary_lines))

# Overlay, for each feature, the histograms of every fold's scaled test split
# so that differences between folds are visible at a glance.
fig = plt.figure(figsize=(14, 8))

# Plot at most the first five features (the 2x3 grid below has six slots)
features_to_plot = min(5, X.shape[1])
feature_names = list(X.columns[:features_to_plot])

for i, feature in enumerate(feature_names):
    ax = fig.add_subplot(2, 3, i + 1)
    for fold in range(n_folds):
        ax.hist(X_test_all_folds[fold][feature], alpha=0.5, bins=20, label=f'Fold {fold+1}')
    ax.set_title(f'Distribution of {feature} across folds')
    # Values come from StandardScaler.transform, hence "standardized"
    ax.set_xlabel(f'{feature} (standardized)')
    ax.set_ylabel('Count')
    ax.legend()

fig.tight_layout()
# os.path.join gives a valid path whether or not output_dir ends with a
# separator, and matches how the file-existence check builds its paths
figure_path = os.path.join(output_dir, 'feature_distributions_across_folds.png')
fig.savefig(figure_path)
# Close this specific figure to release its memory
plt.close(fig)

print("\nSaved visualization of feature distributions to feature_distributions_across_folds.png")

# Confirm that each artefact expected for every fold is present in output_dir
print("\nChecking if files were created successfully:")
for fold in range(n_folds):
    expected_files = [
        f'X_train_fold{fold+1}.csv', f'X_test_fold{fold+1}.csv',
        f'y_train_fold{fold+1}.csv', f'y_test_fold{fold+1}.csv',
        f'scaler_fold{fold+1}.pkl',
    ]
    for filename in expected_files:
        # Guard clause: report the missing file and move on to the next one
        if not os.path.exists(os.path.join(output_dir, filename)):
            print(f"{filename}: ✗ (File does not exist)")
            continue
        print(f"{filename}: ✓ (File exists)")


Summary of outputs:

Fold 1:
X_train_fold1.csv: 41290 samples, 4 features
X_test_fold1.csv: 10323 samples, 4 features
y_train_fold1.csv: 41290 labels
y_test_fold1.csv: 10323 labels
scaler_fold1.pkl: StandardScaler object

Fold 2:
X_train_fold2.csv: 41290 samples, 4 features
X_test_fold2.csv: 10323 samples, 4 features
y_train_fold2.csv: 41290 labels
y_test_fold2.csv: 10323 labels
scaler_fold2.pkl: StandardScaler object

Fold 3:
X_train_fold3.csv: 41290 samples, 4 features
X_test_fold3.csv: 10323 samples, 4 features
y_train_fold3.csv: 41290 labels
y_test_fold3.csv: 10323 labels
scaler_fold3.pkl: StandardScaler object

Fold 4:
X_train_fold4.csv: 41291 samples, 4 features
X_test_fold4.csv: 10322 samples, 4 features
y_train_fold4.csv: 41291 labels
y_test_fold4.csv: 10322 labels
scaler_fold4.pkl: StandardScaler object

Fold 5:
X_train_fold5.csv: 41291 samples, 4 features
X_test_fold5.csv: 10322 samples, 4 features
y_train_fold5.csv: 41291 labels
y_test_fold5.csv: 10322 labels
scaler_fold5.p

In [None]:
# Summary of this notebook:
# - Creates a 5-fold stratified cross-validation setup (adjust n_folds as needed).
# - Maintains the data-splitting approach of the original code.
# - Scales features independently for each fold: a StandardScaler is fitted on the fold's training split and applied to its test split.
# - Saves the training/testing data and the fitted scaler for each fold separately.
# - Creates a visualization of feature distributions across folds.
# - Prints summary statistics (split shapes and class proportions) for each fold.
# Stratified K-fold cross-validation gives more reliable performance estimates than a single train-test split, especially for imbalanced datasets.