In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pickle

# 1. Load Data
# Input CSV and output pickle locations, named once so they are easy to
# retarget; the values are unchanged.
DATA_PATH = '/kaggle/input/datasets/chaitanyapotnurii/dataset/startup data.csv'
MODEL_PATH = 'random_forest_model.pkl'
df = pd.read_csv(DATA_PATH)

# 2. Preprocessing & Feature Selection
# Fill missing values for milestones (a missing age is replaced with 0)
for milestone_col in ('age_first_milestone_year', 'age_last_milestone_year'):
    df[milestone_col] = df[milestone_col].fillna(0)

# Map target: acquired -> 1, closed -> 0
# Series.map turns every value that is not a key of the mapping into NaN.
# Fail here with a clear message instead of letting NaN targets reach the
# stratified split / model fit.
status_codes = df['status'].map({'acquired': 1, 'closed': 0})
unmapped = status_codes.isna()
if unmapped.any():
    bad_values = sorted(df.loc[unmapped, 'status'].astype(str).unique())
    raise ValueError(
        f"Unexpected or missing 'status' values in {int(unmapped.sum())} "
        f"row(s): {bad_values}"
    )
df['status'] = status_codes

# Select only numeric columns for X.
# This removes string/object columns automatically; the target and the
# explicitly listed columns are dropped as well (errors='ignore' tolerates
# any of them being absent).
X = df.select_dtypes(include=[np.number]).drop(columns=['status', 'Unnamed: 0', 'labels'], errors='ignore')
y = df['status']

# 3. Split Data
# 80/20 split, stratified on the target so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 4. Train Model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 5. Save Model
with open(MODEL_PATH, 'wb') as f:
    pickle.dump(model, f)

print("--- Success! ---")
print(f"Model trained using {len(X.columns)} numeric features.")
print(f"Features list: {X.columns.tolist()}")

# 6. Evaluate on the held-out split
# The test split was previously created but never used; report per-class
# precision/recall/F1 so the saved model's quality is visible. Printed after
# the existing messages so their output is unchanged.
print("--- Hold-out evaluation ---")
print(classification_report(y_test, model.predict(X_test)))

--- Success! ---
Model trained using 33 numeric features.
Features list: ['latitude', 'longitude', 'age_first_funding_year', 'age_last_funding_year', 'age_first_milestone_year', 'age_last_milestone_year', 'relationships', 'funding_rounds', 'funding_total_usd', 'milestones', 'is_CA', 'is_NY', 'is_MA', 'is_TX', 'is_otherstate', 'is_software', 'is_web', 'is_mobile', 'is_enterprise', 'is_advertising', 'is_gamesvideo', 'is_ecommerce', 'is_biotech', 'is_consulting', 'is_othercategory', 'has_VC', 'has_angel', 'has_roundA', 'has_roundB', 'has_roundC', 'has_roundD', 'avg_participants', 'is_top500']
