# In [None]:
import pandas as pd
import numpy as np
import pickle

def _load_artifact(path):
    """Unpickle and return the object stored at *path*.

    NOTE(security): ``pickle.load`` can execute arbitrary code embedded in
    the file, so only artifacts written by a trusted training run should be
    loaded here.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


def predict_new_data(acquiring_path, acquired_path, acquisitions_path, output_path,
                     *, artifacts_dir='.', employees_fill_value=450):
    """
    Process new data and make predictions using saved models.

    The three input CSVs are preprocessed with the pickled transformers,
    merged, imputed, scaled, projected with PCA and scored with the pickled
    model. The merged frame plus a ``Predicted_Deal_Size`` column is written
    to ``output_path``.

    This function calls ``process_acquired_new``, ``process_acquisitions_new``
    and ``merge_datasets``, none of which are defined inside it; they must be
    available in the enclosing scope when it runs.

    Args:
        acquiring_path: Path to new acquiring companies CSV
        acquired_path: Path to new acquired companies CSV
        acquisitions_path: Path to new acquisitions CSV
        output_path: Where to save predictions
        artifacts_dir: Directory holding the ``*.pkl`` artifacts. Defaults to
            the current working directory, matching the previous behavior.
        employees_fill_value: Value substituted for a missing or zero
            'Number of Employees'. Defaults to 450, the constant previously
            hard-coded here; it should be the mean saved from training.

    Raises:
        FileNotFoundError: If a pickle artifact or input CSV is missing.
    """
    # Local import: the file's top-level imports do not include pathlib.
    from pathlib import Path

    artifacts_dir = Path(artifacts_dir)

    def load(filename):
        """Load one pickled artifact from ``artifacts_dir``."""
        return _load_artifact(artifacts_dir / filename)

    # ===================================
    # Load All Saved Preprocessing Objects
    # ===================================

    # Acquiring company transformers
    mlb_acquiring = load('mlb_acquiring.pkl')
    tfidf_acquiring = load('tfidf_acquiring.pkl')

    # Acquired company / acquisitions transformers.
    # NOTE(review): these three are loaded but not referenced in the code
    # visible in this function; presumably they are meant for the acquired
    # and acquisitions processors -- confirm against those functions.
    mlb_acquired = load('mlb_acquired.pkl')
    label_encoders_acquired = load('label_encoders_acquired.pkl')
    ohe_acquisitions = load('ohe_acquisitions.pkl')

    # Final preprocessing objects
    final_imputer = load('final_imputer.pkl')
    final_scaler = load('final_scaler.pkl')
    final_pca = load('final_pca.pkl')
    target_encoder = load('target_encoder.pkl')

    # Model
    model = load('final_model.pkl')

    # ==============================
    # Preprocess New Data (Same as Training)
    # ==============================

    def process_acquiring_new(data):
        """Process new acquiring data with saved transformers.

        Works on a copy of ``data`` and returns a new frame in which the
        raw category and text columns are replaced by their encoded features.
        """
        data = data.copy()

        # Drop presentation-only columns; tolerate their absence.
        data = data.drop(
            columns=['CrunchBase Profile', 'Image', 'Homepage', 'Twitter', 'API'],
            errors='ignore',
        )

        # Strip thousands separators, then treat missing as 0 so the column
        # can be cast to int.
        employees = data['Number of Employees'].replace({',': ''}, regex=True)
        employees = employees.fillna(0).astype(int)
        # Zero (missing or genuinely zero) is replaced by the training mean.
        data['Number of Employees'] = employees.replace(0, employees_fill_value)

        # IPO status: "Not yet" counts as missing; anything else means public.
        ipo = data['IPO'].replace("Not yet", np.nan)
        data['Is_Public'] = ipo.notna().astype(int)
        data = data.drop(columns='IPO')

        # Market Categories -> multi-hot columns using the saved binarizer.
        data['Market Categories'] = data['Market Categories'].fillna('')
        category_dummies = pd.DataFrame(
            mlb_acquiring.transform(data['Market Categories'].str.split(',')),
            columns=mlb_acquiring.classes_,
            index=data.index,
        )

        # Tagline + Description -> TF-IDF features using the saved vectorizer.
        data['Text_Combined'] = (
            data['Tagline'].fillna('') + ' ' + data['Description'].fillna('')
        )
        tfidf_features = tfidf_acquiring.transform(data['Text_Combined'])
        tfidf_df = pd.DataFrame(
            tfidf_features.toarray(),
            columns=tfidf_acquiring.get_feature_names_out(),
            index=data.index,
        )

        # Attach the encoded features and remove the raw source columns.
        data = pd.concat([data, category_dummies, tfidf_df], axis=1)
        data = data.drop(
            columns=['Market Categories', 'Tagline', 'Description', 'Text_Combined']
        )

        return data

    # Similar functions for acquired and acquisitions...
    # (Implementation would mirror the training preprocessing but using saved transformers)

    # Preprocess each new dataset
    acquiring_new = process_acquiring_new(pd.read_csv(acquiring_path))
    acquired_new = process_acquired_new(pd.read_csv(acquired_path))
    acquisitions_new = process_acquisitions_new(pd.read_csv(acquisitions_path))

    # Merge the new data (same as training)
    final_new = merge_datasets(
        acquiring_new, acquired_new, acquisitions_new, save_artifacts=False
    )

    # Handle missing values with saved imputer. The transform result is
    # re-wrapped with final_new's own index so rows stay aligned with it.
    final_new_imputed = pd.DataFrame(
        final_imputer.transform(final_new),
        columns=final_new.columns,
        index=final_new.index,
    )

    # Prepare features: drop target/identifier columns if they are present.
    X_new = final_new_imputed.drop(
        ['Deal size class', 'Acquired Company', 'Acquiring Company'],
        axis=1, errors='ignore'
    )

    # Apply same scaling and PCA as training
    X_new_scaled = final_scaler.transform(X_new)
    X_new_pca = final_pca.transform(X_new_scaled)

    # Make predictions and map encoded classes back to their labels
    predictions_encoded = model.predict(X_new_pca)
    predictions = target_encoder.inverse_transform(predictions_encoded)

    # Save predictions alongside the merged (pre-imputation) data
    final_new['Predicted_Deal_Size'] = predictions
    final_new.to_csv(output_path, index=False)

    print(f"Predictions saved to {output_path}")

if __name__ == "__main__":
    # Default file locations used when this file is run as a script.
    default_run = {
        "acquiring_path": "new_acquiring.csv",
        "acquired_path": "new_acquired.csv",
        "acquisitions_path": "new_acquisitions.csv",
        "output_path": "new_predictions.csv",
    }
    predict_new_data(**default_run)