In [1]:
import pandas as pd
import numpy as np
import pickle
import re
import pycountry

def predict_new_data(acquiring_path, acquired_path, acquisitions_path, output_path,
                     artifacts_dir="."):
    """
    Preprocess three new CSV files with the saved transformers, predict the
    deal-size class with the saved model and write the result to a CSV file.

    Args:
        acquiring_path: Path to new acquiring companies CSV
        acquired_path: Path to new acquired companies CSV
        acquisitions_path: Path to new acquisitions CSV
        output_path: Where to save predictions
        artifacts_dir: Directory that holds the pickled preprocessing objects
            and the model. Defaults to the current working directory.

    Returns:
        None. The merged data plus a 'Predicted_Deal_Size' column is written
        to ``output_path``.

    Raises:
        FileNotFoundError: If any required ``.pkl`` artifact or input CSV is
            missing.

    Note:
        ``merge_datasets`` is not defined in this function; it must be
        available in the enclosing module/notebook namespace.
    """
    # Local import so the function does not depend on a file-level import.
    from pathlib import Path

    artifacts = Path(artifacts_dir)

    def load_artifact(file_name):
        """Unpickle one saved object from the artifacts directory."""
        # SECURITY: pickle.load can execute arbitrary code. Only load
        # artifacts produced by a trusted training run.
        with open(artifacts / file_name, 'rb') as f:
            return pickle.load(f)

    # ===================================
    # Load All Saved Preprocessing Objects
    # ===================================
    # Everything is loaded up front so a missing artifact fails before any
    # data is processed.

    # Acquiring company transformers
    mlb_acquiring = load_artifact('mlb_acquiring.pkl')
    loaded_filter = load_artifact("correlation_filter.pkl")
    loaded_reducer = load_artifact("category_reducer_acquiring.pkl")
    age_mode_col = load_artifact('Age_column_acquiring.pkl')
    loaded_ipo_transformer = load_artifact("ipo_transformer_acquiring.pkl")
    loaded_employee_cleaner = load_artifact("employee_cleaner_acquiring.pkl")
    tfidf_acquiring = load_artifact('tfidf_acquiring.pkl')

    # Acquired company transformers
    loaded_guesser = load_artifact("tagline_guesser_acquired.pkl")
    loaded_generalizer = load_artifact("category_generalizer_acquired.pkl")
    loaded_filler = load_artifact("country_region_filler_acquired.pkl")
    acquired_encoder = load_artifact("categorical_encoder_acquired.pkl")

    # Acquisitions transformers
    acquisitions_encoder = load_artifact('custom_acquisitions_encoder.pkl')
    acquisition_modes = load_artifact('mode_acquisitions_imputer.pkl')

    # Final preprocessing objects
    final_imputer = load_artifact('final_imputer.pkl')
    final_scaler = load_artifact('final_scaler.pkl')
    final_pca = load_artifact('final_pca.pkl')
    target_encoder = load_artifact('target_encoder.pkl')

    # Model
    model = load_artifact('final_model.pkl')

    # ==============================
    # Preprocess New Data (Same as Training)
    # ==============================

    def clean_text(text):
        """Lower-case, keep only letters/whitespace, collapse whitespace."""
        text = text.lower()
        text = re.sub(r'[^a-z\s]', '', text)  # remove punctuation/numbers
        text = re.sub(r'\s+', ' ', text).strip()  # remove extra whitespace
        return text

    def process_acquiring_new(data):
        """Process new acquiring data with saved transformers"""
        data = data.copy()

        # Apply same cleaning as training
        data.drop(['CrunchBase Profile', 'Image', 'Homepage', 'Twitter', 'API'],
                  axis=1, inplace=True, errors='ignore')

        # Strip thousands separators, then treat missing counts as 0.
        data['Number of Employees'] = data['Number of Employees'].replace({',': ''}, regex=True)
        data['Number of Employees'] = data['Number of Employees'].fillna(0).astype(int)

        # TODO: 450 is a hard-coded stand-in for the training mean; it should
        # be saved at training time and loaded here.
        data['Number of Employees'] = data['Number of Employees'].replace(0, 450)

        # Handle IPO status: "Not yet" means not public.
        # NOTE(review): 'IPO' is dropped here, before loaded_ipo_transformer
        # runs below - confirm that transformer does not need the raw column.
        data['IPO'] = data['IPO'].replace("Not yet", np.nan)
        data['Is_Public'] = data['IPO'].notna().astype(int)
        data.drop('IPO', axis=1, inplace=True)

        # Process Market Categories with saved MLB
        data['Market Categories'] = data['Market Categories'].fillna('')
        category_dummies = pd.DataFrame(
            mlb_acquiring.transform(data['Market Categories'].str.split(',')),
            columns=mlb_acquiring.classes_,
            index=data.index
        )

        # Apply the saved custom transformers in the same order as before.
        data = loaded_filter.transform(data)
        data = loaded_reducer.transform(data)
        data = age_mode_col.transform(data)
        data = loaded_ipo_transformer.transform(data)
        data = loaded_employee_cleaner.transform(data)

        # Process text with saved TF-IDF
        data['Text_Combined'] = data['Tagline'].fillna('') + ' ' + data['Description'].fillna('')
        data['Text_Combined'] = data['Text_Combined'].apply(clean_text)
        tfidf_features = tfidf_acquiring.transform(data['Text_Combined'])
        tfidf_df = pd.DataFrame(
            tfidf_features.toarray(),
            columns=tfidf_acquiring.get_feature_names_out(),
            index=data.index
        )

        # Final cleanup
        data = pd.concat([data, category_dummies, tfidf_df], axis=1)
        data.drop(['Market Categories', 'Tagline', 'Description', 'Text_Combined', 'Address (HQ)',
                   'Board Members', 'Founders'],
                  axis=1, inplace=True)

        return data

    def process_acquisitions_new(data):
        """Process new acquisitions data with saved transformers"""
        data = data.copy()

        # Drop columns same as training
        data.drop(columns=["Acquisition Profile", "News", "News Link"],
                  inplace=True, errors='ignore')

        # Handle missing values with saved modes.
        # Assign the result back: an in-place fillna on a column selection is
        # deprecated and does not update the frame under copy-on-write.
        for col, mode_val in acquisition_modes.items():
            if col in data.columns and mode_val is not None:
                data[col] = data[col].fillna(mode_val)

        # Convert date features same as training
        if 'Deal announced on' in data.columns:
            data['Deal_date'] = pd.to_datetime(data['Deal announced on'], dayfirst=True, errors='coerce')
            data['Deal_day'] = data['Deal_date'].dt.day
            data['Deal_month'] = data['Deal_date'].dt.month
            data['Deal_dayofweek'] = data['Deal_date'].dt.dayofweek
            data.drop(['Deal announced on', 'Deal_date'], axis=1, inplace=True)

        # Apply saved encoding and append its output columns
        encoded_status_terms = acquisitions_encoder.transform(data)
        data = pd.concat([data, encoded_status_terms], axis=1)

        return data

    def process_acquired_new(data):
        """Process new acquired data with saved transformers"""
        data = data.copy()

        data = loaded_guesser.transform(data)
        data = loaded_generalizer.transform(data)
        data = loaded_filler.transform(data)
        data = acquired_encoder.transform(data)

        # Missing acquirer defaults to 'Salesforce'; assigned back for the
        # same copy-on-write reason as above.
        data['Acquired by'] = data['Acquired by'].fillna('Salesforce')

        columns_to_drop = ['Image', 'CrunchBase Profile', 'Homepage', 'Twitter', 'Address (HQ)', 'API', 'Description',
                           'Tagline', 'Market Categories', 'City (HQ)', 'State / Region (HQ)', 'Country (HQ)',
                           'Generalized Market Categories', 'Year Founded']
        data.drop(columns=columns_to_drop, inplace=True)

        return data

    # Preprocess each new dataset
    acquiring_new = process_acquiring_new(pd.read_csv(acquiring_path))
    acquired_new = process_acquired_new(pd.read_csv(acquired_path))
    acquisitions_new = process_acquisitions_new(pd.read_csv(acquisitions_path))

    # Merge the new data (same as training)
    final_new = merge_datasets(
        acquiring_new, acquired_new, acquisitions_new, save_artifacts=False
    )

    # Handle missing values with saved imputer
    final_new_imputed = pd.DataFrame(
        final_imputer.transform(final_new),
        columns=final_new.columns
    )

    # Prepare features: remove the target and identifier columns if present
    X_new = final_new_imputed.drop(
        ['Deal size class', 'Acquired Company', 'Acquiring Company'],
        axis=1, errors='ignore'
    )

    # Apply same scaling and PCA as training
    X_new_scaled = final_scaler.transform(X_new)
    X_new_pca = final_pca.transform(X_new_scaled)

    # Make predictions and map them back to the original class labels
    predictions_encoded = model.predict(X_new_pca)
    predictions = target_encoder.inverse_transform(predictions_encoded)

    # Save predictions with original data
    final_new['Predicted_Deal_Size'] = predictions
    final_new.to_csv(output_path, index=False)

    print(f"Predictions saved to {output_path}")

if __name__ == "__main__":
    # Run the pipeline on the default input files and write the default
    # output file.
    default_arguments = {
        "acquiring_path": "new_acquiring.csv",
        "acquired_path": "new_acquired.csv",
        "acquisitions_path": "new_acquisitions.csv",
        "output_path": "new_predictions.csv",
    }
    predict_new_data(**default_arguments)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


FileNotFoundError: [Errno 2] No such file or directory: 'correlation_filter.pkl'

In [None]:
import tkinter as tk
from tkinter import filedialog, messagebox

def browse_directory(entry_widget):
    """Ask the user for a CSV file and put its path into ``entry_widget``.

    The function keeps its historical name, but it opens a *file* chooser:
    the selected paths are later passed to ``predict_new_data``, which reads
    each one with ``pd.read_csv`` and therefore needs files, not directories.

    Args:
        entry_widget: Widget exposing ``delete(first, last)`` and
            ``insert(index, text)`` (a ``tk.Entry`` in this file).

    Returns:
        None. The widget is left untouched when the dialog is cancelled.
    """
    selected_file = filedialog.askopenfilename(
        title="Select CSV file",
        filetypes=[("CSV files", "*.csv"), ("All files", "*.*")],
    )
    # A cancelled dialog yields an empty (falsy) value; keep the old text then.
    if selected_file:
        entry_widget.delete(0, tk.END)
        entry_widget.insert(0, selected_file)

def predict_data():
    """Read the three path entries, validate them and run the prediction.

    Shows an error dialog and returns early if any entry is blank. Otherwise
    calls ``predict_new_data`` with the three paths and the fixed output file
    "new_predictions.csv", reporting success or failure in a message box.
    """
    # Collect the current text of each path entry, in argument order.
    selected_paths = [widget.get() for widget in (entry1, entry2, entry3)]

    # Reject blank or whitespace-only entries before doing any work.
    if any(not candidate.strip() for candidate in selected_paths):
        messagebox.showerror("Error", "All paths must be selected!")
        return

    # Top-level GUI boundary: any failure is reported to the user instead of
    # crashing the Tk callback.
    try:
        predict_new_data(*selected_paths, "new_predictions.csv")
        messagebox.showinfo("Success", "Prediction completed successfully!")
    except Exception as e:
        messagebox.showerror("Error", f"Prediction failed: {str(e)}")

# Create the main application window. `root` is the parent of every widget
# built below, so it must exist before any of them are created.
root = tk.Tk()
root.title("Predictor")  # text shown in the window title bar
root.geometry("1200x800")  # initial window size as "<width>x<height>"

# Build one labelled path row: caption, text entry and a Browse button.
def create_file_input(row, label_text):
    """Add a label / entry / "Browse" button row to the main window.

    Args:
        row: Grid row in ``root`` where the three widgets are placed.
        label_text: Caption shown in the first column.

    Returns:
        The ``tk.Entry`` holding the path, so the caller can read it later.
    """
    cell_padding = {"padx": 10, "pady": 5}

    # Column 0: left-aligned caption.
    caption = tk.Label(root, text=label_text)
    caption.grid(row=row, column=0, sticky=tk.W, **cell_padding)

    # Column 1: the editable path field.
    path_entry = tk.Entry(root, width=80)
    path_entry.grid(row=row, column=1, **cell_padding)

    def on_browse():
        # Fill this row's entry from the chooser dialog.
        browse_directory(path_entry)

    # Column 2: button that opens the chooser for this row.
    chooser_button = tk.Button(root, text="Browse", command=on_browse)
    chooser_button.grid(row=row, column=2, **cell_padding)

    return path_entry

# Build the three path rows (grid rows 0-2) in the order predict_data passes
# them to predict_new_data.
entry1, entry2, entry3 = (
    create_file_input(row_index, caption)
    for row_index, caption in enumerate(
        ("Acquiring Path:", "Acquired Path:", "Acquisitions Path:")
    )
)

# Green "Predict" button spanning all three columns beneath the inputs.
predict_button = tk.Button(
    root,
    text="Predict",
    command=predict_data,
    bg="#4CAF50",
    fg="white",
    font=("Arial", 12),
)
predict_button.grid(row=3, column=0, columnspan=3, pady=20, ipadx=10, ipady=5)

# Let the entry column absorb extra horizontal space when the window grows.
root.grid_columnconfigure(1, weight=1)

# Hand control to Tk's event loop (blocks until the window is closed).
root.mainloop()