In [1]:
# ======================== IMPORTS ========================
from collections import Counter
from joblib import dump
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import VotingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.svm import SVC
from sklearn.svm import SVR
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import pycountry
import re
import seaborn as sns


In [2]:

COLUMN_ALIASES = {
    'IPO Date': 'IPO',
    'Went Public': 'IPO',
    'Public Listing': 'IPO'
}

def ensure_ipo(df):
    df.rename(columns=COLUMN_ALIASES, inplace=True)
    if 'IPO' not in df.columns:
        df['IPO'] = np.nan
    return df
# ======================== CLASSES ========================
class CorrelationFilter:
    def __init__(self, threshold=0.85):
        self.threshold = threshold
        self.to_drop = set()  # Stores columns to drop
        self.fitted = False   # Tracks if fit() was called

    def fit(self, data):
        """Identifies highly correlated columns to drop."""
        numerical_data = data.select_dtypes(include=[np.number])
        corr_matrix = numerical_data.corr()

        self.to_drop = set()  # Reset in case fit() is called again

        for i in range(len(corr_matrix.columns)):
            for j in range(i):
                if abs(corr_matrix.iloc[i, j]) > self.threshold:
                    colname = corr_matrix.columns[i]
                    self.to_drop.add(colname)

        self.fitted = True
        return self

    def transform(self, data):
        """Drops columns identified in fit()."""
        if not self.fitted:
            raise RuntimeError("Call fit() before transform()!")
        
        cols_to_drop = list(self.to_drop & set(data.columns))
        return data.drop(columns=cols_to_drop, errors='ignore')

    def fit_transform(self, data):
        """Combines fit() and transform()."""
        self.fit(data)
        return self.transform(data)

    def get_columns_to_drop(self):
        """Returns the list of columns to be dropped."""
        if not self.fitted:
            raise RuntimeError("Call fit() first!")
        return list(self.to_drop)
    
class CategoryReducer:
    def __init__(self, category_columns, top_n=15):
        self.category_columns = category_columns
        self.top_n = top_n
        self.top_categories = None  # Will store the top categories from training

    def fit(self, data):
        # Identify and store the top N categories (only during training)
        self.top_categories = (
            data[self.category_columns]
            .sum()
            .sort_values(ascending=False)
            .head(self.top_n)
            .index.tolist()
        )
        return self  # For sklearn compatibility

    def transform(self, data):
        if self.top_categories is None:
            raise RuntimeError("Call fit() before transform()!")

        # Keep only the top categories (from training)
        df_top = data[self.top_categories].copy()

        # Sum remaining categories into "Other"
        other_columns = list(set(self.category_columns) - set(self.top_categories))
        df_top['Other'] = data[other_columns].sum(axis=1).clip(upper=1)  # Ensures 0 or 1

        # Drop original columns and concatenate reduced set
        data = data.drop(columns=self.category_columns)
        return pd.concat([data, df_top], axis=1)

    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)

    # Handle pickle compatibility
    def __getstate__(self):
        return {k: v for k, v in self.__dict__.items() 
                if k in ['category_columns', 'top_n', 'top_categories',
                         'original_categories', 'expected_columns']}

    def __setstate__(self, state):
        self.__dict__.update(state)
        # Initialize missing attributes for old versions
        if not hasattr(self, 'original_categories'):
            self.original_categories = self.category_columns
        if not hasattr(self, 'expected_columns'):
            self.expected_columns = (self.top_categories + ['Other'] 
                                     if self.top_categories else None)
class AgeTransformer:
    def __init__(self, current_year=2025):
        self.current_year = current_year
        self.age_mode = None
        self.column_exists = True  # Track if column existed during training

    def fit(self, data):
        # Check if column exists in training data
        if 'Year Founded' not in data.columns:
            self.column_exists = False
            return self
            
        data = data.copy()
        data['Year Founded'] = pd.to_numeric(data['Year Founded'], errors='coerce')
        data['age'] = self.current_year - data['Year Founded']
        mode_series = data['age'].mode()
        self.age_mode = mode_series[0] if not mode_series.empty else 5
        return self

    def transform(self, data):
        """Handle missing 'Year Founded' column gracefully"""
        data = data.copy()
        
        # Create column if it doesn't exist
        if 'Year Founded' not in data.columns:
            if self.column_exists:
                # Column existed in training but missing in new data
                data['Year Founded'] = np.nan
            else:
                # Column never existed (new scenario)
                data['age'] = self.age_mode
                return data
                
        # Original processing if column exists
        data['Year Founded'] = pd.to_numeric(data['Year Founded'], errors='coerce')
        data['age'] = self.current_year - data['Year Founded']
        data.drop(columns=['Year Founded'], inplace=True, errors='ignore')
        data['age'].fillna(self.age_mode, inplace=True)
        return data

    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)
class IPOAgeTransformer:
    def __init__(self, current_year=2025, unknown_placeholder="Unknown"):
        self.current_year = current_year
        self.unknown_placeholder = unknown_placeholder  # Replace NaN values

    def fit(self, data):
        """Stateless (no training needed). For pipeline compatibility."""
        return self

    def transform(self, data):
        """Computes IPO age and replaces missing values."""
        # if 'IPO' not in data.columns:
        #     raise ValueError("Column 'IPO' not found in data.")

        df = data.copy()
        df = ensure_ipo(df)
        df['IPO'] = pd.to_numeric(df['IPO'], errors='coerce')
        
        # Compute age (clamp negative values to 0)
        df['age IPO'] = (self.current_year - df['IPO']).clip(lower=0)
        
        # Replace missing ages with placeholder
        df['age IPO'] = df['age IPO'].replace(
            np.nan, self.unknown_placeholder
        )
        
        # Drop original IPO column
        df.drop(columns=['IPO'], inplace=True, errors='ignore')
        
        return df

    def fit_transform(self, data):
        return self.transform(data)  # fit() is stateless
    
class EmployeeDataCleaner:
    def __init__(self):
        self.employee_mode = None
        self.mean_without_zeros = None
        self.fitted = False  # Safety flag

    def fit(self, data):
        """Compute and store statistics from training data."""
        # Validate columns
        required_columns = [
            'Number of Employees (year of last update)',
            'Number of Employees'
        ]
        for col in required_columns:
            if col not in data.columns:
                raise ValueError(f"Column '{col}' not found in data.")

        # Compute mode for 'Number of Employees (year of last update)'
        mode_series = data['Number of Employees (year of last update)'].mode()
        self.employee_mode = mode_series[0] if not mode_series.empty else 0

        # Compute mean (excluding zeros/negatives) for 'Number of Employees'
        non_zero_employees = data.loc[
            data['Number of Employees'] > 0, 'Number of Employees'
        ]
        self.mean_without_zeros = non_zero_employees.mean()

        # Fallback if all values are zero/NaN
        if pd.isna(self.mean_without_zeros):
            self.mean_without_zeros = data['Number of Employees'].median()  # or a global default

        self.fitted = True
        return self

    def transform(self, data):
        """Apply cleaning using statistics from fit()."""
        if not self.fitted:
            raise RuntimeError("Call fit() before transform()!")

        df = data.copy()

        # Fill missing values with training mode
        if 'Number of Employees (year of last update)' in df.columns:
            df['Number of Employees (year of last update)'].fillna(
                self.employee_mode, inplace=True
            )

        # Handle nulls/negatives and replace zeros with training mean
        if 'Number of Employees' in df.columns:
            df['Number of Employees'] = np.where(
                df['Number of Employees'].isna() | (df['Number of Employees'] < 0),
                0,
                df['Number of Employees']
            )
            df['Number of Employees'] = df['Number of Employees'].replace(
                0, self.mean_without_zeros
            )

        return df

    def fit_transform(self, data):
        return self.fit(data).transform(data)

class BoardMembersTransformer:
    def __init__(self, min_count=5):
        self.min_count = min_count  # Keep only members appearing ≥ min_count times
        self.common_members = None  # Stores frequent members from training
        self.member_counts_ = None  # Optional: track raw counts

    def fit(self, data):
        """Identify frequently occurring board members from training data."""
        if 'Board Members' not in data.columns:
            raise ValueError("Column 'Board Members' not found in data.")

        all_members = []
        for cell in data['Board Members'].dropna():
            members = [name.strip() for name in str(cell).split(',')]
            all_members.extend(members)

        # Count occurrences and filter by min_count
        self.member_counts_ = Counter(all_members)
        self.common_members = {
            name for name, count in self.member_counts_.items() 
            if count >= self.min_count
        }
        return self

    def transform(self, data):
        """Convert board members into binary features for common members."""
        if self.common_members is None:
            raise RuntimeError("Call fit() before transform()!")

        df = data.copy()
        
        # Create binary columns for each common member
        for member in self.common_members:
            df[f'Board Member: {member}'] = df['Board Members'].apply(
                lambda x: 1 if pd.notna(x) and member in str(x) else 0
            )

        # Optional: Add a summary feature (total members or binary "has members")
        df['Has Board Members'] = df['Board Members'].notna().astype(int)
        
        # Drop original column
        df.drop(columns=['Board Members'], inplace=True, errors='ignore')
        return df

    def fit_transform(self, data):
        return self.fit(data).transform(data)

class BoardMembersTransformer:
    def __init__(self, min_count=5):
        self.min_count = min_count  # Keep only members appearing ≥ min_count times
        self.common_members = None  # Stores frequent members from training
        self.member_counts_ = None  # Optional: track raw counts

    def fit(self, data):
        """Identify frequently occurring board members from training data."""
        if 'Board Members' not in data.columns:
            raise ValueError("Column 'Board Members' not found in data.")

        all_members = []
        for cell in data['Board Members'].dropna():
            members = [name.strip() for name in str(cell).split(',')]
            all_members.extend(members)

        # Count occurrences and filter by min_count
        self.member_counts_ = Counter(all_members)
        self.common_members = {
            name for name, count in self.member_counts_.items() 
            if count >= self.min_count
        }
        return self

    def transform(self, data):
        """Convert board members into binary features for common members."""
        if self.common_members is None:
            raise RuntimeError("Call fit() before transform()!")

        df = data.copy()
        
        # Create binary columns for each common member
        for member in self.common_members:
            df[f'Board Member: {member}'] = df['Board Members'].apply(
                lambda x: 1 if pd.notna(x) and member in str(x) else 0
            )

        # Optional: Add a summary feature (total members or binary "has members")
        df['Has Board Members'] = df['Board Members'].notna().astype(int)
        
        # Drop original column
        df.drop(columns=['Board Members'], inplace=True, errors='ignore')
        return df

    def fit_transform(self, data):
        return self.fit(data).transform(data)

class FoundersTransformer:
    def __init__(self, min_count=3):
        self.min_count = min_count  # Keep founders appearing ≥ min_count times
        self.common_founders = None  # Stores frequent founders from training
        self.founder_counts_ = None  # Optional: track raw counts

    def fit(self, data):
        """Identify frequently occurring founders from training data."""
        if 'Founders' not in data.columns:
            raise ValueError("Column 'Founders' not found in data.")

        all_founders = []
        for cell in data['Founders'].dropna():
            founders = [name.strip() for name in str(cell).split(',')]
            all_founders.extend(founders)

        # Count occurrences and filter by min_count
        self.founder_counts_ = Counter(all_founders)
        self.common_founders = {
            name for name, count in self.founder_counts_.items() 
            if count >= self.min_count
        }
        return self

    def transform(self, data):
        """Convert founders into binary features for common founders."""
        if self.common_founders is None:
            raise RuntimeError("Call fit() before transform()!")

        df = data.copy()
        
        # Create binary columns for each common founder
        for founder in self.common_founders:
            df[f'Founder: {founder}'] = df['Founders'].apply(
                lambda x: 1 if pd.notna(x) and founder in str(x) else 0
            )

        # Optional: Add a summary feature
        df['Has Founders'] = df['Founders'].notna().astype(int)
        
        # Drop original column
        df.drop(columns=['Founders'], inplace=True, errors='ignore')
        return df

    def fit_transform(self, data):
        return self.fit(data).transform(data)

class TaglineCategoryGuesser:

    def __init__(self):
        self.category_keywords = {'Artificial Intelligence': ['ai', 'machine learning', 'deep learning', 'neural network'], 'Mobile': ['mobile', 'android', 'ios', 'app store', 'smartphone'], 'E-Commerce': ['ecommerce', 'e-commerce', 'shopping', 'online store'], 'FinTech': ['finance', 'banking', 'payments', 'fintech', 'crypto', 'blockchain'], 'Healthcare': ['health', 'medical', 'hospital', 'doctor', 'pharma'], 'Social Media': ['social network', 'community', 'messaging', 'chat'], 'Gaming': ['game', 'gaming', 'video game', 'esports'], 'Cloud': ['cloud', 'saas', 'paas', 'infrastructure'], 'EdTech': ['education', 'learning', 'students', 'teaching', 'school'], 'Data Analytics': ['analytics', 'data science', 'big data', 'insights']}

    def guess_category_from_tagline(self, tagline):
        tagline = str(tagline).lower()
        matched = [cat for (cat, keywords) in self.category_keywords.items() if any((keyword in tagline for keyword in keywords))]
        if len(matched) == 0:
            matched = ['Software', 'Advertising']
        elif len(matched) == 1:
            matched.append('Software')
        return ', '.join(matched)

    def fit(self, data):
        return self

    def transform(self, data):
        df = data.copy()
        df['Tagline'] = df['Tagline'].fillna('')
        df['Market Categories'] = df['Market Categories'].fillna('Unknown')
        df['Market Categories'] = df.apply(lambda row: self.guess_category_from_tagline(row['Tagline']) if str(row['Market Categories']).strip().lower() in ['unknown', 'nan', 'none', ''] else row['Market Categories'], axis=1)
        return df

    def fit_transform(self, data):
        return self.fit(data).transform(data)
class MarketCategoryGeneralizer:

    def __init__(self):
        self.category_mapping = {'Software': 'Technology & Software', 'Advertising': 'Advertising & Marketing', 'E-Commerce': 'E-Commerce & Online Services', 'Mobile': 'Mobile & Consumer Electronics', 'Games': 'Games & Entertainment', 'Social Media': 'Social Networking & Communication', 'Cloud': 'Technology & Software', 'Finance': 'Finance & Payments', 'Healthcare': 'Healthcare & Wellness', 'Semiconductors': 'Technology Hardware', 'Data Analytics': 'Analytics & Data Science', 'Search': 'Advertising & Marketing', 'Video': 'Games & Entertainment', 'Networking': 'Telecom & Networks', 'Messaging': 'Social Networking & Communication', 'Education': 'Education & Learning', 'News': 'Media & News', 'Photo Sharing': 'Digital Media & Content', 'Mobile Payments': 'Finance & Payments', 'Robotics': 'Games & Entertainment', 'Music': 'Games & Entertainment', 'Photo Editing': 'Digital Media & Content', 'Online Rental': 'E-Commerce & Online Services', 'Location Based Services': 'Telecom & Networks', 'Enterprise Software': 'Technology & Software', 'Video Streaming': 'Games & Entertainment', 'PaaS': 'Technology & Software', 'SaaS': 'Technology & Software', 'Health and Wellness': 'Healthcare & Wellness', 'Web Hosting': 'Technology & Software', 'Internet of Things': 'IoT (Internet of Things)', 'Cloud Security': 'Technology & Software', 'Virtual Currency': 'Finance & Payments', 'Search Marketing': 'Advertising & Marketing', 'Mobile Social': 'Social Networking & Communication', 'Retail': 'Retail & Fashion', 'Consulting': 'Others & Miscellaneous', 'Aerospace': 'Others & Miscellaneous', 'Food Delivery': 'Consumer Goods & Services', 'Fashion': 'Retail & Fashion', 'Wine And Spirits': 'Consumer Goods & Services', 'Streaming': 'Games & Entertainment', 'Task Management': 'Others & Miscellaneous', 'Video Chat': 'Social Networking & Communication', 'Personalization': 'Advertising & Marketing', 'Shopping': 'E-Commerce & Online Services', 'Local': 'E-Commerce & Online Services', 'News': 'Media & News', 'Fraud Detection': 'Advertising & Marketing', 'Image Recognition': 'Technology Hardware', 'Virtualization': 'Games & Entertainment', 'Analytics': 'Analytics & Data Science', 'Video on Demand': 'Games & Entertainment', 'Mobile Payments': 'Finance & Payments', 'Marketing Automation': 'Advertising & Marketing', 'Consumer Electronics': 'Mobile & Consumer Electronics', 'Video Games': 'Games & Entertainment', 'Public Relations': 'Advertising & Marketing'}

    def map_categories(self, row):
        categories = str(row).split(',')
        generalized = []
        for cat in categories:
            cat = cat.strip()
            if cat in self.category_mapping:
                generalized.append(self.category_mapping[cat])
            else:
                generalized.append('Others & Miscellaneous')
        return ', '.join(set(generalized))

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()
        df['Generalized Market Categories'] = df['Market Categories'].fillna('').apply(self.map_categories)
        return df

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
class CountryRegionFiller:

    def __init__(self):
        self.countries = [country.name for country in pycountry.countries]
        self.regions = ['California', 'New York', 'Texas', 'Basel', 'Utah', 'Île-de-France', 'Bavaria', 'Ontario', 'Switzerland', 'United States', 'France', 'Great Britain', 'Israel', 'Sweden', 'Canada', 'Germany', 'Japan', 'India', 'Denmark', 'China', 'Spain', 'Netherlands', 'Finland', 'Australia', 'Ireland', 'United Stats of AMerica', 'United Arab Emirates', 'Quebec']

    def find_place(self, text, place_list):
        for place in place_list:
            if re.search('\\b' + re.escape(place) + '\\b', str(text)):
                return place
        return None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()
        for (idx, row) in df[df['Country (HQ)'].isnull() | df['State / Region (HQ)'].isnull()].iterrows():
            desc = row['Description']
            if pd.isnull(desc):
                continue
            country = self.find_place(desc, self.countries)
            region = self.find_place(desc, self.regions)
            if pd.isnull(row['Country (HQ)']) and country:
                df.at[idx, 'Country (HQ)'] = country
            if pd.isnull(row['State / Region (HQ)']) and region:
                df.at[idx, 'State / Region (HQ)'] = region
        return df

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)
class CategoricalFillerAndEncoder:

    def __init__(self, columns):
        self.columns = columns
        self.modes = {}
        self.label_encoders = {}
        self.label_maps = {}

    def fit(self, X, y=None):
        for col in self.columns:
            mode_val = X[col].mode()[0]
            self.modes[col] = mode_val
            le = LabelEncoder()
            filled = X[col].fillna(mode_val).astype(str)
            le.fit(filled)
            self.label_encoders[col] = le
            self.label_maps[col] = {label: i for (i, label) in enumerate(le.classes_)}
        return self

    def transform(self, X):
        df = X.copy()
        for col in self.columns:
            mode_val = self.modes[col]
            label_map = self.label_maps[col]
            df[col] = df[col].fillna(mode_val).astype(str)
            df[col + '_LabelEncoded'] = df[col].map(lambda x: label_map.get(x, -1))
        return df

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
class CustomEncoder(BaseEstimator, TransformerMixin):

    def __init__(self):
        self.status_cols_ = None
        self.terms_cols_ = None

    def fit(self, df):
        self.status_cols_ = df['Status'].unique()
        self.terms_cols_ = df['Terms'].unique()
        return self

    def transform(self, df):
        df = df.copy()
        df = pd.get_dummies(df, columns=['Status'], drop_first=False)
        df = pd.get_dummies(df, columns=['Terms'], drop_first=False)
        if 'Terms_Cash, Stock' in df.columns:
            cash_stock_mask = df['Terms_Cash, Stock'] == 1
            df.loc[cash_stock_mask, 'Terms_Cash'] = 1
            df.loc[cash_stock_mask, 'Terms_Stock'] = 1
            df = df.drop('Terms_Cash, Stock', axis=1)
        expected_cols = [f'Status_{s}' for s in self.status_cols_] + [f'Terms_{t}' for t in self.terms_cols_ if t != 'Cash, Stock']
        for col in expected_cols:
            if col not in df.columns:
                df[col] = 0
        return df[expected_cols]

In [3]:


def predict_new_data(acquiring_path, acquired_path, acquisitions_path, output_path):
    """
    Process new data and make predictions using saved models
    
    Args:
        acquiring_path: Path to new acquiring companies CSV
        acquired_path: Path to new acquired companies CSV  
        acquisitions_path: Path to new acquisitions CSV
        output_path: Where to save predictions
    """
    
    # ===================================
    # Load All Saved Preprocessing Objects
    # ===================================
    
    # Load acquiring company transformers
    with open('mlb_acquiring.pkl', 'rb') as f:
        mlb_acquiring = pickle.load(f)
    
    # Load the saved filter
    with open("correlation_filter.pkl", "rb") as f:
        loaded_filter = pickle.load(f)
    
    #
    with open("category_reducer_acquiring.pkl", "rb") as f:
        loaded_reducer = pickle.load(f)

    #
    with open('Age_column_acquiring.pkl', 'rb') as f:
        age_mode_col =  pickle.load(f)
    #
    with open("ipo_transformer_acquiring.pkl", "rb") as f:
        loaded_ipo_transformer = pickle.load(f)
    #
    with open("employee_cleaner_acquiring.pkl", "rb") as f:
        loaded_employee_cleaner = pickle.load(f)

    # Load the saved transformer,
    with open("board_members_transformer.pkl", "rb") as f:
        loaded_transformer = pickle.load(f)

    # Load the saved transformer
    with open("founders_transformer.pkl", "rb") as f:
        loaded_transformer = pickle.load(f)

    
    
    with open('tfidf_acquiring.pkl', 'rb') as f:
        tfidf_acquiring = pickle.load(f)
        
    #
    with open("tagline_guesser_acquired.pkl", "rb") as f:
        loaded_guesser = pickle.load(f)

    #
    with open("category_generalizer_acquired.pkl", "rb") as f:
        loaded_generalizer = pickle.load(f)

    #
    with open("country_region_filler_acquired.pkl", "rb") as f:
        loaded_filler = pickle.load(f)
    
    #
    with open("categorical_encoder_acquired.pkl", "rb") as f:
        encoder = pickle.load(f)

    # Load acquired company transformers
    with open('mlb_acquired.pkl', 'rb') as f:
        mlb_acquired = pickle.load(f)
    # with open('label_encoders_acquired.pkl', 'rb') as f:
    #     label_encoders_acquired = pickle.load(f)
        
    # Load acquisitions transformers
    # with open('ohe_acquisitions.pkl', 'rb') as f:
    #     ohe_acquisitions = pickle.load(f)
        
    # Load final preprocessing objects
    # with open('final_imputer.pkl', 'rb') as f:
    #     final_imputer = pickle.load(f)
    with open('final_scaler.pkl', 'rb') as f:
        final_scaler = pickle.load(f)
    with open('final_pca.pkl', 'rb') as f:
        final_pca = pickle.load(f)
    with open('target_encoder.pkl', 'rb') as f:
        target_encoder = pickle.load(f)
    

# Load saved preprocessing objects
    
    # Load model
    # with open('final_model.pkl', 'rb') as f:
    #     model = pickle.load(f)
    
    # ==============================
    # Preprocess New Data (Same as Training)
    # ==============================
    
    # Process each dataset with saved transformers
    def process_acquiring_new(data):
        """Process new acquiring data with saved transformers"""
        data = data.copy()
        
        # Apply same cleaning as training
        data.drop(['CrunchBase Profile','Image','Homepage','Twitter','API'], 
                 axis=1, inplace=True, errors='ignore')
                
        data['Number of Employees'] = data['Number of Employees'].replace({',': ''}, regex=True)
        data['Number of Employees'] = data['Number of Employees'].fillna(0).astype(int)
        for col in data.columns:
            if data[col].dtype == 'object':
                try:
                    data[col] = data[col].astype(int)
                except:
                    pass  # ignore if it fails, i.e., for non-numeric text
        # Use mean from training (would need to save this)
        # data['Number of Employees'] = data['Number of Employees'].replace(
        #     0, 450)  # Replace with saved mean from training
        
        # Handle IPO status
        data['IPO'] = data['IPO'].replace("Not yet", np.nan)
        data['Is_Public'] = data['IPO'].notna().astype(int)
        data.drop_duplicates(inplace=True)
        data.drop('IPO', axis=1, inplace=True)
        
        # Process Market Categories with saved MLB
        
    

        # instantiate and fit once

        # 1. build the list column so it’s available later
        data['Market Categories List'] = (
            data['Market Categories']
                .fillna('')          # protect against NaNs
                .str.split(',')      # convert to Python list
        )

        # 2. create the one-hot columns from that list column
        category_dummies = pd.DataFrame(
            mlb_acquiring.transform(data['Market Categories List']),
            columns=mlb_acquiring.classes_,
         index=data.index
     )
        data = pd.concat([data, category_dummies], axis=1)
        data.drop(columns=['Market Categories', 'Market Categories List'], inplace=True)

        # Option 1: Apply transform() directly (drops columns)
        data = loaded_filter.transform(data)
    
        # Option 2: Manually drop columns (if needed)
        columns_to_drop_test = loaded_filter.get_columns_to_drop()
        data = data.drop(columns=columns_to_drop_test, errors='ignore')

        # Use the loaded reducer on new data
        missing_original = set(loaded_reducer.original_categories) - set(data.columns)
        for col in missing_original:
            data[col] = 0
        
        # Now apply the reducer
        data = loaded_reducer.transform(data)
        
        #
        data = age_mode_col.transform(data)

        #
        data = loaded_ipo_transformer.transform(data)
        
        data.drop(columns=['Address (HQ)'], inplace=True)
        #
        data = loaded_employee_cleaner.transform(data)
        # data.drop(columns=['Founders'], inplace=True)

        # Process text with saved TF-IDF
        data['Text_Combined'] = data['Tagline'].fillna('') + ' ' + data['Description'].fillna('')
        def clean_text(text):
            text = text.lower()
            text = re.sub(r'[^a-z\s]', '', text)  # remove punctuation/numbers
            text = re.sub(r'\s+', ' ', text).strip()  # remove extra whitespace
            return text

        data['Text_Combined'] = data['Text_Combined'].apply(clean_text)
        tfidf_features = tfidf_acquiring.transform(data['Text_Combined'])
        tfidf_df = pd.DataFrame(
            tfidf_features.toarray(), 
            columns=tfidf_acquiring.get_feature_names_out(),
            index=data.index
        )
        
        # Final cleanup
        data = pd.concat([data, category_dummies, tfidf_df], axis=1)

        cols_to_drop = ['Market Categories', 'Tagline', 'Description', 'Text_Combined',
                'Address (HQ)', 'Board Members', 'Founders']

        existing_cols = [col for col in cols_to_drop if col in data.columns]
        data.drop(existing_cols, axis=1, inplace=True)

        
        return data
    
    # Similar functions for acquired and acquisitions...
    # (Implementation would mirror the training preprocessing but using saved transformers)
    

    def process_acquisitions_new(data):
        """Process new acquisitions data with saved transformers"""
        data = data.copy()
        
        
        # Drop columns same as training
        data.drop(columns=["Acquisition Profile", "News", "News Link"], 
                inplace=True, errors='ignore')
        
        with open('custom_acquisitions_encoder.pkl', 'rb') as f:
            encoder = pickle.load(f)
        with open('mode_acquisitions_imputer.pkl', 'rb') as f:
            modes = pickle.load(f)

        # Handle missing values with saved modes
        for col, mode_val in modes.items():
            if col in data.columns and mode_val is not None:
                data[col].fillna(mode_val, inplace=True)
        
        # Convert date features same as training
        if 'Deal announced on' in data.columns:
            data['Deal_date'] = pd.to_datetime(data['Deal announced on'], dayfirst=True, errors='coerce')
            data['Deal_day'] = data['Deal_date'].dt.day
            data['Deal_month'] = data['Deal_date'].dt.month
            data['Deal_dayofweek'] = data['Deal_date'].dt.dayofweek
            data.drop(['Deal announced on', 'Deal_date'], axis=1, inplace=True)
        
        # Apply saved one-hot encoding
        encoded_status_terms = encoder.transform(data)
        data = pd.concat([data, encoded_status_terms], axis=1)
        
        return data

    def process_acquired_new(data):
        """Process new acquiring data with saved transformers"""
        data = data.copy()

        data = loaded_guesser.transform(data)

        data = loaded_generalizer.transform(data)

        data = loaded_filler.transform(data)

        data = encoder.transform(data)
        
        data['Acquired by'].fillna('Salesforce', inplace=True)


        columns_to_drop = ['Image', 'CrunchBase Profile', 'Homepage', 'Twitter','Address (HQ)','API','Description',
                           'Tagline','Market Categories','City (HQ)', 'State / Region (HQ)', 'Country (HQ)','Generalized Market Categories','Year Founded']
        data.drop(columns=columns_to_drop, inplace=True)
        
        return data 
    
    
    
    # Preprocess each new dataset
    acquiring_new = process_acquiring_new(pd.read_csv(acquiring_path))
    acquired_new = process_acquired_new(pd.read_csv(acquired_path)) 
    acquisitions_new = process_acquisitions_new(pd.read_csv(acquisitions_path))
    
    def find_company_column(df):
        for col in df.columns:
            if "company" in col.lower():
                return col
        raise ValueError("No company name column found.")

    # Start with acquisition data
    final_df = acquisitions_new.copy()

    # Mapping for left_on column and its corresponding dataset
    merge_targets = [
        ('Acquired Company', acquired_new, '_Acquired'),
        ('Acquiring Company', acquiring_new, '_Acquiring')
    ]

    # Perform merges in loop
    for left_key, company_data, suffix in merge_targets:
        company_col = find_company_column(company_data)

        # Strip and lower case the company names for matching
        final_df[left_key] = final_df[left_key].str.strip().str.lower()
        company_data[company_col] = company_data[company_col].str.strip().str.lower()

        # Merge
        final_df = final_df.merge(
            company_data,
            how='left',
            left_on=left_key,
            right_on=company_col,
            suffixes=('', suffix)
        )

        # Drop the extra company column if you want
        final_df.drop(columns=[company_col], inplace=True, errors='ignore')



    # Done!
    print(final_df.head())
    
    
    # Handle missing values with saved imputer
    # final_new_imputed = pd.DataFrame(
    #     final_imputer.transform(final_df),
    #     columns=final_new.columns
    # )

    with open('mode_acquisitions_imputer.pkl', 'rb') as f:
        modes = pickle.load(f)

    for col, mode_val in modes.items():
       # if col in df.columns and mode_val is not None:
           final_df[col].fillna(mode_val, inplace=True)
    
    
    # Prepare features
    # X_new = final_df.drop(
    #     ['Deal size class', 'Acquired Company', 'Acquiring Company'], 
    #     axis=1, errors='ignore'
    # )
    with open('selected_features.pkl', 'rb') as f:
        selected_features = pickle.load(f)

    # Filter the test data to keep only those features
    X_new = final_df[selected_features]
    # Apply same scaling and PCA as training
    X_new_scaled = final_scaler.transform(X_new)
    X_new_pca = final_pca.transform(X_new_scaled)
    
    # Make predictions
    predictions_encoded = model.predict(X_new_pca)
    predictions = target_encoder.inverse_transform(predictions_encoded)
    
    # Save predictions with original data
    final_new['Predicted_Deal_Size'] = predictions
    final_new.to_csv(output_path, index=False)
    
    print(f"Predictions saved to {output_path}")

if __name__ == "__main__":
    predict_new_data(
        acquiring_path="new_acquiring.csv",
        acquired_path="new_acquired.csv",
        acquisitions_path="new_acquisitions.csv",
        output_path="new_predictions.csv"
    )

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['age'].fillna(self.age_mode, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Number of Employees (year of last update)'].fillna(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are sett

                       Acquisitions ID Acquired Company  \
0     EMC acquired Data Domain in 2009      data domain   
1           AOL acquired Quigo in 2007            quigo   
2      Cisco acquired PostPath in 2008         postpath   
3  Oracle acquired BigMachines in 2013      bigmachines   
4      Yahoo! acquired Snip.it in 2013          snip.it   

   Year of acquisition announcement Deal size class       Status        Terms  \
0                              2009           Large  Undisclosed         Cash   
1                              2007          Medium  Undisclosed         Cash   
2                              2008          Medium  Undisclosed  Undisclosed   
3                              2013          Medium  Undisclosed  Undisclosed   
4                              2013           Small  Undisclosed  Cash, Stock   

   Deal_day  Deal_month  Deal_dayofweek  Status_Undisclosed  ...       web  \
0       8.0         7.0             2.0                True  ...  0.000000   
1 

KeyError: 'City (HQ)_Acquiring'

In [None]:
import tkinter as tk
from tkinter import filedialog, messagebox

def browse_file(entry_widget):
    directory = filedialog.askopenfilename()
    if directory:
        entry_widget.delete(0, tk.END)
        entry_widget.insert(0, directory)

def predict_data():
    # Get input values from entries
    paths = [
        entry1.get(),
        entry2.get(),
        entry3.get()
    ]
    
    # Validate paths
    for path in paths:
        if not path.strip():
            messagebox.showerror("Error", "All paths must be selected!")
            return
    
    # Call your prediction function
    try:
        predict_new_data(*paths, "new_predictions.csv")
        messagebox.showinfo("Success", "Prediction completed successfully!")
    except Exception as e:
        messagebox.showerror("Error", f"Prediction failed: {str(e)}")

# Create main window
root = tk.Tk()
root.title("Predictor")
root.geometry("1200x800")

# Function to create consistent input fields with browse buttons
def create_file_input(row, label_text):
    # Label
    tk.Label(root, text=label_text).grid(row=row, column=0, padx=10, pady=5, sticky=tk.W)
    
    # Entry field
    entry = tk.Entry(root, width=80)
    entry.grid(row=row, column=1, padx=10, pady=5)
    
    # Browse button
    browse_btn = tk.Button(root, text="Browse", 
                          command=lambda: browse_file(entry))
    browse_btn.grid(row=row, column=2, padx=10, pady=5)
    
    return entry

# Create input fields with browse buttons
entry1 = create_file_input(0, "Acquiring Path:")
entry2 = create_file_input(1, "Acquired Path:")
entry3 = create_file_input(2, "Acquisitions Path:")

# Prediction button
predict_button = tk.Button(root, text="Predict", command=predict_data,
                          bg="#4CAF50", fg="white", font=("Arial", 12))
predict_button.grid(row=3, column=0, columnspan=3, pady=20, ipadx=10, ipady=5)

# Configure grid layout
root.grid_columnconfigure(1, weight=1)  # Make entry fields expandable

# Start the GUI
root.mainloop()