# In [2]:  (notebook cell prompt kept from the original export)
# ======================== IMPORTS ========================
from collections import Counter
from joblib import dump
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import VotingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.svm import SVC
from sklearn.svm import SVR
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import pycountry
import re
import seaborn as sns


# Alternative column headers that are all renamed to 'IPO'.
COLUMN_ALIASES = dict.fromkeys(
    ('IPO Date', 'Went Public', 'Public Listing'),
    'IPO',
)


def ensure_ipo(df):
    """Make sure *df* has an 'IPO' column, modifying *df* in place.

    Any column whose name is a key of COLUMN_ALIASES is renamed to 'IPO'.
    If no 'IPO' column exists after the rename, one is added and filled
    with NaN.

    Returns the same DataFrame object that was passed in.
    """
    df.rename(columns=COLUMN_ALIASES, inplace=True)
    if 'IPO' in df.columns:
        return df
    df['IPO'] = np.nan
    return df
# ======================== CLASSES ========================
class CorrelationFilter:
    """Drop numeric columns that are highly correlated with an earlier column.

    During fit() the pairwise correlation matrix of the numeric columns is
    computed; a column is marked for removal when the absolute correlation
    with any column that precedes it exceeds ``threshold``.
    """

    def __init__(self, threshold=0.85):
        self.threshold = threshold
        self.to_drop = set()   # column names selected for removal by fit()
        self.fitted = False    # becomes True once fit() has completed

    def fit(self, data):
        """Work out which numeric columns of *data* should be dropped."""
        corr = data.select_dtypes(include=[np.number]).corr()

        flagged = set()
        for position, name in enumerate(corr.columns):
            # Only the part of the row left of the diagonal: correlations
            # with the columns that come before this one.
            against_earlier = corr.iloc[position, :position]
            if (against_earlier.abs() > self.threshold).any():
                flagged.add(name)

        self.to_drop = flagged
        self.fitted = True
        return self

    def transform(self, data):
        """Return *data* without the columns selected in fit().

        Raises:
            RuntimeError: if fit() has not been called.
        """
        if not self.fitted:
            raise RuntimeError("Call fit() before transform()!")

        present = self.to_drop.intersection(data.columns)
        return data.drop(columns=list(present), errors='ignore')

    def fit_transform(self, data):
        """Run fit() on *data*, then transform() on the same data."""
        self.fit(data)
        return self.transform(data)

    def get_columns_to_drop(self):
        """Return the selected column names as a list.

        Raises:
            RuntimeError: if fit() has not been called.
        """
        if not self.fitted:
            raise RuntimeError("Call fit() first!")
        return list(self.to_drop)
    
class CategoryReducer:
    """Keep the ``top_n`` most frequent indicator columns and merge the rest.

    ``category_columns`` names the indicator columns to consider. fit()
    ranks them by column sum; transform() keeps the top ones and collapses
    all remaining category columns into a single 0/1 'Other' column.
    """

    def __init__(self, category_columns, top_n=15):
        self.category_columns = category_columns
        self.top_n = top_n
        self.top_categories = None  # set by fit()

    def fit(self, data):
        """Remember the ``top_n`` category columns with the largest sums."""
        totals = data[self.category_columns].sum()
        ranked = totals.sort_values(ascending=False)
        self.top_categories = ranked.head(self.top_n).index.tolist()
        return self

    def transform(self, data):
        """Replace the category columns by the top ones plus 'Other'.

        Raises:
            RuntimeError: if fit() has not been called.
        """
        if self.top_categories is None:
            raise RuntimeError("Call fit() before transform()!")

        kept = data[self.top_categories].copy()

        # Everything that is a category column but not a top category is
        # folded into one flag, capped at 1.
        leftovers = list(set(self.category_columns).difference(self.top_categories))
        kept['Other'] = data[leftovers].sum(axis=1).clip(upper=1)

        remainder = data.drop(columns=self.category_columns)
        return pd.concat([remainder, kept], axis=1)

    def fit_transform(self, data):
        """Run fit() on *data*, then transform() on the same data."""
        self.fit(data)
        return self.transform(data)

    def __getstate__(self):
        """Pickle only the known attributes that are present on the instance."""
        persisted = frozenset((
            'category_columns', 'top_n', 'top_categories',
            'original_categories', 'expected_columns',
        ))
        return {name: value for name, value in vars(self).items() if name in persisted}

    def __setstate__(self, state):
        """Restore state and derive attributes that the pickle did not carry."""
        vars(self).update(state)
        if not hasattr(self, 'original_categories'):
            self.original_categories = self.category_columns
        if not hasattr(self, 'expected_columns'):
            if self.top_categories:
                self.expected_columns = self.top_categories + ['Other']
            else:
                self.expected_columns = None
class AgeTransformer:
    """Replace the 'Year Founded' column by an 'age' column.

    age = ``current_year`` - 'Year Founded'. Values that cannot be parsed
    as numbers become missing and are filled with the most frequent age
    seen in fit().
    """

    def __init__(self, current_year=2025):
        self.current_year = current_year
        self.age_mode = None        # most frequent age in the training data
        self.column_exists = True   # whether 'Year Founded' was present in fit()

    def fit(self, data):
        """Learn the most frequent age from *data*.

        When 'Year Founded' is absent only ``column_exists`` is updated;
        ``age_mode`` keeps whatever value it had before.
        """
        if 'Year Founded' not in data.columns:
            self.column_exists = False
            return self

        # Reset the flag so a refit on data that has the column is not left
        # with a stale False from an earlier fit.
        self.column_exists = True

        founded = pd.to_numeric(data['Year Founded'], errors='coerce')
        modes = (self.current_year - founded).mode()
        # mode() is empty when every value is missing; fall back to 5.
        self.age_mode = modes.iloc[0] if not modes.empty else 5
        return self

    def transform(self, data):
        """Return a copy of *data* with 'age' added and 'Year Founded' removed.

        A missing 'Year Founded' column is tolerated: every row then
        receives ``age_mode``.
        """
        data = data.copy()

        if 'Year Founded' not in data.columns:
            if not self.column_exists:
                # Column was absent in fit() as well.
                # NOTE(review): age_mode is None here unless an earlier fit saw
                # the column, in which case 'age' is filled with None.
                data['age'] = self.age_mode
                return data
            # Column existed in training but is missing now: treat all as unknown.
            data['Year Founded'] = np.nan

        founded = pd.to_numeric(data['Year Founded'], errors='coerce')
        # Assign the filled result instead of calling fillna(inplace=True) on
        # data['age']: the chained in-place form does not update the frame
        # when pandas Copy-on-Write is active.
        data['age'] = (self.current_year - founded).fillna(self.age_mode)
        return data.drop(columns=['Year Founded'])

    def fit_transform(self, data):
        """Run fit() on *data*, then transform() on the same data."""
        self.fit(data)
        return self.transform(data)
class IPOAgeTransformer:
    """Replace the 'IPO' column by an 'age IPO' column.

    'age IPO' is ``current_year`` minus the IPO year, never below 0; rows
    without a usable IPO year receive ``unknown_placeholder``.
    """

    def __init__(self, current_year=2025, unknown_placeholder="Unknown"):
        self.current_year = current_year
        self.unknown_placeholder = unknown_placeholder  # value written for missing ages

    def fit(self, data):
        """Nothing is learned; returns self so fit/transform chaining works."""
        return self

    def transform(self, data):
        """Return a copy of *data* with 'age IPO' added and 'IPO' removed."""
        # ensure_ipo renames known aliases and creates 'IPO' when absent, so
        # a missing column does not raise here.
        frame = ensure_ipo(data.copy())
        ipo_year = pd.to_numeric(frame['IPO'], errors='coerce')

        # Negative ages are clamped to 0.
        years_since_ipo = (self.current_year - ipo_year).clip(lower=0)
        frame['age IPO'] = years_since_ipo.replace(np.nan, self.unknown_placeholder)

        return frame.drop(columns=['IPO'], errors='ignore')

    def fit_transform(self, data):
        """Same as transform(); there is no fitting step."""
        return self.transform(data)
    
class EmployeeDataCleaner:
    """Impute the two employee columns with statistics learned in fit().

    * 'Number of Employees (year of last update)': missing values are
      filled with the training mode.
    * 'Number of Employees': missing, negative and zero values are
      replaced by the training mean of the strictly positive values.
    """

    YEAR_COLUMN = 'Number of Employees (year of last update)'
    COUNT_COLUMN = 'Number of Employees'

    def __init__(self):
        self.employee_mode = None        # mode of YEAR_COLUMN in training data
        self.mean_without_zeros = None   # mean of positive COUNT_COLUMN values
        self.fitted = False              # guards transform() before fit()

    def fit(self, data):
        """Compute and store the fill values from *data*.

        Raises:
            ValueError: if one of the two employee columns is missing.
        """
        for col in (self.YEAR_COLUMN, self.COUNT_COLUMN):
            if col not in data.columns:
                raise ValueError(f"Column '{col}' not found in data.")

        modes = data[self.YEAR_COLUMN].mode()
        # mode() is empty when the column is entirely missing; fall back to 0.
        self.employee_mode = modes.iloc[0] if not modes.empty else 0

        counts = data[self.COUNT_COLUMN]
        self.mean_without_zeros = counts[counts > 0].mean()
        if pd.isna(self.mean_without_zeros):
            # No positive values at all: use the column median instead.
            self.mean_without_zeros = counts.median()

        self.fitted = True
        return self

    def transform(self, data):
        """Return a cleaned copy of *data*; absent columns are skipped.

        Raises:
            RuntimeError: if fit() has not been called.
        """
        if not self.fitted:
            raise RuntimeError("Call fit() before transform()!")

        df = data.copy()

        if self.YEAR_COLUMN in df.columns:
            # Assign the result back rather than using fillna(inplace=True) on
            # a column selection, which does not update the frame when pandas
            # Copy-on-Write is active.
            df[self.YEAR_COLUMN] = df[self.YEAR_COLUMN].fillna(self.employee_mode)

        if self.COUNT_COLUMN in df.columns:
            counts = df[self.COUNT_COLUMN]
            unusable = counts.isna() | (counts <= 0)
            df[self.COUNT_COLUMN] = counts.mask(unusable, self.mean_without_zeros)

        return df

    def fit_transform(self, data):
        """Run fit() on *data*, then transform() on the same data."""
        return self.fit(data).transform(data)

# NOTE(review): a class with this same name is defined again further down in
# this file; the later definition rebinds the name and is the one in effect.
class BoardMembersTransformer:
    """Turn the 'Board Members' text column into per-member indicator columns.

    fit() counts the comma-separated names and keeps those occurring at
    least ``min_count`` times; transform() adds one 0/1 column per kept
    name plus 'Has Board Members', and removes the original column.
    """

    def __init__(self, min_count=5):
        self.min_count = min_count   # minimum occurrences for a name to be kept
        self.common_members = None   # set of kept names, filled by fit()
        self.member_counts_ = None   # Counter of every name seen in fit()

    def fit(self, data):
        """Count names in 'Board Members' and keep the frequent ones.

        Raises:
            ValueError: if the 'Board Members' column is missing.
        """
        if 'Board Members' not in data.columns:
            raise ValueError("Column 'Board Members' not found in data.")

        self.member_counts_ = Counter(
            piece.strip()
            for cell in data['Board Members'].dropna()
            for piece in str(cell).split(',')
        )
        self.common_members = {
            name
            for name, seen in self.member_counts_.items()
            if seen >= self.min_count
        }
        return self

    def transform(self, data):
        """Return a copy of *data* with the indicator columns added.

        A member's flag is 1 when the member's name occurs anywhere inside
        the cell text.

        Raises:
            RuntimeError: if fit() has not been called.
        """
        if self.common_members is None:
            raise RuntimeError("Call fit() before transform()!")

        df = data.copy()
        raw = df['Board Members']

        for member in self.common_members:
            df[f'Board Member: {member}'] = raw.apply(
                lambda cell, wanted=member: int(pd.notna(cell) and wanted in str(cell))
            )

        df['Has Board Members'] = raw.notna().astype(int)
        return df.drop(columns=['Board Members'], errors='ignore')

    def fit_transform(self, data):
        """Run fit() on *data*, then transform() on the same data."""
        return self.fit(data).transform(data)

class BoardMembersTransformer:
    """Turn the 'Board Members' text column into per-member indicator columns.

    fit() counts the comma-separated names and keeps those occurring at
    least ``min_count`` times; transform() adds one 0/1 column per kept
    name (in sorted name order) plus 'Has Board Members', and removes the
    original column.

    NOTE(review): transform() matches whole names, the same way fit()
    counts them. Earlier code matched substrings (so "Al" also flagged
    "Alan"); models trained on those features may need to be refit.
    """

    def __init__(self, min_count=5):
        self.min_count = min_count   # minimum occurrences for a name to be kept
        self.common_members = None   # set of kept names, filled by fit()
        self.member_counts_ = None   # Counter of every name seen in fit()

    @staticmethod
    def _split_names(cell):
        """Return the set of non-empty, stripped names in one cell."""
        if pd.isna(cell):
            return frozenset()
        pieces = (piece.strip() for piece in str(cell).split(','))
        return frozenset(piece for piece in pieces if piece)

    def fit(self, data):
        """Count names in 'Board Members' and keep the frequent ones.

        Raises:
            ValueError: if the 'Board Members' column is missing.
        """
        if 'Board Members' not in data.columns:
            raise ValueError("Column 'Board Members' not found in data.")

        counts = Counter()
        for cell in data['Board Members'].dropna():
            # Count per occurrence (not per cell), skipping empty fragments
            # such as the one produced by a trailing comma.
            counts.update(
                name for name in (piece.strip() for piece in str(cell).split(',')) if name
            )

        self.member_counts_ = counts
        self.common_members = {
            name for name, seen in counts.items() if seen >= self.min_count
        }
        return self

    def transform(self, data):
        """Return a copy of *data* with the indicator columns added.

        Raises:
            RuntimeError: if fit() has not been called.
        """
        if self.common_members is None:
            raise RuntimeError("Call fit() before transform()!")

        df = data.copy()
        raw = df['Board Members']
        # Parse each cell once instead of once per member.
        names_per_row = [self._split_names(cell) for cell in raw]

        # sorted() gives a column order that does not depend on set
        # iteration order, which can differ between interpreter runs.
        for member in sorted(self.common_members):
            df[f'Board Member: {member}'] = [
                int(member in names) for names in names_per_row
            ]

        df['Has Board Members'] = raw.notna().astype(int)
        return df.drop(columns=['Board Members'], errors='ignore')

    def fit_transform(self, data):
        """Run fit() on *data*, then transform() on the same data."""
        return self.fit(data).transform(data)

class FoundersTransformer:
    """Turn the 'Founders' text column into per-founder indicator columns.

    fit() counts the comma-separated names and keeps those occurring at
    least ``min_count`` times; transform() adds one 0/1 column per kept
    name (in sorted name order) plus 'Has Founders', and removes the
    original column.

    NOTE(review): transform() matches whole names, the same way fit()
    counts them. Earlier code matched substrings (so "Ann Lee" also
    flagged "MaryAnn Lee"); models trained on those features may need to
    be refit.
    """

    def __init__(self, min_count=3):
        self.min_count = min_count    # minimum occurrences for a name to be kept
        self.common_founders = None   # set of kept names, filled by fit()
        self.founder_counts_ = None   # Counter of every name seen in fit()

    @staticmethod
    def _split_names(cell):
        """Return the set of non-empty, stripped names in one cell."""
        if pd.isna(cell):
            return frozenset()
        pieces = (piece.strip() for piece in str(cell).split(','))
        return frozenset(piece for piece in pieces if piece)

    def fit(self, data):
        """Count names in 'Founders' and keep the frequent ones.

        Raises:
            ValueError: if the 'Founders' column is missing.
        """
        if 'Founders' not in data.columns:
            raise ValueError("Column 'Founders' not found in data.")

        counts = Counter()
        for cell in data['Founders'].dropna():
            # Count per occurrence (not per cell), skipping empty fragments
            # such as the one produced by a trailing comma.
            counts.update(
                name for name in (piece.strip() for piece in str(cell).split(',')) if name
            )

        self.founder_counts_ = counts
        self.common_founders = {
            name for name, seen in counts.items() if seen >= self.min_count
        }
        return self

    def transform(self, data):
        """Return a copy of *data* with the indicator columns added.

        Raises:
            RuntimeError: if fit() has not been called.
        """
        if self.common_founders is None:
            raise RuntimeError("Call fit() before transform()!")

        df = data.copy()
        raw = df['Founders']
        # Parse each cell once instead of once per founder.
        names_per_row = [self._split_names(cell) for cell in raw]

        # sorted() gives a column order that does not depend on set
        # iteration order, which can differ between interpreter runs.
        for founder in sorted(self.common_founders):
            df[f'Founder: {founder}'] = [
                int(founder in names) for names in names_per_row
            ]

        df['Has Founders'] = raw.notna().astype(int)
        return df.drop(columns=['Founders'], errors='ignore')

    def fit_transform(self, data):
        """Run fit() on *data*, then transform() on the same data."""
        return self.fit(data).transform(data)

class TaglineCategoryGuesser:
    """Fill unknown 'Market Categories' values with a guess from 'Tagline'.

    Each category has a list of keywords; a category is guessed when one
    of its keywords starts at a word boundary in the lower-cased tagline.
    """

    def __init__(self):
        self.category_keywords = {
            'Artificial Intelligence': ['ai', 'machine learning', 'deep learning', 'neural network'],
            'Mobile': ['mobile', 'android', 'ios', 'app store', 'smartphone'],
            'E-Commerce': ['ecommerce', 'e-commerce', 'shopping', 'online store'],
            'FinTech': ['finance', 'banking', 'payments', 'fintech', 'crypto', 'blockchain'],
            'Healthcare': ['health', 'medical', 'hospital', 'doctor', 'pharma'],
            'Social Media': ['social network', 'community', 'messaging', 'chat'],
            'Gaming': ['game', 'gaming', 'video game', 'esports'],
            'Cloud': ['cloud', 'saas', 'paas', 'infrastructure'],
            'EdTech': ['education', 'learning', 'students', 'teaching', 'school'],
            'Data Analytics': ['analytics', 'data science', 'big data', 'insights'],
        }

    @staticmethod
    def _keyword_in(keyword, text):
        """Return True when *keyword* occurs in *text* at a word boundary.

        The keyword must start a word, so 'ai' no longer matches inside
        "email" or "retail". Keywords of up to three characters must also
        end at a word boundary (so 'ai' does not match "airline"); longer
        keywords may be followed by more letters so that 'health' still
        matches "healthcare" and 'game' still matches "games".
        """
        pattern = r'\b' + re.escape(keyword)
        if len(keyword) <= 3:
            pattern += r'\b'
        return re.search(pattern, text) is not None

    def guess_category_from_tagline(self, tagline):
        """Return a comma-separated category guess for one tagline.

        No match yields 'Software, Advertising'; a single match gets
        'Software' appended; two or more matches are returned as they are.
        """
        tagline = str(tagline).lower()
        matched = [
            cat
            for cat, keywords in self.category_keywords.items()
            if any(self._keyword_in(keyword, tagline) for keyword in keywords)
        ]
        if not matched:
            matched = ['Software', 'Advertising']
        elif len(matched) == 1:
            matched.append('Software')
        return ', '.join(matched)

    def fit(self, data):
        """Nothing is learned; returns self so fit/transform chaining works."""
        return self

    def transform(self, data):
        """Return a copy of *data* with unknown market categories guessed.

        Missing taglines become '' and missing categories 'Unknown'. A
        category counts as unknown when, stripped and lower-cased, it is
        one of 'unknown', 'nan', 'none' or ''.
        """
        df = data.copy()
        df['Tagline'] = df['Tagline'].fillna('')
        df['Market Categories'] = df['Market Categories'].fillna('Unknown')

        normalised = df['Market Categories'].astype(str).str.strip().str.lower()
        needs_guess = normalised.isin(['unknown', 'nan', 'none', ''])

        # Built as a plain list so the assignment also works for empty frames
        # and for frames whose index has repeated labels.
        df['Market Categories'] = [
            self.guess_category_from_tagline(tagline) if unknown else category
            for category, tagline, unknown in zip(
                df['Market Categories'], df['Tagline'], needs_guess
            )
        ]
        return df

    def fit_transform(self, data):
        """Run fit() on *data*, then transform() on the same data."""
        return self.fit(data).transform(data)
class MarketCategoryGeneralizer:
    """Map fine-grained market categories onto broader groups.

    transform() adds a 'Generalized Market Categories' column built from
    the comma-separated 'Market Categories' column.
    """

    def __init__(self):
        # Fine-grained category -> broad group. Categories not listed here
        # map to 'Others & Miscellaneous'.
        self.category_mapping = {
            'Software': 'Technology & Software',
            'Advertising': 'Advertising & Marketing',
            'E-Commerce': 'E-Commerce & Online Services',
            'Mobile': 'Mobile & Consumer Electronics',
            'Games': 'Games & Entertainment',
            'Social Media': 'Social Networking & Communication',
            'Cloud': 'Technology & Software',
            'Finance': 'Finance & Payments',
            'Healthcare': 'Healthcare & Wellness',
            'Semiconductors': 'Technology Hardware',
            'Data Analytics': 'Analytics & Data Science',
            'Search': 'Advertising & Marketing',
            'Video': 'Games & Entertainment',
            'Networking': 'Telecom & Networks',
            'Messaging': 'Social Networking & Communication',
            'Education': 'Education & Learning',
            'News': 'Media & News',
            'Photo Sharing': 'Digital Media & Content',
            'Mobile Payments': 'Finance & Payments',
            'Robotics': 'Games & Entertainment',
            'Music': 'Games & Entertainment',
            'Photo Editing': 'Digital Media & Content',
            'Online Rental': 'E-Commerce & Online Services',
            'Location Based Services': 'Telecom & Networks',
            'Enterprise Software': 'Technology & Software',
            'Video Streaming': 'Games & Entertainment',
            'PaaS': 'Technology & Software',
            'SaaS': 'Technology & Software',
            'Health and Wellness': 'Healthcare & Wellness',
            'Web Hosting': 'Technology & Software',
            'Internet of Things': 'IoT (Internet of Things)',
            'Cloud Security': 'Technology & Software',
            'Virtual Currency': 'Finance & Payments',
            'Search Marketing': 'Advertising & Marketing',
            'Mobile Social': 'Social Networking & Communication',
            'Retail': 'Retail & Fashion',
            'Consulting': 'Others & Miscellaneous',
            'Aerospace': 'Others & Miscellaneous',
            'Food Delivery': 'Consumer Goods & Services',
            'Fashion': 'Retail & Fashion',
            'Wine And Spirits': 'Consumer Goods & Services',
            'Streaming': 'Games & Entertainment',
            'Task Management': 'Others & Miscellaneous',
            'Video Chat': 'Social Networking & Communication',
            'Personalization': 'Advertising & Marketing',
            'Shopping': 'E-Commerce & Online Services',
            'Local': 'E-Commerce & Online Services',
            'Fraud Detection': 'Advertising & Marketing',
            'Image Recognition': 'Technology Hardware',
            'Virtualization': 'Games & Entertainment',
            'Analytics': 'Analytics & Data Science',
            'Video on Demand': 'Games & Entertainment',
            'Marketing Automation': 'Advertising & Marketing',
            'Consumer Electronics': 'Mobile & Consumer Electronics',
            'Video Games': 'Games & Entertainment',
            'Public Relations': 'Advertising & Marketing',
        }

    def map_categories(self, row):
        """Return the broad groups for one comma-separated category string.

        Each group appears once, in the order of its first occurrence, so
        the same input always yields the same string.
        """
        generalized = [
            self.category_mapping.get(cat.strip(), 'Others & Miscellaneous')
            for cat in str(row).split(',')
        ]
        # dict.fromkeys removes repeats while keeping first-seen order; a set
        # would make the joined order vary between interpreter runs.
        return ', '.join(dict.fromkeys(generalized))

    def fit(self, X, y=None):
        """Nothing is learned; returns self so fit/transform chaining works."""
        return self

    def transform(self, X):
        """Return a copy of *X* with 'Generalized Market Categories' added.

        Missing 'Market Categories' values are treated as '' and therefore
        map to 'Others & Miscellaneous'.
        """
        df = X.copy()
        df['Generalized Market Categories'] = (
            df['Market Categories'].fillna('').apply(self.map_categories)
        )
        return df

    def fit_transform(self, X, y=None):
        """Run fit() on *X*, then transform() on the same data."""
        return self.fit(X, y).transform(X)
class CountryRegionFiller:
    """Fill missing HQ country / region values from the 'Description' text.

    The description is searched for the first known country name and the
    first known region name that occur as whole words.
    """

    def __init__(self):
        # Country names come from pycountry; regions are a fixed list.
        self.countries = [country.name for country in pycountry.countries]
        self.regions = ['California', 'New York', 'Texas', 'Basel', 'Utah', 'Île-de-France', 'Bavaria', 'Ontario', 'Switzerland', 'United States', 'France', 'Great Britain', 'Israel', 'Sweden', 'Canada', 'Germany', 'Japan', 'India', 'Denmark', 'China', 'Spain', 'Netherlands', 'Finland', 'Australia', 'Ireland', 'United Stats of AMerica', 'United Arab Emirates', 'Quebec']

    def find_place(self, text, place_list):
        """Return the first entry of *place_list* found in *text*, else None.

        Matching is case-sensitive and anchored on word boundaries.
        """
        haystack = str(text)
        hits = (
            place
            for place in place_list
            if re.search('\\b' + re.escape(place) + '\\b', haystack)
        )
        return next(hits, None)

    def fit(self, X, y=None):
        """Nothing is learned; returns self so fit/transform chaining works."""
        return self

    def transform(self, X):
        """Return a copy of *X* with missing country/region filled where possible.

        Only rows lacking a country or a region are inspected; rows without
        a description are left unchanged, as are values already present.
        """
        df = X.copy()
        country_missing = df['Country (HQ)'].isnull()
        region_missing = df['State / Region (HQ)'].isnull()

        for idx, row in df[country_missing | region_missing].iterrows():
            description = row['Description']
            if pd.isnull(description):
                continue

            if pd.isnull(row['Country (HQ)']):
                found_country = self.find_place(description, self.countries)
                if found_country:
                    df.at[idx, 'Country (HQ)'] = found_country

            if pd.isnull(row['State / Region (HQ)']):
                found_region = self.find_place(description, self.regions)
                if found_region:
                    df.at[idx, 'State / Region (HQ)'] = found_region

        return df

    def fit_transform(self, X, y=None):
        """Run fit() on *X*, then transform() on the same data."""
        return self.fit(X).transform(X)
class CategoricalFillerAndEncoder:
    """Fill missing categorical values with the mode and label-encode them.

    For every configured column, transform() fills missing values with the
    training mode, casts the column to str, and adds
    '<column>_LabelEncoded' holding the integer code learned in fit()
    (-1 for labels not seen in fit()).
    """

    # Fill value used when a training column has no non-missing value at all.
    EMPTY_COLUMN_FILL = 'Unknown'

    def __init__(self, columns):
        self.columns = columns
        self.modes = {}            # column -> fill value
        self.label_encoders = {}   # column -> fitted LabelEncoder
        self.label_maps = {}       # column -> {label: integer code}

    def fit(self, X, y=None):
        """Learn the fill value and the label codes for each column."""
        for col in self.columns:
            modes = X[col].mode()
            # mode() is empty when the whole column is missing; indexing it
            # would raise, so fall back to a placeholder label instead.
            mode_val = modes.iloc[0] if not modes.empty else self.EMPTY_COLUMN_FILL
            self.modes[col] = mode_val

            le = LabelEncoder()
            le.fit(X[col].fillna(mode_val).astype(str))
            self.label_encoders[col] = le
            self.label_maps[col] = {label: code for code, label in enumerate(le.classes_)}
        return self

    def transform(self, X):
        """Return a copy of *X* with filled columns and encoded companions."""
        df = X.copy()
        for col in self.columns:
            label_map = self.label_maps[col]
            df[col] = df[col].fillna(self.modes[col]).astype(str)
            # Labels that were not present in fit() are encoded as -1.
            df[col + '_LabelEncoded'] = df[col].map(lambda value: label_map.get(value, -1))
        return df

    def fit_transform(self, X, y=None):
        """Run fit() on *X*, then transform() on the same data."""
        return self.fit(X, y).transform(X)
class CustomEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode 'Status' and 'Terms' into a fixed set of columns.

    fit() records the distinct 'Status' and 'Terms' values. transform()
    returns ONLY the dummy columns for those values: the combined term
    'Cash, Stock' is folded into the separate 'Terms_Cash' and
    'Terms_Stock' flags, and expected columns that are absent from the
    input are added as 0.
    """

    def __init__(self):
        self.status_cols_ = None   # distinct 'Status' values seen in fit()
        self.terms_cols_ = None    # distinct 'Terms' values seen in fit()

    def fit(self, df, y=None):
        """Record the distinct 'Status' and 'Terms' values of *df*.

        *y* is accepted and ignored so the encoder can be fitted through
        APIs that pass a target.
        """
        self.status_cols_ = df['Status'].unique()
        self.terms_cols_ = df['Terms'].unique()
        return self

    def transform(self, df):
        """Return a new frame holding only the expected dummy columns."""
        df = df.copy()
        df = pd.get_dummies(df, columns=['Status'], drop_first=False)
        df = pd.get_dummies(df, columns=['Terms'], drop_first=False)

        combined = 'Terms_Cash, Stock'
        if combined in df.columns:
            paid_with_both = df[combined] == 1
            flag_dtype = df[combined].dtype
            for part in ('Terms_Cash', 'Terms_Stock'):
                if part in df.columns:
                    # OR the combined flag in while keeping the dummy dtype;
                    # writing the int 1 via .loc would mix types in a
                    # boolean dummy column.
                    df[part] = (df[part].astype(bool) | paid_with_both).astype(flag_dtype)
                else:
                    # Create the full column; assigning through .loc on a
                    # missing column would leave NaN in all other rows.
                    df[part] = paid_with_both.astype(flag_dtype)
            df = df.drop(columns=[combined])

        expected_cols = [f'Status_{s}' for s in self.status_cols_] + [
            f'Terms_{t}' for t in self.terms_cols_ if t != 'Cash, Stock'
        ]
        for col in expected_cols:
            if col not in df.columns:
                df[col] = 0
        return df[expected_cols]
# ======================== INFERENCE ========================
# Loads the fitted preprocessing artifacts, rebuilds the feature matrix for a
# test acquisitions file, predicts the deal-size class and writes
# 'predictions.csv'.
#
# NOTE(security): pickle.load can execute arbitrary code contained in the file.
# Only load artifacts that were produced by a trusted training run.
objects_to_load = [
    'mlb_acquiring.pkl', 'correlation_filter.pkl', 'category_reducer_acquiring.pkl',
    'Age_column_acquiring.pkl', 'ipo_transformer_acquiring.pkl', 'employee_cleaner_acquiring.pkl',
    'board_members_transformer.pkl', 'founders_transformer.pkl', 'tfidf_acquiring.pkl',
    'tagline_guesser_acquired.pkl', 'category_generalizer_acquired.pkl',
    'country_region_filler_acquired.pkl', 'categorical_encoder_acquired.pkl',
    'mlb_acquired.pkl', 'mode_acquisitions_imputer.pkl', 'final_scaler.pkl',
    'final_pca.pkl', 'custom_acquisitions_encoder.pkl', 'label_encoders.pkl',
    'selected_features.pkl'
]

# Keys are the file names without the extension,
# e.g. 'Age_column_acquiring.pkl' -> 'Age_column_acquiring'.
loaded_objects = {}
for obj in objects_to_load:
    with open(f'{obj}', 'rb') as f:
        loaded_objects[obj.split('.')[0]] = pickle.load(f)

# Load and preprocess acquiring company data.
# The lookups use the full artifact keys; the shortened names used before
# ('Age_column', 'ipo_transformer', 'tagline_guesser') are not keys of
# loaded_objects and raised KeyError.
acquiring_df = pd.read_csv('data/Acquiring Tech Companies.csv')
acquiring_df = loaded_objects['Age_column_acquiring'].transform(acquiring_df)
acquiring_df = loaded_objects['ipo_transformer_acquiring'].transform(acquiring_df)
# Continue with other transformers as in training, e.g., employee_cleaner, mlb_acquiring, etc.

# Similarly preprocess acquired_df
acquired_df = pd.read_csv('data/Acquired Tech Companies.csv')
acquired_df = loaded_objects['tagline_guesser_acquired'].transform(acquired_df)
# Continue with other transformers

# Load test acquisitions data
test_df = pd.read_csv('path_to_test_csv')  # User to provide path

# Fill missing values with the stored per-column values. The result is
# assigned back: fillna(inplace=True) on a column selection does not update
# the frame when pandas Copy-on-Write is active.
mode_values = loaded_objects['mode_acquisitions_imputer']
for col, mode_val in mode_values.items():
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(mode_val)

# Apply custom encoder for Status and Terms. The encoder returns only the
# dummy columns, so they are joined back onto the remaining columns; replacing
# test_df with the encoder output would drop every column used below.
status_terms_dummies = loaded_objects['custom_acquisitions_encoder'].transform(test_df)
test_df = pd.concat(
    [test_df.drop(columns=['Status', 'Terms']), status_terms_dummies],
    axis=1,
)

# Extract date features (unparseable dates become NaT, i.e. NaN features)
test_df['Deal_date'] = pd.to_datetime(test_df['Deal announced on'], dayfirst=True, errors='coerce')
test_df['Deal_day'] = test_df['Deal_date'].dt.day
test_df['Deal_month'] = test_df['Deal_date'].dt.month
test_df['Deal_dayofweek'] = test_df['Deal_date'].dt.dayofweek
test_df = test_df.drop(columns=['Deal announced on', 'Deal_date'])


def find_company_column(df):
    """Return the first column of *df* whose name contains 'company'.

    The match is case-insensitive.

    Raises:
        ValueError: if no column name contains 'company'.
    """
    for col in df.columns:
        if "company" in col.lower():
            return col
    raise ValueError("No company name column found.")


# Merge with preprocessed company data; names are compared stripped and
# lower-cased on both sides.
# NOTE(review): a left merge repeats a test row when the company table has
# several rows with the same name; confirm the company tables are unique.
company_col_acquired = find_company_column(acquired_df)
test_df['Acquired Company'] = test_df['Acquired Company'].str.strip().str.lower()
acquired_df[company_col_acquired] = acquired_df[company_col_acquired].str.strip().str.lower()
test_df = test_df.merge(acquired_df, how='left', left_on='Acquired Company', right_on=company_col_acquired, suffixes=('', '_Acquired'))
test_df = test_df.drop(columns=[company_col_acquired])

company_col_acquiring = find_company_column(acquiring_df)
test_df['Acquiring Company'] = test_df['Acquiring Company'].str.strip().str.lower()
acquiring_df[company_col_acquiring] = acquiring_df[company_col_acquiring].str.strip().str.lower()
test_df = test_df.merge(acquiring_df, how='left', left_on='Acquiring Company', right_on=company_col_acquiring, suffixes=('', '_Acquiring'))
test_df = test_df.drop(columns=[company_col_acquiring])

# Handle missing values post-merge
for col in test_df.columns:
    if col in mode_values:
        test_df[col] = test_df[col].fillna(mode_values[col])

# Encode categorical columns; labels unknown to an encoder become -1.
label_encoders = loaded_objects['label_encoders']
for col in test_df.columns:
    if col in label_encoders:
        le = label_encoders[col]
        # One dict lookup per value instead of one le.transform call per value;
        # the code of a label is its position in le.classes_.
        code_of = {label: code for code, label in enumerate(le.classes_)}
        test_df[col] = test_df[col].astype(str).map(lambda s, codes=code_of: codes.get(s, -1))

# Keep the true labels (when the file has them) before the feature selection
# below removes the column; otherwise the accuracy check can never run.
true_labels = test_df['Deal size class'].copy() if 'Deal size class' in test_df.columns else None

# Select features; the target is excluded whether or not the stored list
# contains it (list.remove raised ValueError when it was absent).
selected_features = [
    feature for feature in loaded_objects['selected_features']
    if feature != 'Deal size class'
]
test_df = test_df[selected_features].copy()

# Scale and apply PCA
scaler = loaded_objects['final_scaler']
pca = loaded_objects['final_pca']
test_scaled = scaler.transform(test_df)
test_scaled = pd.DataFrame(test_scaled, columns=test_df.columns)
test_pca = pca.transform(test_scaled)

# Load model and predict
with open('best_gb_model.pkl', 'rb') as f:
    model = pickle.load(f)
predictions = model.predict(test_pca)

# Output predictions
test_df['predicted_class'] = predictions
test_df.to_csv('predictions.csv', index=False)

# If true labels are provided, calculate accuracy
# NOTE(review): true_labels have passed through the label-encoding loop above
# when 'Deal size class' has a stored encoder; confirm this matches the
# encoding the model was trained on.
if true_labels is not None:
    from sklearn.metrics import accuracy_score
    accuracy = accuracy_score(true_labels, predictions)
    print(f"Accuracy: {accuracy:.3f}")
    print("Note: R2 and MSE are regression metrics; for classification, accuracy is provided.")

# Output of the original notebook run:
# FileNotFoundError: [Errno 2] No such file or directory: 'label_encoders.pkl'