# In [1]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA

def preprocess_data(file_path):
    """Load an Excel sheet and prepare it for modelling.

    The sheet is processed as follows:

    1. Column names are converted to strings.
    2. Missing values in float64/int64 feature columns are replaced by the
       column mean (or 0.0 when the whole column is missing).
    3. Missing values in every other column are replaced by 'Unknown'.
    4. The 'Label' column is replaced by integer codes from LabelEncoder.
    5. The float64/int64 feature columns are standardized with
       StandardScaler.

    The 'Label' column is never treated as a feature, so it is neither
    mean-imputed nor standardized even when it holds numbers.

    Args:
        file_path: Path of the Excel workbook to read.

    Returns:
        The preprocessed DataFrame, or None when file_path does not exist
        (an error message is printed in that case).

    Raises:
        KeyError: If the sheet has no 'Label' column.
    """
    # Only the read itself can report a missing file, so keep the try
    # body limited to it.
    try:
        df = pd.read_excel(file_path)
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
        return None

    # Ensure all column names are strings
    df.columns = df.columns.astype(str)

    if 'Label' not in df.columns:
        raise KeyError(f"'Label' column not found in {file_path}")

    # Numeric feature columns; the target is excluded so that it is not
    # imputed with a mean or rescaled after encoding.
    numeric_cols = (
        df.select_dtypes(include=['float64', 'int64'])
        .columns
        .drop('Label', errors='ignore')
    )

    # Handle missing values for numeric columns only. A column with no
    # observed value has a NaN mean; fall back to 0.0 so no NaN survives.
    if len(numeric_cols) > 0:
        col_means = df[numeric_cols].mean().fillna(0.0)
        df[numeric_cols] = df[numeric_cols].fillna(col_means)

    # Fill missing values everywhere else (this always includes 'Label').
    # Numeric feature columns are left out so they never receive strings.
    other_cols = [col for col in df.columns if col not in numeric_cols]
    df[other_cols] = df[other_cols].fillna('Unknown')

    # Encode the target. Object columns may mix strings with numbers
    # (e.g. after the 'Unknown' fill), which LabelEncoder cannot sort,
    # so compare them as text.
    labels = df['Label']
    if labels.dtype == object:
        labels = labels.astype(str)
    df['Label'] = LabelEncoder().fit_transform(labels)

    # Normalize numerical features; StandardScaler rejects input with no
    # columns or no rows, so skip it in those cases.
    if len(numeric_cols) > 0 and len(df) > 0:
        df[numeric_cols] = StandardScaler().fit_transform(df[numeric_cols])

    return df

# Define file paths
data_dir = 'data'
combined_file_names = ['combined_917.xlsx', 'combined_1725.xlsx', 'combined_all.xlsx']

# Preprocess each combined dataset, save it, then save its PCA projection
for combined_file_name in combined_file_names:
    combined_file_path = os.path.join(data_dir, combined_file_name)
    preprocessed_df = preprocess_data(combined_file_path)

    # preprocess_data returns None when the input file is missing
    if preprocessed_df is None:
        continue

    # Save preprocessed dataset
    preprocessed_file_path = os.path.join(data_dir, f"preprocessed_{combined_file_name}")
    preprocessed_df.to_excel(preprocessed_file_path, index=False)

    # PCA needs a purely numeric matrix. Text columns other than 'Label'
    # are not encoded by preprocess_data, so leave them out of the PCA
    # input instead of letting fit_transform fail on them.
    features = preprocessed_df.drop('Label', axis=1).select_dtypes(include=['number', 'bool'])
    if features.empty:
        print(f"Skipping PCA for {combined_file_name}: no numeric feature data.")
        continue

    # Perform PCA on the preprocessed data, keeping as many components as
    # the data allows: min(number of rows, number of feature columns)
    pca = PCA(n_components=min(features.shape))
    pca_result = pca.fit_transform(features)

    # Create a DataFrame for PCA results
    pca_df = pd.DataFrame(pca_result, columns=[f'PC{i+1}' for i in range(pca_result.shape[1])])
    # Attach labels by position so the result does not depend on the
    # index of preprocessed_df matching the fresh index of pca_df
    pca_df['Label'] = preprocessed_df['Label'].to_numpy()

    # Save PCA results to a new Excel file
    pca_file_path = os.path.join(data_dir, f"pca_{combined_file_name}")
    pca_df.to_excel(pca_file_path, index=False)