# In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Preprocessing script: load 'result.csv', impute/scale the numeric columns,
# impute/one-hot encode the categorical columns, save the transformed table
# (features + targets) to 'preprocessed1.csv', and produce a train/test split.

# Load the dataset
data = pd.read_csv(r'result.csv')

# Columns routed to each preprocessing branch, plus the two target columns.
numeric_features = ['n_neighbor', 'p', 'Leaf size']
categorical_features = ['weights', 'Algorithm']
target_columns = ['Accuracy', 'F1 Score']

# Numeric branch: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode (categories unseen during fit are ignored at transform).
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine both branches. Columns not listed here are dropped (the default
# `remainder='drop'`). `sparse_threshold=0` forces a dense array: the one-hot
# branch emits a sparse matrix, and with the default threshold the combined
# output could come back sparse, which would break the DataFrame construction
# below.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0)

# Separate the features from the two target columns.
X = data.drop(target_columns, axis=1)
y = data[target_columns]

# Preprocess the features.
# NOTE(review): the preprocessor is fitted on the full dataset, before the
# train/test split, so imputation/scaling statistics include the test rows.
# Kept as-is so the saved CSV is unchanged; consider fitting on the training
# rows only if the test set must stay fully unseen.
X_preprocessed = preprocessor.fit_transform(X)

# Output column order follows the transformer order: numeric columns first,
# then the one-hot encoded columns.
new_categorical_features = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)
all_features = numeric_features + list(new_categorical_features)

# Reuse X's index so the target assignment below aligns row-for-row even if
# `data` does not carry a default RangeIndex.
X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=all_features, index=X.index)

# Append the target columns; this combined frame is what gets saved to disk.
X_preprocessed_df[target_columns] = y

# Split into training and testing sets. Only the feature columns are passed
# as X: the combined frame also holds the targets, and feeding it in whole
# would leak 'Accuracy' and 'F1 Score' into X_train / X_test.
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed_df[all_features], y, test_size=0.2, random_state=42)

# Save the entire dataset (features + targets) to a single CSV file
X_preprocessed_df.to_csv('preprocessed1.csv', index=False)

# X_train and X_test now hold only preprocessed features and are ready to use
# with a KNN model; the full preprocessed table is saved for future use.

# Cell output: (2224, 11)