In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
import numpy as np

# Read the CSV file into a DataFrame
data_path = ''  # Update this path to your actual data file
data = pd.read_csv(data_path)

# Treat +/-inf as missing, then discard every row that has a missing value
data = data.replace([np.inf, -np.inf], np.nan).dropna()

# Separate the target (expected in a column named 'label') from the features
y = data['label']
X = data.drop(columns='label')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate var_smoothing values to evaluate
var_smoothing_params = [1e-09, 1e-08, 1e-07, 1e-06]

# PCA cannot extract more components than min(n_samples, n_features), so cap
# the requested 25 components at what the training data actually supports.
n_components = min(25, *X_train.shape)

# Track the best result so far. Starting from -inf guarantees the first
# candidate is always recorded, even if its accuracy is 0.0. The fitted
# pipeline itself is kept so the final report describes the winning model
# rather than whichever model happened to be fitted last.
best_score = float("-inf")
best_param = None
best_pipeline = None

# NOTE: candidates are compared by their accuracy on the test set, so the
# reported best score is an optimistic estimate of generalization; use
# cross-validation on the training data if an unbiased estimate is needed.
for var_smoothing in var_smoothing_params:
    # Scale -> PCA -> Gaussian Naive Bayes with the current var_smoothing
    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Standardize features
        ('pca', PCA(n_components=n_components)),  # Reduce dimensionality
        ('nb', GaussianNB(var_smoothing=var_smoothing)),
    ])

    # Fit on the training data and score on the held-out test data
    pipeline.fit(X_train, y_train)
    accuracy = accuracy_score(y_test, pipeline.predict(X_test))

    # Strict '>' keeps the earliest candidate when accuracies tie
    if accuracy > best_score:
        best_score = accuracy
        best_param = var_smoothing
        best_pipeline = pipeline

print("Best Score:", best_score)
print("Best var_smoothing Parameter:", best_param)
print("Classification Report for Best Parameter:")
# Report on the best-scoring pipeline, not the last one fitted in the loop
print(classification_report(y_test, best_pipeline.predict(X_test)))
