# In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, make_scorer
import joblib  # Use 'joblib' for saving the best estimator
from statistics import stdev
from sklearn.model_selection import cross_val_score

# Read the feature matrix and the label table from disk.
X = pd.read_csv('PCA_R3_Noramlaized_ZScore.csv')
y = pd.read_csv('labels_train.csv')

# Work with the labels as a plain NumPy array.
# (assumes y holds one column per output — adjust if the label file differs)
y_train = y.values

# 'balanced' class weights are derived from every label value pooled across
# all columns and rows; note this happens before the train/test split below.
pooled_labels = y_train.flatten()
label_values = np.unique(pooled_labels)
class_weights = compute_class_weight('balanced', classes=label_values, y=pooled_labels)

# Map each distinct label value to its weight.
class_weight_dict = {label: weight for label, weight in zip(label_values, class_weights)}

# Split into training and testing sets with a fixed seed for reproducibility.
# NOTE(review): test_size=0.95 keeps only 5% of the rows for fitting —
# confirm this is intentional (e.g. to keep the SVC grid search fast).
X_train, X_test, y_train, y_test = train_test_split(
    X, y_train, test_size=0.95, random_state=42
)

# Base learners; both receive the precomputed label -> weight mapping to
# counter class imbalance.
svm_classifier = SVC(class_weight=class_weight_dict, probability=True, random_state=42)
decision_tree_classifier = DecisionTreeClassifier(class_weight=class_weight_dict, random_state=42)

# Soft voting averages the members' predicted class probabilities, which is
# why the SVC above is built with probability=True.
ensemble_members = [
    ('svm', svm_classifier),
    ('decision_tree', decision_tree_classifier),
]
voting_classifier = VotingClassifier(estimators=ensemble_members, voting='soft')

# Wrap the ensemble so it can be fitted against a multi-column label array.
multioutput_voting_classifier = MultiOutputClassifier(voting_classifier)

# Search space: only the depth of the decision tree inside the voting ensemble
# is tuned. The key walks MultiOutputClassifier ('estimator') ->
# VotingClassifier member ('decision_tree') -> parameter ('max_depth').
param_grid_decision_tree = {
    'estimator__decision_tree__max_depth': [3, 5, 7, 10],
}

# Micro-averaged F1 is the model-selection criterion.
micro_f1_scorer = make_scorer(f1_score, average='micro')

# 5-fold grid search over the wrapped ensemble. Parallelism is left at the
# library default; pass n_jobs=-1 to use all available processors.
grid_search_decision_tree = GridSearchCV(
    multioutput_voting_classifier,
    param_grid_decision_tree,
    scoring=micro_f1_scorer,
    cv=5,
)

# Run the search on the training split.
grid_search_decision_tree.fit(X_train, y_train)

# Keep the winning estimator from the grid search and persist it with joblib.
best_multioutput_classifier_decision_tree = grid_search_decision_tree.best_estimator_
joblib.dump(
    best_multioutput_classifier_decision_tree,
    'best_model_decision_tree_with_std.joblib',
)

# Predict the held-out test split with the tuned model.
best_predictions_decision_tree = best_multioutput_classifier_decision_tree.predict(X_test)

# Score the predictions twice: average=None gives one F1 value per class,
# average='micro' gives a single pooled F1 value.
best_f1_scores_decision_tree, best_average_f1_score_decision_tree = (
    f1_score(y_test, best_predictions_decision_tree, average=averaging)
    for averaging in (None, 'micro')
)

# Report the test-split scores.
print("Best Decision Tree F1 scores for each class:", best_f1_scores_decision_tree)
print("Best Decision Tree Average F1 score:", best_average_f1_score_decision_tree)

# Spread of the per-class F1 scores (sample standard deviation; needs at
# least two scores).
f1_scores_std = stdev(best_f1_scores_decision_tree)
print("Standard Deviation of F1 scores for each class:", f1_scores_std)

# Report the hyperparameters selected by the grid search.
print("Best Decision Tree Hyperparameters:", grid_search_decision_tree.best_params_)

# Optional export of the per-class F1 scores to a CSV file (disabled):
# results_df = pd.DataFrame({
#     'Class': range(1, len(best_f1_scores_decision_tree) + 1),
#     'F1 Score': best_f1_scores_decision_tree
# })
# results_df.to_csv('decision_tree_results.csv', index=False)

# Fold-to-fold variability of the tuned model: re-score it with 5-fold
# cross-validation on the training split.
# The metric is micro-averaged F1 (scoring='f1_micro'), not accuracy; the
# variable name is kept unchanged so later code that reads it still works.
accuracy_on_dt = cross_val_score(
    best_multioutput_classifier_decision_tree,
    X_train,
    y_train,
    cv=5,
    scoring='f1_micro',
)
print("Micro-averaged F1 per fold on the training split:", accuracy_on_dt)
print(f'The standard deviation of the micro-averaged F1 across the five folds using best_multioutput_classifier_decision_tree: {accuracy_on_dt.std():.3f}')
print(f'The mean micro-averaged F1 across all five folds using best_multioutput_classifier_decision_tree: {accuracy_on_dt.mean():.3f}')

# Export the predictions for the held-out test split (X_test), not the whole
# dataset. They were already computed for the F1 evaluation, so reuse them
# instead of running the (slow, probability-enabled) ensemble a second time.
predicted_labels = best_predictions_decision_tree

# One column per label, named after the columns of the label file.
predicted_labels_df = pd.DataFrame(data=predicted_labels, columns=y.columns)

# Save the predicted labels to a CSV file.
# NOTE(review): rows follow the order of X_test and are written without an
# index, so they cannot be mapped back to the original rows from the CSV
# alone — confirm that is acceptable for downstream use.
predicted_labels_df.to_csv('predicted_labels_Final.csv', index=False)

print("Predicted labels have been saved to predicted_labels_Final.csv.")

