In [None]:
import random

# Stub evaluation function that returns a random value
def evaluate(features):
    """Stub scorer: ignore ``features`` and return a random accuracy.

    The result is a percentage drawn uniformly from the range 25 to 75.
    """
    fraction = random.uniform(0.25, 0.75)
    return fraction * 100  # returns a percentage value

# Forward Selection with detailed trace
def forward_selection(all_features):
    """Run greedy forward selection and print a trace of every step.

    Starts from an empty feature set. On each pass every feature that is not
    yet selected is scored by calling ``evaluate`` on the current set plus
    that feature. The highest-scoring candidate is added only when its score
    is strictly greater than the best score seen so far; the search stops on
    the first pass that produces no such candidate.

    Args:
        all_features: Sequence of feature identifiers to choose from.

    Returns:
        None. The trace and the final subset are printed.
    """
    # Algorithm starts with no selected features & best score of 0
    selected_features = []
    best_score = 0
    improvement = True

    print("\nYou have selected Forward Selection\n")

    # Initial accuracy with no features. It is only reported; the search
    # baseline stays at 0.
    initial_score = evaluate(selected_features)
    print(f"Using no features and “random” evaluation, I get an accuracy of {initial_score:.1f}%\n")

    while improvement:
        print("Beginning search.\n")
        improvement = False
        best_candidate = None

        # Score every unselected feature up front so the trace below prints
        # as one uninterrupted group.
        candidates = [
            (feature, evaluate(selected_features + [feature]))
            for feature in all_features
            if feature not in selected_features
        ]

        for feature, score in candidates:
            print(f"Using feature(s) {selected_features + [feature]} accuracy is {score:.1f}%")
            if score > best_score:
                best_score = score
                best_candidate = feature
                improvement = True

        # Compare against None explicitly: a falsy feature id such as 0 or ""
        # is a legitimate winner and must still be added.
        if best_candidate is not None:
            selected_features.append(best_candidate)
            print(f"\nFeature set {selected_features} was best, accuracy is {best_score:.1f}%\n")

    print(f"Finished search!! The best feature subset is {selected_features}, which has an accuracy of {best_score:.1f}%")

# Backward Elimination with detailed trace

# Feature selection technique that iteratively removes the least significant feature to improve the model
def backward_elimination(all_features):
    """Run greedy backward elimination and print a trace of every step.

    Starts from the full feature set, scored once with ``evaluate``. On each
    pass every remaining feature is tentatively removed and the reduced set
    is scored. The removal with the highest score is applied only when that
    score is strictly greater than the best score seen so far; the search
    stops on the first pass that produces no such removal.

    Args:
        all_features: Iterable of feature identifiers to start from.

    Returns:
        None. The trace and the final subset are printed.
    """
    # Initialize with all features and evaluate accuracy
    selected_features = list(all_features)
    best_score = evaluate(selected_features)

    print("\nYou have selected Backward Elimination\n")
    print(f"Using all features and “random” evaluation, I get an accuracy of {best_score:.1f}%\n")

    improvement = True
    while improvement:
        print("Beginning search.\n")
        # Reset the improvement flag and the feature chosen for removal
        improvement = False
        worst_candidate = None

        # Score each reduced set first so the trace below prints as one
        # uninterrupted group.
        candidates = []
        for feature in selected_features:
            remaining = [f for f in selected_features if f != feature]
            candidates.append((feature, remaining, evaluate(remaining)))

        for feature, remaining, score in candidates:
            print(f"Using feature(s) {remaining} accuracy is {score:.1f}%")
            # A removal that beats the best score so far becomes the new pick
            if score > best_score:
                best_score = score
                worst_candidate = feature
                improvement = True

        # Compare against None explicitly: a falsy feature id such as 0 or ""
        # is a legitimate pick and must still be removed.
        if worst_candidate is not None:
            selected_features.remove(worst_candidate)
            print(f"\nFeature set {selected_features} was best, accuracy is {best_score:.1f}%\n")

    print(f"The best feature subset is {selected_features}, which has an accuracy of {best_score:.1f}%")

# Main program to ask user for input and run the chosen algorithm
def main():
    """Show the menu, read the user's choice and run the chosen search.

    The feature set is fixed to four features named '1' through '4'.
    """
    all_features = ['1', '2', '3', '4']

    # Banner and menu; each entry is printed on its own line.
    banner = (
        "Welcome to Our CS170 Group's Feature Selection Algorithm.",
        f"Please enter total number of features: {len(all_features)}",
        "Type the number of the algorithm you want to run.",
        "\n1. Forward Selection\n2. Backward Elimination\n",
    )
    for line in banner:
        print(line)

    choice = input("Enter your choice (1 for Forward Selection, 2 for Backward Elimination): ").strip()

    # Map each menu key to the search routine it starts.
    searches = {'1': forward_selection, '2': backward_elimination}
    search = searches.get(choice)
    if search is None:
        print("Invalid choice. Please enter 1 or 2.")
        return
    search(all_features)


if __name__ == "__main__":
    main()


Welcome to Our CS170 Group's Feature Selection Algorithm.
Please enter total number of features: 4
Type the number of the algorithm you want to run.

1. Forward Selection
2. Backward Elimination

Enter your choice (1 for Forward Selection, 2 for Backward Elimination): 1

You have selected Forward Selection

Using no features and “random” evaluation, I get an accuracy of 54.8%

Beginning search.

Using feature(s) ['1'] accuracy is 44.4%
Using feature(s) ['2'] accuracy is 55.3%
Using feature(s) ['3'] accuracy is 55.9%
Using feature(s) ['4'] accuracy is 31.0%

Feature set ['3'] was best, accuracy is 55.9%

Beginning search.

Using feature(s) ['3', '1'] accuracy is 64.0%
Using feature(s) ['3', '2'] accuracy is 74.0%
Using feature(s) ['3', '4'] accuracy is 32.8%

Feature set ['3', '2'] was best, accuracy is 74.0%

Beginning search.

Using feature(s) ['3', '2', '1'] accuracy is 71.7%
Using feature(s) ['3', '2', '4'] accuracy is 64.9%
Finished search!! The best feature subset is ['3', '2'], w

In [None]:
import numpy as np
from scipy.spatial import distance

class NNClassifier:
    """1-nearest-neighbour classifier using Euclidean distance.

    ``train`` only stores the data it is given, ``test`` returns the label of
    the closest stored instance, and ``validate`` reports leave-one-out
    accuracy for a chosen subset of feature columns.
    """

    def __init__(self):
        # Both stay None until train() is called; test() checks for that.
        self.training_data = None
        self.training_labels = None

    def train(self, training_data, training_labels):
        """Store the training data and labels."""
        self.training_data = training_data
        self.training_labels = training_labels

    def test(self, test_instance):
        """Return the class label of the nearest training instance.

        Args:
            test_instance: 1-D sequence of feature values with the same
                number of entries as each stored training instance.

        Raises:
            ValueError: If the classifier has not been trained, or was
                trained on zero instances.
        """
        if self.training_data is None or self.training_labels is None:
            raise ValueError("The classifier has not been trained yet.")
        if len(self.training_data) == 0:
            raise ValueError("The classifier was trained on an empty dataset.")

        # Calculate Euclidean distances from the test instance to all training instances
        distances = distance.cdist([test_instance], self.training_data, 'euclidean')
        nearest_index = np.argmin(distances)
        return self.training_labels[nearest_index]

    def validate(self, feature_subset, data, labels):
        """Return leave-one-out accuracy (in percent) for ``feature_subset``.

        Each instance is classified by a classifier trained on all the other
        instances, restricted to the chosen columns. As a side effect the
        classifier is left trained on the last fold.

        Args:
            feature_subset: Column indices of ``data`` to use.
            data: 2-D array-like of feature values, one row per instance.
            labels: 1-D array-like of class labels, one per row of ``data``.

        Returns:
            Accuracy as a float percentage. 0.0 is returned when ``data`` or
            ``labels`` is None, and when there are fewer than two instances
            (no neighbour exists to predict from).

        Raises:
            ValueError: If ``labels`` and ``data`` disagree on the number of
                instances.
        """
        if data is None or labels is None:
            print("Unable to validate: Data or labels are missing.")
            return 0.0

        # Accept plain lists as well as arrays.
        data = np.asarray(data)
        labels = np.asarray(labels)

        selected_data = data[:, feature_subset]
        num_instances = selected_data.shape[0]

        if len(labels) != num_instances:
            raise ValueError(
                f"Got {len(labels)} labels for {num_instances} instances."
            )

        # Leave-one-out needs at least one other instance to train on; this
        # also avoids dividing by zero for an empty dataset.
        if num_instances < 2:
            return 0.0

        correct_predictions = 0
        for i in range(num_instances):
            # Leave-one-out validation: Train on all instances except the i-th one and test on the i-th one
            train_data = np.delete(selected_data, i, axis=0)
            train_labels = np.delete(labels, i, axis=0)

            self.train(train_data, train_labels)
            if self.test(selected_data[i]) == labels[i]:
                correct_predictions += 1

        return (correct_predictions / num_instances) * 100

def load_dataset(file_path):
    """Load a whitespace-delimited numeric dataset from ``file_path``.

    The first column of every row is the class label and the remaining
    columns are feature values.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(features, labels)`` pair: ``features`` is a 2-D float array and
        ``labels`` a 1-D int array. On any failure a message is printed and
        ``(None, None)`` is returned instead.
    """
    # Only the read itself sits inside the try, so errors in the checks
    # below are never mistaken for I/O problems.
    try:
        # Load the entire dataset
        data = np.genfromtxt(file_path)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None, None
    except Exception as e:
        print(f"Error loading dataset from {file_path}: {e}")
        return None, None

    # genfromtxt returns a 0-D or 1-D array for empty, single-row or
    # single-column files; none of those can be split into label and features.
    if data.ndim != 2 or data.shape[1] < 2:
        print(
            f"Error loading dataset from {file_path}: "
            "expected at least two rows, each with a label and one or more features"
        )
        return None, None

    # genfromtxt turns missing or non-numeric entries into NaN, and casting
    # NaN to int would produce meaningless labels.
    if np.isnan(data[:, 0]).any():
        print(
            f"Error loading dataset from {file_path}: "
            "the class label column contains missing or non-numeric values"
        )
        return None, None

    # Extract labels and features separately
    labels = data[:, 0].astype(int)
    features = data[:, 1:]

    return features, labels

# Example usage
if __name__ == "__main__":
    # One classifier instance is reused for both runs; validate() retrains it
    # on every leave-one-out fold.
    classifier = NNClassifier()

    # --- Small test dataset: features {3, 5, 7}, zero-indexed below ---
    small_dataset_path = '/content/small-test-dataset.txt'
    feature_subset = [2, 4, 6]
    data, labels = load_dataset(small_dataset_path)
    accuracy = classifier.validate(feature_subset, data, labels)
    print(f"Accuracy of NN classifier with feature subset {feature_subset} on small dataset: {accuracy:.2f}%")

    # --- Large test dataset: features {1, 15, 27}, zero-indexed below ---
    large_dataset_path = '/content/large-test-dataset.txt'
    feature_subset = [0, 14, 26]
    data, labels = load_dataset(large_dataset_path)
    accuracy = classifier.validate(feature_subset, data, labels)
    print(f"Accuracy of NN classifier with feature subset {feature_subset} on large dataset: {accuracy:.2f}%")

Accuracy of NN classifier with feature subset [2, 4, 6] on small dataset: 89.00%
Accuracy of NN classifier with feature subset [0, 14, 26] on large dataset: 94.90%


In [None]:
import numpy as np
from scipy.spatial import distance

class NNClassifier:
    """1-nearest-neighbour classifier using Euclidean distance.

    ``train`` only stores the data it is given, ``test`` returns the label of
    the closest stored instance, and ``validate`` reports leave-one-out
    accuracy for a chosen subset of feature columns.
    """

    def __init__(self):
        # Both stay None until train() is called; test() checks for that.
        self.training_data = None
        self.training_labels = None

    def train(self, training_data, training_labels):
        """Store the training data and labels."""
        self.training_data = training_data
        self.training_labels = training_labels

    def test(self, test_instance):
        """Return the class label of the nearest training instance.

        Args:
            test_instance: 1-D sequence of feature values with the same
                number of entries as each stored training instance.

        Raises:
            ValueError: If the classifier has not been trained, or was
                trained on zero instances.
        """
        if self.training_data is None or self.training_labels is None:
            raise ValueError("The classifier has not been trained yet.")
        if len(self.training_data) == 0:
            raise ValueError("The classifier was trained on an empty dataset.")

        # Calculate Euclidean distances from the test instance to all training instances
        distances = distance.cdist([test_instance], self.training_data, 'euclidean')
        nearest_index = np.argmin(distances)
        return self.training_labels[nearest_index]

    def validate(self, feature_subset, data, labels):
        """Return leave-one-out accuracy (in percent) for ``feature_subset``.

        Each instance is classified by a classifier trained on all the other
        instances, restricted to the chosen columns. As a side effect the
        classifier is left trained on the last fold.

        Args:
            feature_subset: Column indices of ``data`` to use.
            data: 2-D array-like of feature values, one row per instance.
            labels: 1-D array-like of class labels, one per row of ``data``.

        Returns:
            Accuracy as a float percentage. 0.0 is returned when ``data`` or
            ``labels`` is None, and when there are fewer than two instances
            (no neighbour exists to predict from).

        Raises:
            ValueError: If ``labels`` and ``data`` disagree on the number of
                instances.
        """
        if data is None or labels is None:
            print("Unable to validate: Data or labels are missing.")
            return 0.0

        # Accept plain lists as well as arrays.
        data = np.asarray(data)
        labels = np.asarray(labels)

        selected_data = data[:, feature_subset]
        num_instances = selected_data.shape[0]

        if len(labels) != num_instances:
            raise ValueError(
                f"Got {len(labels)} labels for {num_instances} instances."
            )

        # Leave-one-out needs at least one other instance to train on; this
        # also avoids dividing by zero for an empty dataset.
        if num_instances < 2:
            return 0.0

        correct_predictions = 0
        for i in range(num_instances):
            # Leave-one-out validation: Train on all instances except the i-th one and test on the i-th one
            train_data = np.delete(selected_data, i, axis=0)
            train_labels = np.delete(labels, i, axis=0)

            self.train(train_data, train_labels)
            if self.test(selected_data[i]) == labels[i]:
                correct_predictions += 1

        return (correct_predictions / num_instances) * 100

def load_dataset(file_path):
    """Load a whitespace-delimited numeric dataset from ``file_path``.

    The first column of every row is the class label and the remaining
    columns are feature values.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(features, labels)`` pair: ``features`` is a 2-D float array and
        ``labels`` a 1-D int array. On any failure a message is printed and
        ``(None, None)`` is returned instead.
    """
    # Only the read itself sits inside the try, so errors in the checks
    # below are never mistaken for I/O problems.
    try:
        # Load the entire dataset
        data = np.genfromtxt(file_path)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None, None
    except Exception as e:
        print(f"Error loading dataset from {file_path}: {e}")
        return None, None

    # genfromtxt returns a 0-D or 1-D array for empty, single-row or
    # single-column files; none of those can be split into label and features.
    if data.ndim != 2 or data.shape[1] < 2:
        print(
            f"Error loading dataset from {file_path}: "
            "expected at least two rows, each with a label and one or more features"
        )
        return None, None

    # genfromtxt turns missing or non-numeric entries into NaN, and casting
    # NaN to int would produce meaningless labels.
    if np.isnan(data[:, 0]).any():
        print(
            f"Error loading dataset from {file_path}: "
            "the class label column contains missing or non-numeric values"
        )
        return None, None

    # Extract labels and features separately
    labels = data[:, 0].astype(int)
    features = data[:, 1:]

    return features, labels

# Updated evaluation function using NNClassifier
def evaluate(feature_subset, features, labels):
    """Return the leave-one-out accuracy (in percent) of ``feature_subset``.

    Args:
        feature_subset: Column indices of ``features`` to score.
        features: Feature matrix passed through to ``NNClassifier.validate``.
        labels: Class labels passed through to ``NNClassifier.validate``.
    """
    # validate() trains the classifier itself on every leave-one-out fold,
    # so no separate train() call is needed here.
    return NNClassifier().validate(feature_subset, features, labels)


# Forward Selection with detailed trace
def forward_selection(all_features, features, labels):
    """Run greedy forward selection and print a trace of every step.

    Starts from an empty feature set. On each pass every feature that is not
    yet selected is scored by calling ``evaluate`` on the current set plus
    that feature. The highest-scoring candidate is added only when its score
    is strictly greater than the best score seen so far; the search stops on
    the first pass that produces no such candidate.

    Args:
        all_features: Sequence of feature identifiers (column indices) to
            choose from.
        features: Feature matrix forwarded to ``evaluate``.
        labels: Class labels forwarded to ``evaluate``.

    Returns:
        None. The trace and the final subset are printed.
    """
    selected_features = []
    best_score = 0
    improvement = True

    print("\nYou have selected Forward Selection\n")

    # Initial accuracy with no features. It is only reported; the search
    # baseline stays at 0.
    initial_score = evaluate([], features, labels)
    print(f"Using no features and evaluation, I get an accuracy of {initial_score:.1f}%\n")

    while improvement:
        print("Beginning search.\n")
        improvement = False
        best_candidate = None

        # Score every unselected feature up front so the trace below prints
        # as one uninterrupted group.
        candidates = [
            (feature, evaluate(selected_features + [feature], features, labels))
            for feature in all_features
            if feature not in selected_features
        ]

        for feature, score in candidates:
            print(f"Using feature(s) {selected_features + [feature]} accuracy is {score:.1f}%")
            if score > best_score:
                best_score = score
                best_candidate = feature
                improvement = True

        # Compare against None explicitly: column index 0 is falsy, and a
        # truthiness test would skip it while best_score had already been
        # raised to its score.
        if best_candidate is not None:
            selected_features.append(best_candidate)
            print(f"\nFeature set {selected_features} was best, accuracy is {best_score:.1f}%\n")

    print(f"Finished search!! The best feature subset is {selected_features}, which has an accuracy of {best_score:.1f}%")

# Backward Elimination with detailed trace
def backward_elimination(all_features, features, labels):
    """Run greedy backward elimination and print a trace of every step.

    Starts from the full feature set, scored once with ``evaluate``. On each
    pass every remaining feature is tentatively removed and the reduced set
    is scored. The removal with the highest score is applied only when that
    score is strictly greater than the best score seen so far; the search
    stops on the first pass that produces no such removal.

    Args:
        all_features: Iterable of feature identifiers (column indices) to
            start from.
        features: Feature matrix forwarded to ``evaluate``.
        labels: Class labels forwarded to ``evaluate``.

    Returns:
        None. The trace and the final subset are printed.
    """
    selected_features = list(all_features)
    best_score = evaluate(selected_features, features, labels)

    print("\nYou have selected Backward Elimination\n")
    print(f"Using all features and evaluation, I get an accuracy of {best_score:.1f}%\n")

    improvement = True
    while improvement:
        print("Beginning search.\n")
        improvement = False
        worst_candidate = None

        # Score each reduced set first so the trace below prints as one
        # uninterrupted group.
        candidates = []
        for feature in selected_features:
            remaining = [f for f in selected_features if f != feature]
            candidates.append((feature, remaining, evaluate(remaining, features, labels)))

        for feature, remaining, score in candidates:
            print(f"Using feature(s) {remaining} accuracy is {score:.1f}%")
            if score > best_score:
                best_score = score
                worst_candidate = feature
                improvement = True

        # Compare against None explicitly: column index 0 is falsy, and a
        # truthiness test would never remove it while best_score had already
        # been raised to the score obtained without it.
        if worst_candidate is not None:
            selected_features.remove(worst_candidate)
            print(f"\nFeature set {selected_features} was best, accuracy is {best_score:.1f}%\n")

    print(f"The best feature subset is {selected_features}, which has an accuracy of {best_score:.1f}%")

# Main program to ask user for input and run the chosen algorithm
def main():
    """Interactively pick a dataset and a search algorithm, then run it.

    Only the dataset the user selects is loaded, so the file for the other
    dataset does not need to exist or be readable.

    Returns:
        None. Returns early after an invalid menu choice or a failed load.
    """
    dataset_paths = {
        '1': '/content/CS170_Spring_2024_Small_data__63.txt',  # Replace with your small dataset path
        '2': '/content/CS170_Spring_2024_Large_data__63.txt',  # Replace with your large dataset path
    }

    print("Welcome to Our CS170 Group's Feature Selection Algorithm.")
    print("Please select the dataset you want to use:")
    print("1. Small Dataset")
    print("2. Large Dataset")

    dataset_choice = input("Enter your choice (1 for Small Dataset, 2 for Large Dataset): ").strip()

    if dataset_choice not in dataset_paths:
        print("Invalid choice. Please enter 1 or 2.")
        return

    features, labels = load_dataset(dataset_paths[dataset_choice])
    if features is None or labels is None:
        # load_dataset has already printed the reason for the failure.
        return

    # Features are identified by their zero-based column index.
    all_features = list(range(features.shape[1]))

    print(f"Please enter total number of features: {len(all_features)}")
    print("Type the number of the algorithm you want to run.")
    print("\n1. Forward Selection\n2. Backward Elimination\n")

    choice = input("Enter your choice (1 for Forward Selection, 2 for Backward Elimination): ").strip()

    if choice == '1':
        forward_selection(all_features, features, labels)
    elif choice == '2':
        backward_elimination(all_features, features, labels)
    else:
        print("Invalid choice. Please enter 1 or 2.")


if __name__ == "__main__":
    main()


Welcome to Our CS170 Group's Feature Selection Algorithm.
Please select the dataset you want to use:
1. Small Dataset
2. Large Dataset
Enter your choice (1 for Small Dataset, 2 for Large Dataset): 1
Please enter total number of features: 10
Type the number of the algorithm you want to run.

1. Forward Selection
2. Backward Elimination

