In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [5]:
def preprocess_data(data):
    """Split off the label column and min-max scale the remaining columns.

    Args:
        data: pandas DataFrame whose last column holds the class label
            (the species) and whose other columns are the features.
            Assumes the feature columns are numeric -- TODO confirm for
            datasets that carry an id or text column.

    Returns:
        tuple: ``(data_normalized, species)`` where ``data_normalized`` is a
        DataFrame with every feature column rescaled to the range [0, 1] and
        ``species`` is the untouched last column as a Series.
    """
    # The last column is the label; everything before it is a feature.
    species = data.iloc[:, -1]
    features = data.iloc[:, :-1]

    column_min = features.min()
    value_range = features.max() - column_min

    # A constant column has a zero range, which would turn the whole column
    # into NaN (0 / 0). Dividing by 1 instead maps such a column to all zeros.
    value_range = value_range.replace(0, 1)

    data_normalized = (features - column_min) / value_range
    return data_normalized, species


In [6]:
def calculate_distance(x1, x2):
    """Return the Euclidean (L2) distance between the points ``x1`` and ``x2``."""
    difference = x1 - x2
    squared_total = np.sum(np.square(difference))
    return np.sqrt(squared_total)


In [7]:
def initialize_centroids(data, k, random_state=None):
    """Pick ``k`` distinct rows of ``data`` as the starting centroids.

    Args:
        data: array-like of points, one row per point. It is converted with
            ``np.asarray`` so a DataFrame or a list of rows is indexed by
            row position rather than by column label.
        k: number of centroids to draw (without replacement).
        random_state: optional seed or ``numpy.random.Generator`` for a
            reproducible draw. When ``None`` the global ``np.random`` state
            is used, so an earlier ``np.random.seed(...)`` call still applies.

    Returns:
        numpy.ndarray: the ``k`` selected rows (a copy, not a view).

    Raises:
        ValueError: raised by NumPy when ``k`` exceeds the number of rows.
    """
    points = np.asarray(data)

    if random_state is None:
        indices = np.random.choice(len(points), k, replace=False)
    else:
        rng = np.random.default_rng(random_state)
        indices = rng.choice(len(points), k, replace=False)

    return points[indices]

In [8]:
def assign_clusters(data, centroids):
    """Label every data point with the index of its nearest centroid.

    Args:
        data: array-like of points, one entry per point along the first axis.
        centroids: array-like of centroids, one entry per centroid along the
            first axis, with the same per-point shape as ``data``.

    Returns:
        numpy.ndarray: float array of length ``len(data)`` holding the index
        of the closest centroid (Euclidean distance) for each point. Ties go
        to the lowest centroid index. The float dtype is kept so existing
        callers see the same kind of array as before.

    Raises:
        ValueError: if ``centroids`` is empty while ``data`` is not.
    """
    points = np.asarray(data)
    centers = np.asarray(centroids)

    # Broadcast to one difference per (point, centroid) pair instead of
    # looping over every pair in Python.
    differences = points[:, np.newaxis] - centers[np.newaxis]

    # Sum over the feature axes only (everything after the point and centroid
    # axes); for scalar points there is nothing left to reduce.
    feature_axes = tuple(range(2, differences.ndim))
    squared_distances = np.sum(differences ** 2, axis=feature_axes)

    # The square root is monotonic, so the nearest centroid can be found on
    # the squared distances directly.
    return np.argmin(squared_distances, axis=1).astype(float)

In [9]:
def update_centroids(data, clusters, k):
    """Recompute each centroid as the mean of the points assigned to it.

    A cluster with no assigned points has no mean; taking one would produce a
    NaN centroid that can never attract a point again. Such clusters are
    instead re-seeded with the points that currently fit their own cluster
    worst (largest distance to their centroid), one point per empty cluster.

    Args:
        data: 2-D array-like of shape ``(n_points, n_features)``.
        clusters: array-like of length ``n_points`` with the cluster index of
            each point.
        k: number of clusters.

    Returns:
        numpy.ndarray: float array of shape ``(k, n_features)``.
    """
    points = np.asarray(data, dtype=float)
    labels = np.asarray(clusters)

    centroids = np.zeros((k, points.shape[1]))
    # Squared distance of every point to the centroid of its own cluster.
    spread = np.zeros(len(points))
    empty_clusters = []

    for cluster in range(k):
        members = labels == cluster
        if members.any():
            centroids[cluster] = points[members].mean(axis=0)
            spread[members] = np.sum((points[members] - centroids[cluster]) ** 2, axis=1)
        else:
            empty_clusters.append(cluster)

    if empty_clusters and len(points) > 0:
        # Hand out the worst-fitting points first, a different one per cluster.
        farthest_first = np.argsort(spread)[::-1]
        for cluster, point_index in zip(empty_clusters, farthest_first):
            centroids[cluster] = points[point_index]

    return centroids

In [10]:
def k_means_clustering(data, k, max_iterations=100):
    """Cluster ``data`` into ``k`` groups with iterative K-Means.

    Starting from centroids returned by ``initialize_centroids``, the loop
    alternates between recomputing centroids and reassigning points until the
    centroids stop changing or ``max_iterations`` updates have been made.

    Args:
        data: 2-D array of shape ``(n_points, n_features)``.
        k: number of clusters.
        max_iterations: upper bound on centroid updates. ``0`` returns the
            initial centroids together with the assignment to them.

    Returns:
        tuple: ``(clusters, centroids)``. ``clusters`` is always the
        assignment of the points to the returned ``centroids``.
    """
    centroids = initialize_centroids(data, k)
    # Assign up front so ``clusters`` is defined even when no update runs.
    clusters = assign_clusters(data, centroids)

    for _ in range(max_iterations):
        updated_centroids = update_centroids(data, clusters, k)

        # A cluster that lost all of its points may come back as a NaN row
        # (mean of zero points). Keep its previous position so the run does
        # not degrade into NaN distances.
        lost = np.isnan(updated_centroids).any(axis=1)
        updated_centroids[lost] = centroids[lost]

        converged = np.array_equal(updated_centroids, centroids)
        centroids = updated_centroids
        if converged:
            break

        # Reassign only after a real change, so labels match the centroids.
        clusters = assign_clusters(data, centroids)

    return clusters, centroids

In [11]:
def plot_clusters(data, clusters, centroids):
    """Scatter-plot the first two features of each cluster plus the centroids.

    Args:
        data: 2-D array of points; only columns 0 and 1 are drawn.
        clusters: array with one cluster index per row of ``data``.
        centroids: 2-D array of centroids; only columns 0 and 1 are drawn.
    """
    colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k']

    # Iterate over the labels that actually occur: counting the unique labels
    # and looping over range(count) would skip or mislabel clusters whenever
    # a cluster index is unused.
    for label in np.unique(clusters):
        cluster = int(label)
        cluster_points = data[clusters == label]
        # Cycle through the palette so more than seven clusters still plot.
        color = colors[cluster % len(colors)]
        plt.scatter(cluster_points[:, 0], cluster_points[:, 1], color=color, label=f'Cluster {cluster + 1}')

    plt.scatter(centroids[:, 0], centroids[:, 1], color='black', marker='*', label='Centroids')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('K-Means Clustering')
    plt.legend()
    plt.show()

In [12]:
def calculate_covariance_matrix(data):
    """Return the population covariance matrix of the columns of ``data``.

    Args:
        data: 2-D array-like of shape ``(n_samples, n_features)``.

    Returns:
        numpy.ndarray: ``(n_features, n_features)`` covariance matrix,
        normalised by ``n_samples`` (not ``n_samples - 1``).

    Raises:
        ValueError: if ``data`` has no rows.
    """
    samples = np.asarray(data, dtype=float)
    n = samples.shape[0]
    if n == 0:
        raise ValueError("Cannot compute a covariance matrix from an empty dataset")

    # Covariance is defined around the mean. Without centring, X^T X / n is
    # the second-moment matrix, which differs whenever the column means are
    # non-zero.
    centered = samples - samples.mean(axis=0)
    covariance_matrix = np.dot(centered.T, centered) / n
    return covariance_matrix


In [13]:
def perform_pca(data):
    """Run Principal Component Analysis on ``data``.

    Args:
        data: 2-D array-like of shape ``(n_samples, n_features)``.

    Returns:
        tuple: ``(eigenvalues, eigenvectors)`` of the covariance matrix,
        sorted by descending eigenvalue. Column ``i`` of ``eigenvectors`` is
        the principal direction belonging to ``eigenvalues[i]``.
    """
    samples = np.asarray(data, dtype=float)

    # PCA is defined on mean-centred data; centring data that is already
    # centred is a no-op, so this is safe to do unconditionally.
    centered = samples - samples.mean(axis=0)
    covariance_matrix = calculate_covariance_matrix(centered)

    # The covariance matrix is symmetric, so use the symmetric solver: it
    # returns real eigenvalues and orthonormal eigenvectors, whereas the
    # general solver can return complex values from round-off.
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

    # Largest variance first.
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]

    return eigenvalues, eigenvectors

In [14]:
def plot_pca(data, species, eigenvectors):
    """Draw a 3-D scatter of ``data`` projected onto the first three eigenvectors.

    Points are coloured by their ``species`` label, using the position of the
    label among the sorted unique labels as the colour value.
    """
    figure = plt.figure()
    axes = figure.add_subplot(111, projection='3d')

    # Give every distinct label an integer code to feed the colormap.
    label_codes = {}
    for code, label in enumerate(np.unique(species)):
        label_codes[label] = code
    point_colors = [label_codes[label] for label in species]

    # Coordinates of each point along the three leading eigenvector columns.
    leading_components = eigenvectors[:, :3]
    projected = data.dot(leading_components)
    pc1, pc2, pc3 = projected[:, 0], projected[:, 1], projected[:, 2]

    axes.scatter(pc1, pc2, pc3, c=point_colors, cmap='viridis')

    # Axis labels and title.
    axes.set_xlabel('PC1')
    axes.set_ylabel('PC2')
    axes.set_zlabel('PC3')
    axes.set_title('Principal Component Analysis')

    plt.show()

In [None]:
def k_means_and_pca(dataset):
    """Run the full pipeline on a CSV file: K-Means (k=3) followed by PCA.

    The file at ``dataset`` is read, preprocessed, clustered and plotted, then
    analysed with PCA and plotted again.

    Returns:
        tuple: ``(clusters, eigenvectors)`` -- the cluster assignment from
        K-Means and the eigenvectors returned by ``perform_pca``.
    """
    raw_frame = pd.read_csv(dataset)

    # Separate the label column and scale the features.
    normalized_frame, species = preprocess_data(raw_frame)
    features = normalized_frame.values

    # Cluster the features and show the result.
    clusters, centroids = k_means_clustering(features, k=3)
    plot_clusters(features, clusters, centroids)

    # PCA: only the eigenvectors are needed from here on.
    _, eigenvectors = perform_pca(features)
    plot_pca(features, species.values, eigenvectors)

    return clusters, eigenvectors


# Example usage: run the whole pipeline on the Iris CSV.
# The path is relative, so the file must be reachable from the notebook's
# working directory.
dataset = "Iris Dataset.csv"

# Perform K-Means Clustering and PCA.
# This call also displays the cluster scatter plot and the 3-D PCA plot.
clusters, eigenvectors = k_means_and_pca(dataset)

# Print the cluster label of every row and the first three eigenvector
# columns (the columns are ordered by descending eigenvalue).
print("Clusters:", clusters)
print("Eigenvectors:")
print(eigenvectors[:, :3])
