<a href="https://colab.research.google.com/github/fatemeh-kn/image-processing/blob/main/PCA_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

In [None]:
def pca_analysis_scatter(excel_file, n_components=4):
    """
    Run PCA on the numeric columns of an Excel file and plot every pair of PCs.

    One scatter panel is drawn for each pair (PCi, PCj) with i < j. Clicking
    inside a panel selects the sample closest to the click in that panel and
    highlights that same sample in red in every panel. Once ``plt.show()``
    returns, the names of the selected samples are printed, taken from the
    'Sample Name' column when the file has one.

    Selecting points needs an interactive Matplotlib backend; with a
    non-blocking backend the summary is printed before any click can happen.

    Args:
        excel_file (str): Path to the Excel file.
        n_components (int, optional): Number of principal components to
            compute. Must be at least 2 so that there is a pair to plot.
            Defaults to 4.

    Raises:
        ValueError: If ``n_components`` is smaller than 2.
    """
    if n_components < 2:
        raise ValueError(
            "n_components must be at least 2 to plot pairs of principal components."
        )

    # Read the Excel file
    data = pd.read_excel(excel_file)

    # Only numeric columns are passed to PCA
    features = data.select_dtypes(include=[np.number])

    # Perform PCA analysis
    pca = PCA(n_components=n_components)
    pcs = pca.fit_transform(features)
    print(pcs.shape)
    print(pca.explained_variance_ratio_)

    # One panel per unordered pair of components, laid out at most 3 per row
    pairs = [(i, j) for i in range(n_components) for j in range(i + 1, n_components)]
    n_cols = min(3, len(pairs))
    n_rows = -(-len(pairs) // n_cols)  # ceiling division
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False
    )

    # Map each used axis to the PC pair it shows, so the click handler can
    # look the pair up from the event instead of relying on loop variables.
    pair_by_axis = {}
    for ax, (i, j) in zip(axes.ravel(), pairs):
        ax.scatter(pcs[:, i], pcs[:, j])
        ax.set_title(f"PC{i+1} vs. PC{j+1}")
        pair_by_axis[ax] = (i, j)

    # Hide grid cells that have no pair to display
    for ax in axes.ravel()[len(pairs):]:
        ax.set_visible(False)

    selected_points = []  # (i, j, row index) for every click, in click order

    def on_click(event):
        pair = pair_by_axis.get(event.inaxes)
        if pair is None or event.xdata is None or event.ydata is None:
            return
        i, j = pair

        # A click never lands exactly on a data point, so pick the nearest
        # sample. Distances are scaled by each component's range so that the
        # PC with the larger spread does not dominate the choice.
        coords = pcs[:, [i, j]]
        span = coords.max(axis=0) - coords.min(axis=0)
        span[span == 0] = 1.0
        offsets = (coords - np.array([event.xdata, event.ydata])) / span
        idx = int(np.argmin((offsets ** 2).sum(axis=1)))
        selected_points.append((i, j, idx))

        # Highlight the same sample in every panel, using that panel's own PCs
        for ax_, (a, b) in pair_by_axis.items():
            ax_.scatter(pcs[idx, a], pcs[idx, b], c='red', s=80, zorder=10)
        fig.canvas.draw_idle()

    # A single handler serves all panels
    fig.canvas.mpl_connect('button_press_event', on_click)

    plt.tight_layout()
    plt.show()

    # Print the names of selected samples (assuming a 'Sample Name' column).
    # This runs after show() so that clicks made in the window are included.
    if 'Sample Name' in data.columns:
        if selected_points:
            print("Selected samples (based on 'Sample Name' column):")
            for i, j, idx in selected_points:
                sample_name = data.iloc[idx]['Sample Name']
                print(f"- PC{i+1} vs. PC{j+1}: {sample_name}")
        else:
            print("No points were selected.")
    else:
        print("The 'Sample Name' column is not found in the data. Sample names cannot be printed.")

# Demo invocation: point the path below at a real Excel workbook before running.
pca_analysis_scatter(excel_file='your_data.xlsx')
