In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line in walk order.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/breast-cancer-001/breast_cancer.csv


# Principal Component Analysis (PCA) From Scratch Using NumPy

This notebook demonstrates the complete implementation of Principal Component
Analysis (PCA) from scratch using NumPy. No scikit-learn functions are used.


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [3]:
# Read the breast-cancer CSV from the attached Kaggle dataset and preview
# the first rows to confirm the columns loaded as expected.
csv_path = "/kaggle/input/breast-cancer-001/breast_cancer.csv"
df = pd.read_csv(csv_path)
df.head()


Unnamed: 0.1,Unnamed: 0,x.radius_mean,x.texture_mean,x.perimeter_mean,x.area_mean,x.smoothness_mean,x.compactness_mean,x.concavity_mean,x.concave_pts_mean,x.symmetry_mean,...,x.texture_worst,x.perimeter_worst,x.area_worst,x.smoothness_worst,x.compactness_worst,x.concavity_worst,x.concave_pts_worst,x.symmetry_worst,x.fractal_dim_worst,y
0,1,13.54,14.36,87.46,566.3,0.09779,0.08129,0.06664,0.04781,0.1885,...,19.26,99.7,711.2,0.144,0.1773,0.239,0.1288,0.2977,0.07259,B
1,2,13.08,15.71,85.63,520.0,0.1075,0.127,0.04568,0.0311,0.1967,...,20.49,96.09,630.5,0.1312,0.2776,0.189,0.07283,0.3184,0.08183,B
2,3,9.504,12.44,60.34,273.9,0.1024,0.06492,0.02956,0.02076,0.1815,...,15.66,65.13,314.9,0.1324,0.1148,0.08867,0.06227,0.245,0.07773,B
3,4,13.03,18.42,82.61,523.8,0.08983,0.03766,0.02562,0.02923,0.1467,...,22.81,84.46,545.9,0.09701,0.04619,0.04833,0.05013,0.1987,0.06169,B
4,5,8.196,16.84,51.71,201.9,0.086,0.05943,0.01588,0.005917,0.1769,...,21.96,57.26,242.2,0.1297,0.1357,0.0688,0.02564,0.3105,0.07409,B


## Data Cleaning

Unnecessary columns such as index columns are removed from the dataset
to ensure clean data for further analysis.


In [4]:
# Remove the leftover "Unnamed: 0" index column from df in place.
# errors="ignore" makes this a no-op when the column is already gone,
# so the cell can be re-run safely.
df.drop("Unnamed: 0", axis="columns", errors="ignore", inplace=True)


## Feature and Target Separation

The dataset is divided into:
- Feature matrix (X): Input variables
- Target vector (y): Class labels


In [5]:
# Target vector: the class-label column "y".
y = df["y"]
# Feature matrix: every remaining column of df.
X = df.drop("y", axis="columns")


## Selection of Numerical Features

Only numerical features are retained since PCA operates exclusively
on numerical data.


In [6]:
# Keep only the numeric columns; the covariance and eigen-decomposition
# steps that follow operate on numeric arrays.
X = X.select_dtypes(include="number")


## Feature Standardization

All features are standardized to have zero mean and unit variance.
This ensures that all features contribute equally to PCA.


In [7]:
# Z-score every column: subtract the column mean, then divide by the column
# standard deviation (pandas' default sample standard deviation, ddof=1).
column_means = X.mean()
column_stds = X.std()
X_scaled = X.sub(column_means, axis="columns").div(column_stds, axis="columns")


## Mean Centering

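The standardized data is mean-centered by subtracting the mean of each
feature. Mean centering is a prerequisite for PCA computation. Because
standardization already gives every feature zero mean, this step changes the
values only by floating-point round-off; it is kept to make the prerequisite
explicit.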


In [8]:
# Work on the raw NumPy array from here on: compute the per-feature (column)
# mean and subtract it from every row.
scaled_values = X_scaled.values
feature_means = scaled_values.mean(axis=0)
X_meaned = scaled_values - feature_means

## Covariance Matrix Computation

The covariance matrix is computed to understand how features vary
with respect to each other.


In [9]:
# Feature-by-feature covariance matrix. rowvar=False tells NumPy that each
# column of X_meaned is a variable (feature) and each row is an observation.
cov_matrix = np.cov(X_meaned, rowvar=False)


## Eigenvalue and Eigenvector Computation

Eigenvalues and eigenvectors are calculated from the covariance matrix.
Eigenvalues represent the variance explained by each principal component,
while eigenvectors represent their directions.


In [10]:
# A covariance matrix is symmetric, so use the symmetric/Hermitian solver.
# eigh() returns real eigenvalues (in ascending order) and orthonormal
# eigenvectors as the columns of eigen_vectors. The general-purpose eig()
# gives no such guarantees and can return complex values with tiny imaginary
# parts caused by floating-point round-off.
eigen_values, eigen_vectors = np.linalg.eigh(cov_matrix)


## Sorting Eigenvalues and Eigenvectors

Eigenvalues and corresponding eigenvectors are sorted in descending order
to prioritize components that explain the maximum variance.


In [11]:
# Indices that order the eigenvalues from largest to smallest
# (ascending argsort, then reversed).
sorted_idx = np.flip(np.argsort(eigen_values))

# Reorder the eigenvalues and the matching eigenvector columns together so
# that column i of sorted_eigenvectors still belongs to sorted_eigenvalues[i].
sorted_eigenvalues = np.take(eigen_values, sorted_idx)
sorted_eigenvectors = np.take(eigen_vectors, sorted_idx, axis=1)


## Explained Variance Analysis

The explained variance ratio and cumulative variance are calculated
to determine how much information is retained by the principal components.


In [12]:
# Share of the total variance carried by each component, largest first.
total_variance = sorted_eigenvalues.sum()
explained_variance_ratio = sorted_eigenvalues / total_variance

# Running total: entry k is the variance retained by the first k+1 components.
cumulative_variance = explained_variance_ratio.cumsum()

cumulative_variance


array([0.44272026, 0.63243208, 0.72636371, 0.79238506, 0.84734274,
       0.88758796, 0.9100953 , 0.92598254, 0.93987903, 0.95156881,
       0.961366  , 0.97007138, 0.97811663, 0.98335029, 0.98648812,
       0.98915022, 0.99113018, 0.99288414, 0.9945334 , 0.99557204,
       0.99657114, 0.99748579, 0.99829715, 0.99889898, 0.99941502,
       0.99968761, 0.99991763, 0.99997061, 0.99999557, 1.        ])

## Selection of Principal Components

The minimum number of principal components required to retain
95% of the total variance is selected.


In [13]:
# Fraction of total variance the reduced representation must retain.
variance_target = 0.95

# argmax over a boolean array returns the position of the first True, i.e.
# the first component count whose cumulative variance reaches the target;
# +1 converts that zero-based position into a number of components.
reaches_target = cumulative_variance >= variance_target
n_components = np.argmax(reaches_target) + 1
n_components


np.int64(10)

## Projection onto Principal Components

The standardized, mean-centered data is projected onto the selected principal
components, resulting in a reduced-dimensional representation of the dataset.


In [14]:
# Projection basis: the first n_components eigenvector columns, i.e. the
# directions with the largest eigenvalues.
projection_matrix = sorted_eigenvectors[:, :n_components]

# Express every centered sample in the coordinates of the retained components.
X_pca_manual = X_meaned @ projection_matrix

X_pca_manual.shape


(569, 10)

## Conclusion

Principal Component Analysis (PCA) was successfully implemented from scratch
using NumPy. The algorithm involved standardization, covariance matrix
computation, eigenvalue decomposition, and projection onto principal components.
No scikit-learn functions were used in this implementation.
