In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Location of the input CSV, resolved relative to the working directory.
data_path = 'data.csv'


# Load the table, using the first CSV column as the row index.
df = pd.read_csv(data_path, index_col=0)
print(f"dataset shape: {df.shape}")

# Row identifiers as strings; these are what extract_label() is applied to.
index_values = df.index.astype(str)

def extract_label(index_str):
    """Derive a class label from a sample identifier string.

    Resolution order:
      1. If the identifier contains one of the known cancer codes
         (case-insensitive substring match), return that code.
      2. If the identifier is a purely decimal string, return
         ``"group_<n>"`` where ``n`` is the integer value modulo 5.
      3. Otherwise return ``"Unknown"``.

    Args:
        index_str: Sample identifier as a string.

    Returns:
        The label string for the identifier.
    """
    cancer_types = ('BRCA', 'KIRC', 'COAD', 'LUAD', 'PRAD')

    # Upper-case once instead of once per candidate code.
    upper_id = index_str.upper()
    for cancer in cancer_types:
        if cancer in upper_id:
            return cancer

    # isdecimal() matches exactly the digit characters int() can parse.
    # isdigit() also accepts characters such as superscript digits, for which
    # int() raises ValueError.
    if index_str.isdecimal():
        return f"group_{int(index_str) % 5}"
    return "Unknown"

# One label per row of df, derived from that row's index value and indexed
# like df so labels and features stay aligned.
y = pd.DataFrame({
    'label': [extract_label(idx) for idx in index_values]
}, index=df.index)

# Feature matrix: work on a copy so df itself is never modified.
X = df.copy()

# The column is created as 'label' (lowercase); looking it up as 'Label'
# raises KeyError, so the lookups use the same key as the constructor.
print(f"labels extracted: {y['label'].unique()}")
print(f"label distribution:\n{y['label'].value_counts()}")

# Shape summary: rows are reported as samples, columns as features (genes).
print(f"dataset Dimensions: {X.shape}")
print(f"no of samples: {X.shape[0]}")
print(f"no of features (genes): {X.shape[1]}")

# Total number of missing cells across the whole table.
missing_count = X.isnull().sum().sum()
print(f"total missing values: {missing_count}")

# Global value statistics. Materialise the array once, and use the NaN-aware
# reductions: this summary runs before imputation, so a single missing cell
# would otherwise turn every statistic into nan.
x_values = X.values
print(f"min value: {np.nanmin(x_values):.6f}")
print(f"max value: {np.nanmax(x_values):.6f}")
print(f"mean value: {np.nanmean(x_values):.6f}")
print(f"median value: {np.nanmedian(x_values):.6f}")

initial_shape = X.shape

# Fill each column's missing entries with that column's median.
# A single DataFrame.fillna call with a per-column Series replaces the
# column-by-column `X[col].fillna(..., inplace=True)` pattern: that form
# modifies an intermediate object, is deprecated in pandas 2.x and leaves X
# unchanged under Copy-on-Write.
# numeric_only=True keeps non-numeric columns from breaking the median; those
# columns are left as they are.
X = X.fillna(X.median(numeric_only=True))

# Imputation drops no rows; re-select y by X's index to keep both aligned.
y = y.loc[X.index]
print(f"shape before: {initial_shape}")
print(f"shape after: {X.shape}")
print(f"Rows removed: {initial_shape[0] - X.shape[0]}")


# Per-feature interquartile range and the IQR fences built from it.
Q1 = X.quantile(0.25)
Q3 = X.quantile(0.75)
IQR = Q3 - Q1

multiplier = 1.5
lower_bound = Q1 - multiplier * IQR
upper_bound = Q3 + multiplier * IQR

# A sample counts as an outlier if any of its features falls outside that
# feature's fences.
outliers = ((X < lower_bound) | (X > upper_bound)).any(axis=1)
n_outliers = outliers.sum()

print(f"no of outlier samples detected: {n_outliers}")
print(f"Percentage of outliers: {(n_outliers/len(X))*100:.2f}%")


# Clip every feature to its own fences in one vectorized call instead of a
# Python loop over all columns; axis=1 aligns the bound Series with X's columns.
# Samples are kept, only extreme values are capped.
X_clipped = X.clip(lower=lower_bound, upper=upper_bound, axis=1)

X = X_clipped

# Features whose variance is exactly zero carry no information; collect their
# names and drop them.
variances = X.var()
constant_features = [name for name, value in variances.items() if value == 0]
print(f"no of constant features: {len(constant_features)}")

if constant_features:
    X = X.drop(columns=constant_features)


print(f"Shape after removing constant features: {X.shape}")

initial_shape = X.shape

# Each feature's share of the summed per-feature variance.
variances = X.var()
total_variance = variances.sum()
variance_ratio = variances.div(total_variance)

# Keep only the features whose share is strictly above the threshold.
threshold = 0.00001
mask = variance_ratio.gt(threshold)
X = X.loc[:, mask]

print(f"Shape before: {initial_shape}")
print(f"Shape after: {X.shape}")
print(f"Features removed: {initial_shape[1] - X.shape[1]}")

# One-hot encode any object-dtype columns, dropping the first level of each.
categorical_cols = X.select_dtypes(include=['object']).columns
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Standardize the features and wrap the result back into a DataFrame that
# keeps X's column names and row index.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

print(f"Mean of scaled data: {X_scaled.mean().mean():.6f}")
print(f"Std of scaled data: {X_scaled.std().mean():.6f}")
print(f"Scaled data shape: {X_scaled.shape}")

# Plain ndarray view of the scaled data: rows are samples, columns are features.
X_array = X_scaled.values
n_samples, n_features = X_array.shape

print(f"Input array shape: {X_array.shape}")

# Subtract the per-feature mean so every column is centered on zero.
mean_vector = X_array.mean(axis=0)
X_centered = X_array - mean_vector
print(f"Data centered - mean: {X_centered.mean(axis=0).mean():.10f}")

# Sample covariance (n_samples - 1 denominator) of the centered data.
cov_matrix = (1 / (n_samples - 1)) * (X_centered.T @ X_centered)
print(f"Covariance matrix shape: {cov_matrix.shape}")

# cov_matrix is a scaled X_centered.T @ X_centered and therefore symmetric, so
# use eigh: it is the routine for symmetric/Hermitian matrices and returns real
# eigenvalues with orthonormal eigenvectors.
# The general-purpose eig can return complex values with round-off imaginary
# parts, which would then be sorted as complex numbers before the real part
# is taken.
# Eigenvector signs are arbitrary and may differ from what eig produced.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

print(f"no of eigenvalues: {len(eigenvalues)}")

# Order the eigenpairs from largest to smallest eigenvalue.
idx = np.argsort(eigenvalues)[::-1]
eigenvalues_sorted = eigenvalues[idx]
eigenvectors_sorted = eigenvectors[:, idx]

print(f"Eigenvalues sorted (top 10): {eigenvalues_sorted[:10]}")

# Fraction of the total variance carried by each component, and its running sum.
total_variance = eigenvalues_sorted.sum()
explained_variance_ratio = eigenvalues_sorted / total_variance
cumulative_variance_ratio = explained_variance_ratio.cumsum()

print(f"Total variance: {total_variance:.4f}")
print(f"Explained variance ratio (first 10):\n{explained_variance_ratio[:10]}")

# For each target, argmax returns the first position where the cumulative
# ratio reaches it; +1 converts the zero-based position into a component count.
n_components_90, n_components_95, n_components_99 = (
    np.argmax(cumulative_variance_ratio >= target) + 1
    for target in (0.90, 0.95, 0.99)
)

print(f"\nComponents needed for 90% variance: {n_components_90}")
print(f"Components needed for 95% variance: {n_components_95}")
print(f"Components needed for 99% variance: {n_components_99}")


# Keep the leading eigenvectors that together reach the 95% variance target.
n_components = n_components_95
components = eigenvectors_sorted[:, :n_components]

print(f"Selected {n_components} components for 95% variance retention")
print(f"Principal components shape: {components.shape}")


# Project the centered data onto the selected components.
X_transformed = X_centered @ components

print(f"Transformed data shape: {X_transformed.shape}")
print(f"Variance retained: {cumulative_variance_ratio[n_components-1]:.4f}")

# NOTE(review): X_pca gets a default integer index here, whereas X_scaled
# keeps the sample index. Confirm which one downstream code expects.
pc_names = [f'PC{k}' for k in range(1, n_components + 1)]
X_pca = pd.DataFrame(X_transformed, columns=pc_names)

# Per-component breakdown for at most the first 20 components.
for i, (ratio, running_total) in enumerate(
    zip(explained_variance_ratio[:20], cumulative_variance_ratio[:20])
):
    print(f"  PC{i+1}: {ratio:.6f} (Cumulative: {running_total:.6f})")

original_dims = X_scaled.shape[1]
reduced_dims = X_pca.shape[1]
print("Dimensionality Reduction Summary:")
print(f"  Original dimensions: {original_dims}")
print(f"  Reduced dimensions: {reduced_dims}")
print(f"  Reduction percentage: {((1 - reduced_dims/original_dims)*100):.2f}%")
print(f"  Total variance retained: {cumulative_variance_ratio[n_components-1]:.4f}")
