# Exploratory Data Analysis

In this notebook, we will perform some exploratory data analysis on the UniProtKB/Swiss-Prot dataset to understand the distribution of subcellular localizations, sequence lengths, feature correlations, and key physico-chemical properties.

We are using UniProtKB/Swiss-Prot Release 2025_03 (18-Jun-2025) for this project; the checksum for the DAT file is `ecfb866a5de8f27497af396735f09b30`. If you use a different release of the UniProt data, your results — including any model you train — may differ from ours, so keep that in mind when reproducing them.


In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import umap
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

In [None]:
# Output locations for the figures and tables written by the cells below.
# Both directories are created up front (no error if they already exist).
FIG_DIR = "results/figures"
CSV_DIR = "results/csv"
for _output_dir in (FIG_DIR, CSV_DIR):
    os.makedirs(_output_dir, exist_ok=True)

In [None]:
# Load the processed feature table; the analysis cells below read from `df_feat`.
features_csv = "data/processed/features.csv"
df_feat = pd.read_csv(features_csv)

In [None]:
# Horizontal bar chart of the number of rows per localization, with the bars
# ordered from the most to the least frequent compartment.
compartment_order = df_feat["localization"].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    data=df_feat,
    y="localization",
    hue="localization",  # hue mirrors y so the palette is applied per bar
    order=compartment_order,
    palette="viridis",
    legend=False,
    ax=ax,
)
ax.set_title("Number of Proteins per Compartment")
ax.set_xlabel("Count")
ax.set_ylabel("Compartment")
fig.tight_layout()
fig.savefig(f"{FIG_DIR}/proteins_per_compartment.png")
print(f"Saved figure to {FIG_DIR}/proteins_per_compartment.png")
# Close the figure once it is on disk so it does not stay open in pyplot.
plt.close(fig)

In [None]:
# Histogram (50 bins) of sequence lengths with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(
    df_feat["sequence_length"],
    bins=50,
    kde=True,
    color="steelblue",
    ax=ax,
)
ax.set_title("Sequence Length Distribution")
ax.set_xlabel("Sequence Length (AA)")
ax.set_ylabel("Frequency")
fig.tight_layout()
fig.savefig(f"{FIG_DIR}/sequence_length_distribution.png")
print(f"Saved figure to {FIG_DIR}/sequence_length_distribution.png")
# Close the figure once it is on disk so it does not stay open in pyplot.
plt.close(fig)

In [None]:
# 3. Class balance check
# Absolute and relative (percent) frequency of every localization label.
counts = df_feat["localization"].value_counts()
percent = df_feat["localization"].value_counts(normalize=True) * 100
# Percentages are rounded to two decimals only in the summary table;
# `percent` itself keeps full precision.
balance_df = counts.to_frame("count").assign(percent=percent.round(2))
print("Class Balance (count and percentage):")
print(balance_df)

# Persist the summary table next to the other CSV outputs.
balance_df.to_csv(f"{CSV_DIR}/class_balance.csv")
print(f"Saved class balance to {CSV_DIR}/class_balance.csv")

In [None]:
# Violin plot of the `gravy` column for each localization, with quartile
# lines drawn inside every violin.
fig, ax = plt.subplots(figsize=(12, 6))
sns.violinplot(
    data=df_feat,
    x="localization",
    y="gravy",
    hue="localization",  # hue mirrors x so the palette is applied per violin
    palette="pastel",
    inner="quartile",
    ax=ax,
)
ax.set_title("GRAVY Distribution by Compartment")
# Tilt the compartment names so neighbouring tick labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig(f"{FIG_DIR}/gravy_distribution.png")
print(f"Saved figure to {FIG_DIR}/gravy_distribution.png")
# Close the figure once it is on disk so it does not stay open in pyplot.
plt.close(fig)

In [None]:
# Violin plot of the `pI` column for each localization, with quartile lines
# drawn inside every violin.
fig, ax = plt.subplots(figsize=(12, 6))
sns.violinplot(
    data=df_feat,
    x="localization",
    y="pI",
    hue="localization",  # hue mirrors x so the palette is applied per violin
    palette="pastel",
    inner="quartile",
    ax=ax,
)
ax.set_title("Isoelectric Point (pI) Distribution by Compartment")
# Tilt the compartment names so neighbouring tick labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig(f"{FIG_DIR}/pI_distribution.png")
print(f"Saved figure to {FIG_DIR}/pI_distribution.png")
# Close the figure once it is on disk so it does not stay open in pyplot.
plt.close(fig)

In [None]:
# 3b. Pearson Correlation Heatmap of Top 10 Variable Features
# Columns are ranked by the variance of their raw (unscaled) values, so the
# selection is sensitive to each feature's units. `numeric_cols` is also the
# feature set used by the embedding cell further down.
numeric_cols = df_feat.select_dtypes(include=[np.number]).columns.tolist()
variance_ranking = df_feat[numeric_cols].var().sort_values(ascending=False)
top_vars = variance_ranking.head(10).index.tolist()
corr_matrix = df_feat[top_vars].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    square=True,
    ax=ax,
)
ax.set_title("Pearson Correlation Heatmap of Top 10 Variable Features")
fig.tight_layout()
fig.savefig(f"{FIG_DIR}/correlation_heatmap.png")
print(f"Saved figure to {FIG_DIR}/correlation_heatmap.png")
# Close the figure once it is on disk so it does not stay open in pyplot.
plt.close(fig)

In [None]:
# Project the standardized numeric features to 2-D for visualization.
# UMAP is preferred; t-SNE is the fallback when UMAP is not usable.
# Results: `embedding` (the 2-D coordinates) and `method` (the label used in
# plot titles and output file names by the following cells).

# Scale once up front: both reducers consume the same standardized matrix.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_feat[numeric_cols])

try:
    # `umap` is imported unconditionally at the top of the notebook, so an
    # ImportError is not expected to originate from the UMAP calls themselves.
    # The failure that can occur here is an imported `umap` module that does
    # not expose `UMAP` (e.g. a different distribution named `umap` installed
    # instead of `umap-learn`). Raise ImportError for that case so the t-SNE
    # fallback is actually reachable instead of dying with AttributeError.
    if not hasattr(umap, "UMAP"):
        raise ImportError("the imported 'umap' module does not provide UMAP")
    reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, random_state=42)
    embedding = reducer.fit_transform(scaled_features)
    method = "UMAP"
except ImportError:
    tsne = TSNE(n_components=2, random_state=42)
    embedding = tsne.fit_transform(scaled_features)
    method = "t-SNE"

In [None]:
# Attach the true localization label to every embedded point and draw the
# 2-D embedding coloured by that label (legend intentionally suppressed).
emb_df = pd.DataFrame(embedding, columns=["Dim1", "Dim2"]).assign(
    localization=df_feat["localization"].values
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=emb_df,
    x="Dim1",
    y="Dim2",
    hue="localization",
    palette="tab10",
    legend=False,
    ax=ax,
)
ax.set_title(f"{method} Embedding Colored by True Compartment")
fig.tight_layout()
fig.savefig(f"{FIG_DIR}/{method.lower()}_embedding.png")
print(f"Saved embedding figure to {FIG_DIR}/{method.lower()}_embedding.png")
# Close the figure once it is on disk so it does not stay open in pyplot.
plt.close(fig)

In [None]:
# Cluster the 2-D embedding into as many groups as there are distinct
# localization labels, then plot the embedding coloured by cluster id.
n_clusters = df_feat["localization"].nunique()
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# "auto" (with a FutureWarning on the releases in between), so leaving it
# unset makes the clustering depend on the installed version even though
# random_state is fixed.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
clusters = kmeans.fit_predict(emb_df[["Dim1", "Dim2"]])
emb_df["cluster"] = clusters

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=emb_df,
    x="Dim1",
    y="Dim2",
    hue="cluster",
    palette="tab10",
    legend="full",  # list every cluster id in the legend
    ax=ax,
)
ax.set_title(f"{method} Embedding with KMeans Clusters")
fig.tight_layout()
fig.savefig(f"{FIG_DIR}/{method.lower()}_embedding_clusters.png")
print(f"Saved clustering figure to {FIG_DIR}/{method.lower()}_embedding_clusters.png")
# Close the figure once it is on disk so it does not stay open in pyplot.
plt.close(fig)

In [None]:
# Cross-tabulate KMeans cluster ids (rows) against true localization labels
# (columns) to see how well the clusters line up with the compartments.
ct = pd.crosstab(index=emb_df["cluster"], columns=emb_df["localization"])
print("Contingency table of clusters vs true compartments:")
print(ct)
# Persist the table next to the other CSV outputs.
ct.to_csv(f"{CSV_DIR}/clusters_vs_localization.csv")
print(f"Saved contingency table to {CSV_DIR}/clusters_vs_localization.csv")