In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import umap
from sklearn.preprocessing import PowerTransformer, StandardScaler, LabelEncoder

In [None]:
# Parse the whitespace-delimited table (the file has no header row) and attach
# the column labels in the same expression.
df = pd.read_csv(
    "March_2024_AF2_01.txt",
    sep=r"\s+",
    header=None,
).set_axis(
    [
        "Protein_number", "res_number",
        "PB1", "PB2",
        "AA1", "AA2",
        "S2_1", "S2_2",
        "expected_frequency", "plDDT",
        "RSA1", "RSA2",
    ],
    axis="columns",
)
df

In [None]:
# Columns handled as numeric features (power-transformed / standardised later on).
numerical_cols = [
    "expected_frequency",
    "plDDT",
    "RSA1",
    "RSA2",
]

# Columns handled as categorical features (label-encoded for UMAP, passed by
# position to K-Prototypes).
categorical_cols = [
    "PB1",
    "PB2",
    "AA1",
    "AA2",
    "S2_1",
    "S2_2",
]

In [None]:
# Remove the identifier columns that must not take part in the clustering.
# `df` is rebound by this cell, so a second execution would find the columns
# already gone; `errors="ignore"` turns that case into a no-op instead of a
# KeyError, which keeps the cell safe to re-run.
df = df.drop(columns=["Protein_number", "res_number"], errors="ignore")
df

## Visualization using UMAP

In [None]:
# Independent working copy for the visualisation, so the preprocessing applied
# to it never modifies `df`.
df_umap = df.copy(deep=True)

In [None]:
# Prepare the UMAP input from a fresh copy of `df`.
# `df_umap` is rebound to a NumPy array at the end of this cell, so reading it
# as a DataFrame here would fail on a second execution; working on a separately
# named copy keeps the cell re-runnable while producing the same outputs
# (`df_umap_sample` and the `df_umap` array).
df_umap_encoded = df.copy()

# Numerical data is transformed using the Yeo-Johnson transformation
# to stabilize variance and make the data more closely resemble a normal distribution.
pt = PowerTransformer(method='yeo-johnson')
df_umap_encoded[numerical_cols] = pt.fit_transform(df_umap_encoded[numerical_cols])

# Categorical data is transformed using label encoding (one encoder per column).
# NOTE(review): label encoding gives the categories an arbitrary numeric order,
# which UMAP's distance computation will treat as meaningful; acceptable for a
# qualitative picture only.
for cat_col in categorical_cols:
    le = LabelEncoder()
    df_umap_encoded[cat_col] = le.fit_transform(df_umap_encoded[cat_col])

# Sample 10% of the rows (fixed seed) and stack the numerical columns followed
# by the categorical columns into a single array for UMAP.
df_umap_sample = df_umap_encoded.sample(frac=0.1, random_state=42)
df_umap = df_umap_sample[numerical_cols + categorical_cols].to_numpy()

In [None]:
# Fit a 2-D UMAP embedding of the sampled, preprocessed rows and plot it.
# The reducer gets its own name: rebinding `umap` would shadow the imported
# module and make this cell fail (no `.UMAP` attribute) when it is run again.
# A fixed `random_state` makes the embedding, and therefore the figure that the
# commentary below interprets, reproducible across runs; UMAP documents that a
# seeded run gives up some parallelism in exchange.
reducer = umap.UMAP(n_components=2, n_neighbors=10, min_dist=0.3, random_state=42)
embedding = reducer.fit_transform(df_umap)

plt.figure(figsize=(10, 6))
plt.scatter(embedding[:, 0], embedding[:, 1], s=2, alpha=1.0, c='green')
plt.title('UMAP Visualization (Yeo-Johnson + Label Encoded)')
plt.xlabel('umap1')
plt.ylabel('umap2')
plt.tight_layout()
plt.show()

#### The UMAP 2D visualization reveals a non-random structure, with noticeable local groupings of data points.  
#### This suggests that the dataset contains underlying patterns that may be captured through clustering in the full feature space, even though the visualization was generated from sampled data.

## K-Prototypes clustering

In [None]:
# K-Prototypes takes the categorical columns as they are; it only needs the
# positional indices of those columns. The numerical columns are standardised
# (zero mean, unit variance) beforehand.
scaler = StandardScaler()
kprot_df = df.copy()
kprot_df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Positions of the categorical columns inside `kprot_df`, in `categorical_cols` order.
categorical_cols_indices = list(map(kprot_df.columns.get_loc, categorical_cols))
print(categorical_cols_indices)
kprot_df

In [None]:
from kmodes.kprototypes import KPrototypes

# Fit K-Prototypes with 10 clusters on the mixed-type table; `categorical`
# tells the model which column positions hold categorical values.
# `random_state` is fixed so that the cluster labels, the reported cost and the
# silhouette score computed later are the same on every Restart & Run All.
kproto = KPrototypes(n_clusters=10, init='Cao', random_state=42)
clusters = kproto.fit_predict(kprot_df.to_numpy(), categorical=categorical_cols_indices)

In [None]:
# Report the clustering cost exposed by the fitted K-Prototypes model (`cost_`).
final_cost = kproto.cost_
print(final_cost)

In [None]:
# Visualization of the clustering on the sampled data used for the UMAP embedding.
# `clusters` holds one label per row of the full table, in row order. Indexing it
# with the sample's index labels is valid because `df` keeps the default 0..n-1
# index from `read_csv`, so labels and row positions coincide.
sample_indices = df_umap_sample.index
clusters_sample = np.array(clusters)[sample_indices]

plt.figure(figsize=(10, 6))
plt.scatter(
    embedding[:, 0],
    embedding[:, 1],
    c=clusters_sample,
    cmap='tab10',
    s=5,
    alpha=0.9
)
# Take k from the fitted model so the title cannot drift from the value that was
# actually used (the title previously said k=8 while the model is fitted with 10).
plt.title(f'UMAP Visualization of Clusters (KPrototypes, k={kproto.n_clusters})')
plt.xlabel('umap1')
plt.ylabel('umap2')
plt.tight_layout()
plt.show()

In [None]:
import gower
from sklearn.metrics import silhouette_score

# Silhouette score of the K-Prototypes labels under Gower distance, computed on
# a 4% sample (fixed seed) because the pairwise distance matrix is quadratic in
# the number of rows.
X = kprot_df.sample(frac=0.04, random_state=42)

# Tell Gower explicitly which columns are categorical. Without the mask,
# `gower_matrix` infers this from the Python type of the first row's values in
# the array, which may disagree with the explicit categorical column list given
# to K-Prototypes (e.g. if a categorical column happens to be numerically coded).
cat_mask = X.columns.isin(categorical_cols)
gower_dist = gower.gower_matrix(X.to_numpy(), cat_features=cat_mask)

# `clusters` is in row order of the full table; the default 0..n-1 index of the
# frame makes the sampled index labels usable as positions.
sample_indices = X.index
clusters_sample = np.array(clusters)[sample_indices]
sil_score = silhouette_score(gower_dist, clusters_sample, metric='precomputed')

print(f"Silhouette score (Gower + KPrototypes): {sil_score:.4f}")

#### The silhouette score suggests a weak but present clustering structure, indicating partial separation between clusters.