In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [None]:
# Load the input table (path is resolved relative to the kernel's working directory).
df = pd.read_csv("mock_turboid_microprotein_data.csv")

# Select LFC features only: every column whose name ends with "_LFC".
lfc_cols = [col for col in df.columns if col.endswith("_LFC")]
if not lfc_cols:
    raise ValueError("No columns ending in '_LFC' were found in the input CSV.")

# Fail fast on missing values: StandardScaler skips NaNs when fitting but keeps
# them in the transformed output, so they would otherwise flow silently into
# X_tensor and everything computed from it.
n_missing = int(df[lfc_cols].isna().sum().sum())
if n_missing:
    raise ValueError(
        f"{n_missing} missing value(s) found in LFC columns; "
        "impute or drop them before scaling."
    )

X = df[lfc_cols].values

# Normalize (important!): standardize each feature column to zero mean and unit variance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert to a float32 torch tensor.
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)

In [None]:
class MicroProteinDataset(Dataset):
    """Map-style dataset that serves the rows of an already-built feature matrix."""

    def __init__(self, X):
        """Keep a reference to ``X``; the data is stored as given, not copied."""
        self.X = X

    def __len__(self):
        """Return the number of samples, i.e. the size of the first axis of ``X``."""
        n_samples = self.X.shape[0]
        return n_samples

    def __getitem__(self, idx):
        """Return the sample at ``idx`` by indexing ``X`` directly."""
        sample = self.X[idx]
        return sample

# Wrap X_tensor in the Dataset and serve it in shuffled mini-batches of 32.
# The shuffle draws from its own seeded generator so the batch order is the
# same after every kernel restart, regardless of other uses of the global RNG.
SHUFFLE_SEED = 42
dataset = MicroProteinDataset(X_tensor)
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    generator=torch.Generator().manual_seed(SHUFFLE_SEED),
)

class Autoencoder(nn.Module):
    """Fully connected autoencoder with a mirrored encoder and decoder.

    The encoder maps ``input_dim`` features through the hidden layers down to
    ``embedding_dim``; the decoder applies the same hidden widths in reverse
    order to map the embedding back to ``input_dim``. ReLU follows every
    linear layer except the last one of each half.

    Args:
        input_dim: Number of features per sample.
        embedding_dim: Size of the bottleneck embedding.
        hidden_dims: Encoder hidden-layer widths, from input side to
            bottleneck side. The default ``(64, 32)`` gives
            ``input_dim -> 64 -> 32 -> embedding_dim`` and the mirrored
            decoder ``embedding_dim -> 32 -> 64 -> input_dim``.
    """

    def __init__(self, input_dim=5, embedding_dim=2, hidden_dims=(64, 32)):
        super().__init__()
        hidden_dims = tuple(hidden_dims)
        # The encoder is created before the decoder, and layers within each are
        # created front to back, so parameter names ("encoder.0", "encoder.2",
        # ...) and construction order match a hand-written Sequential.
        self.encoder = self._build_mlp((input_dim, *hidden_dims, embedding_dim))
        self.decoder = self._build_mlp((embedding_dim, *reversed(hidden_dims), input_dim))

    @staticmethod
    def _build_mlp(widths):
        """Stack Linear layers over consecutive ``widths`` with ReLU in between."""
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # Drop the trailing ReLU: the final layer of each half stays linear.
        return nn.Sequential(*layers[:-1])

    def forward(self, x):
        """Encode ``x`` and reconstruct it.

        Returns:
            A tuple ``(out, z)`` where ``out`` is the decoder's reconstruction
            of ``x`` and ``z`` is the encoder's embedding.
        """
        z = self.encoder(x)
        out = self.decoder(z)
        return out, z
