In [1]:
import os
import torchvision.transforms as transforms,datasets
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from facenet_pytorch import MTCNN
from PIL import Image
from tqdm import tqdm

import torch
import torch.nn as nn
import torchvision.models as models
import numpy as np
import torch.nn.functional as F

from torch.utils.data import DataLoader, Dataset


In [12]:
if torch.cuda.is_available():
    print("✅ GPU доступен!")
    print(f"Имя устройства: {torch.cuda.get_device_name(0)}")
    print(f"Количество устройств: {torch.cuda.device_count()}")
    print(f"Текущий девайс: {torch.cuda.current_device()}")
else:
    print("❌ GPU не найден, используется CPU.")

✅ GPU доступен!
Имя устройства: NVIDIA GeForce RTX 3060
Количество устройств: 1
Текущий девайс: 0


In [13]:
print(torch.__version__)
print(torch.version.cuda)  # Должно быть не None
print(torch.backends.cudnn.version()) 

2.7.0+cu118
11.8
90100


In [14]:

class SimpleAutoencoder(nn.Module):
    def __init__(self, latent_dim=128):
        super(SimpleAutoencoder, self).__init__()
        H = 200
        W = 170
        # Encoder: сжимает изображение
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 32, 4, stride=2, padding=1),  # B,32,H/2,W/2
            nn.ReLU(),
            nn.Conv2d(32, 64, 4, stride=2, padding=1),  # B,64,H/4,W/4
            nn.ReLU(),
            nn.Conv2d(64, 128, 4, stride=2, padding=1), # B,128,H/8,W/8
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(128 * (H//8) * (W//8), latent_dim)  # сжимаем в латентное пространство
        )
        
        # Decoder: восстанавливает изображение
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 128 * (H//8) * (W//8)),
            nn.Unflatten(1, (128, H//8, W//8)),
            nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1),  # B,64,H/4,W/4
            nn.ReLU(),
            nn.ConvTranspose2d(64, 32, 4, stride=2, output_padding=1),   # B,32,H/2,W/2
            nn.ReLU(),
            nn.ConvTranspose2d(32, 3, 4, stride=2, output_padding=1),    # B,3,H,W
            nn.Sigmoid()  # если данные нормализованы [0,1]
        )
    
    def forward(self, x):
        latent = self.encoder(x)
        reconstructed = self.decoder(latent)
        out = reconstructed[:, :, :x.size(2), :x.size(3)]

        return out,latent


In [5]:

class VAE(nn.Module):
    def __init__(self, latent_dim=128):
        super(VAE, self).__init__()
        self.latent_dim = latent_dim

        # Encoder
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 32, 4, stride=2, padding=1),  # [B, 32, 100, 85]
            nn.ReLU(),
            nn.Conv2d(32, 64, 4, stride=2, padding=1),  # [B, 64, 50, 43]
            nn.ReLU(),
            nn.Conv2d(64, 128, 4, stride=2, padding=1),  # [B, 128, 25, 21]
            nn.ReLU()
        )

        self.flatten = nn.Flatten()
        self.fc_mu = nn.Linear(128 * 25 * 21, latent_dim)
        self.fc_logvar = nn.Linear(128 * 25 * 21, latent_dim)

        # Decoder
        self.decoder_input = nn.Linear(latent_dim, 128 * 25 * 22)
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1),  # [B, 64, 50, 44]
            nn.ReLU(),
            nn.ConvTranspose2d(64, 32, 4, stride=2, padding=1, output_padding=(0, 1)),  # [B, 32, 100, 88]
            nn.ReLU(),
            nn.ConvTranspose2d(32, 3, 4, stride=2, padding=1, output_padding=(0, 1)),  # [B, 3, 200, 176]
            nn.Sigmoid()
        )

        # Кроп до точного размера 200x170
        self.crop = lambda x: x[:, :, :, :170]

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        z = self.encoder(x)
        z_flat = self.flatten(z)
        mu = self.fc_mu(z_flat)
        logvar = self.fc_logvar(z_flat)
        latent = self.reparameterize(mu, logvar)

        dec_input = self.decoder_input(latent).view(-1, 128, 25, 22)
        x_recon = self.decoder(dec_input)
        return self.crop(x_recon), mu, logvar

    def loss_function(self, recon_x, x, mu, logvar):
        recon_loss = F.mse_loss(recon_x, x, reduction='mean')
        kld = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
        return recon_loss + kld * 0.001  # Вес KLD можно менять


In [15]:
def show_faces(original, reconstructed, n=5):
    # original = denormalize(original)
    # reconstructed = denormalize(reconstructed)

    fig, axes = plt.subplots(2, n, figsize=(15, 5))
    for i in range(n):
        axes[0, i].imshow(original[i].permute(1, 2, 0).clip(0, 1))
        axes[0, i].axis('off')
        axes[1, i].imshow(reconstructed[i].permute(1, 2, 0).clip(0, 1))
        axes[1, i].axis('off')
    axes[0, 0].set_ylabel('Оригинал', fontsize=14)
    axes[1, 0].set_ylabel('После A.E.', fontsize=14)
    plt.tight_layout()
    plt.show()

In [16]:
image_dir = r'C:\Users\admin\.cache\kagglehub\datasets\jessicali9530\celeba-dataset\versions\2\img_align_celeba\img_align_celeba'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

transform = transforms.Compose([
    transforms.Resize((200, 170)),
    transforms.ToTensor()
])

# mtcnn = MTCNN(image_size=160, margin=0, device=device)
autoencoder = SimpleAutoencoder().to(device)
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

# Выбираем первые N лиц
N = 100
images = os.listdir(image_dir)[:N]
faces = []

In [17]:
for img_name in tqdm(images):  # images — это список имён файлов
    img_path = os.path.join(image_dir, img_name)
    img = Image.open(img_path).convert('RGB')
    tensor_img = transform(img)
    faces.append(tensor_img)

faces = torch.stack(faces).to(device)


100%|██████████| 100/100 [00:00<00:00, 1128.61it/s]


In [18]:
mass_loss = []
# Тренируем автоэнкодер
for epoch in range(10):
    for img in faces:
        img = img.unsqueeze(0)
        autoencoder.train()
        output,_ = autoencoder(img)
        loss = loss_fn(output, img)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        mass_loss.append(loss.item())
    print(f"Эпоха {epoch+1}, Потери: {np.median(mass_loss):.4f}")

Эпоха 1, Потери: 0.0646
Эпоха 2, Потери: 0.0571
Эпоха 3, Потери: 0.0495
Эпоха 4, Потери: 0.0454
Эпоха 5, Потери: 0.0427
Эпоха 6, Потери: 0.0399
Эпоха 7, Потери: 0.0376
Эпоха 8, Потери: 0.0352
Эпоха 9, Потери: 0.0336
Эпоха 10, Потери: 0.0325


In [26]:
autoencoder.eval()
out,latent = autoencoder(faces)
reconstructed = out.detach().cpu()

show_faces(faces.cpu(), reconstructed)


In [32]:
latent = latent.detach().cpu().numpy()

In [35]:
latent.shape

(100, 128)