In [None]:
# Создайте кастомный класс датасета для работы с CSV файлами:
# - Загрузка данных из файла
# - Предобработка (нормализация, кодирование категорий)
# - Поддержка различных форматов данных (категориальные, числовые, бинарные и т.д.)

In [53]:
import torch
import pandas as pd
from torch.utils.data import Dataset
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

class CSVDataset(Dataset):
    
    def __init__(self, file_path, target_column, 
                 numeric_features=None, 
                 categorical_features=None,
                 binary_features=None):
        # Загрузка данных
        self.df = pd.read_csv(file_path)
        
        # Проверка наличия колонок
        self._validate_columns(target_column, numeric_features, 
                             categorical_features, binary_features)
        
        # Сохранение параметров
        self.target_column = target_column
        self.numeric_features = numeric_features or []
        self.categorical_features = categorical_features or []
        self.binary_features = binary_features or []
        
        # Предобработка данных
        self.preprocessor = self._create_preprocessor()
        self.X = self._preprocess_features()
        self.y = self._get_target()
        
    def _validate_columns(self, target_column, numeric_features, 
                         categorical_features, binary_features):
        #Проверка колонок
        all_columns = (numeric_features or []) + \
                     (categorical_features or []) + \
                     (binary_features or []) + \
                     [target_column]
        
        missing = set(all_columns) - set(self.df.columns)
        if missing:
            raise ValueError(f"Отсутствуют колонки: {missing}")
    
    def _create_preprocessor(self):
        transformers = []
        
        if self.numeric_features:
            transformers.append(('num', StandardScaler(), self.numeric_features))
        if self.categorical_features:
            transformers.append(('cat', OneHotEncoder(sparse=False), self.categorical_features))
        if self.binary_features:
            transformers.append(('binary', 'passthrough', self.binary_features))
            
        return ColumnTransformer(transformers, remainder='drop')
    
    def _preprocess_features(self):
        #Применение предобработки
        X_processed = self.preprocessor.fit_transform(
            self.df.drop(columns=[self.target_column])
        )
        return torch.tensor(X_processed, dtype=torch.float32)
    
    def _get_target(self):
        #Преобразование целевой в тензор
        y = self.df[self.target_column].values
        return torch.tensor(y, dtype=torch.float32).view(-1, 1)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [None]:
# Найдите csv датасеты для регрессии и бинарной классификации и, 
# применяя наработки из предыдущей части задания, обучите линейную и логистическую регрессию

In [54]:
import torch
from torch.utils.data import DataLoader, random_split
import torch.nn as nn
import torch.optim as optim
from models import LinearRegression, LogisticRegression
from utils import log_epoch
from sklearn.datasets import load_diabetes, load_breast_cancer
import pandas as pd
import os
import numpy as np

# Класс для нормализации целевой переменной
class TargetScaler:
    def __init__(self):
        self.scaler = StandardScaler()
        self.fitted = False
        
    def fit_transform(self, y):
        if y.ndim == 1:
            y = y.reshape(-1, 1)
        scaled = self.scaler.fit_transform(y)
        self.fitted = True
        return scaled.flatten()
    
    def inverse_transform(self, y):
        if y.ndim == 1:
            y = y.reshape(-1, 1)
        return self.scaler.inverse_transform(y).flatten()

def save_datasets():
    os.makedirs('data', exist_ok=True)

    # Регрессия
    diabetes = load_diabetes()
    df_d = pd.DataFrame(diabetes['data'], columns=diabetes['feature_names'])
    df_d['target'] = diabetes['target']
    df_d.to_csv('data/diabetes.csv', index=False)

    # Классификация
    cancer = load_breast_cancer()
    df_c = pd.DataFrame(cancer['data'], columns=cancer['feature_names'])
    df_c['target'] = cancer['target']
    df_c.to_csv('data/breast_cancer.csv', index=False)

def evaluate_regression(model, test_loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            output = model(X_batch)
            loss = criterion(output, y_batch)
            total_loss += loss.item()
    return total_loss / len(test_loader)


def evaluate_classification(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            outputs = model(X_batch)
            predicted = (torch.sigmoid(outputs) > 0.5).float()
            total += y_batch.size(0)
            correct += (predicted == y_batch).sum().item()
    return 100 * correct / total


def train_model(model, train_loader, criterion, optimizer, epochs=100):
    model.train()
    for epoch in range(1, epochs + 1):
        total_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        avg_loss = total_loss / len(train_loader)
        if epoch % 10 == 0 or epoch == epochs:
            log_epoch(epoch, avg_loss)

def run_regression():
    csv_path = 'data/diabetes.csv'
    numeric_features = [
        'age', 'sex', 'bmi', 'bp', 
        's1', 's2', 's3', 's4', 's5', 's6'
    ]
    
    dataset = CSVDataset(
        file_path=csv_path,
        target_column='target',
        numeric_features=numeric_features
    )
    
    # Нормализация таргета
    target_scaler = TargetScaler()
    y = dataset.df['target'].values
    y_scaled = target_scaler.fit_transform(y)
    dataset.y = torch.tensor(y_scaled, dtype=torch.float32).view(-1, 1)
    
    # Разделение данных
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
    
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    model = LinearRegression(in_features=dataset.X.shape[1])
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    
    print("Обучение линейной регрессии:")
    train_model(model, train_loader, criterion, optimizer, epochs=100)
    
    test_mse = evaluate_regression(model, test_loader, criterion)
    print(f"\nОценка модели на тестовых данных:")
    print(f"Test MSE: {test_mse:.4f}")
    print(f"Test RMSE: {np.sqrt(test_mse):.4f}")

def run_classification():
    csv_path = 'data/breast_cancer.csv'
    numeric_features=['mean radius', 'mean texture', 'mean perimeter', 'mean area','mean smoothness','mean compactness',
                          'mean concavity','mean concave points','mean symmetry','mean fractal dimension',
                          'radius error','texture error','perimeter error','area error','smoothness error',
                          'compactness error','concavity error','concave points error','symmetry error',
                          'fractal dimension error','worst radius','worst texture','worst perimeter',
                          'worst area','worst smoothness','worst compactness','worst concavity','worst concave points',
                          'worst symmetry','worst fractal dimension']
    
    dataset = CSVDataset(
        file_path=csv_path,
        target_column='target',
        numeric_features=numeric_features
    )
    
    # Разделение данных
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
    
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    model = LogisticRegression(in_features=dataset.X.shape[1], n_classes=1)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    print("\nОбучение логистической регрессии:")
    train_model(model, train_loader, criterion, optimizer, epochs=100)
    
    test_acc = evaluate_classification(model, test_loader)
    print(f"\nОценка модели на тестовых данных:")
    print(f"Test Accuracy: {test_acc:.2f}%")

if __name__ == '__main__':
    save_datasets()
    run_regression()
    run_classification()

Обучение линейной регрессии:
Epoch 10: loss=0.5029
Epoch 20: loss=0.5160
Epoch 30: loss=0.4647
Epoch 40: loss=0.4962
Epoch 50: loss=0.6389
Epoch 60: loss=0.5437
Epoch 70: loss=0.4576
Epoch 80: loss=0.4754
Epoch 90: loss=0.6346
Epoch 100: loss=0.4894

Оценка модели на тестовых данных:
Test MSE: 0.4850
Test RMSE: 0.6964

Обучение логистической регрессии:
Epoch 10: loss=0.0686
Epoch 20: loss=0.0516
Epoch 30: loss=0.0412
Epoch 40: loss=0.0388
Epoch 50: loss=0.0337
Epoch 60: loss=0.0336
Epoch 70: loss=0.0304
Epoch 80: loss=0.0271
Epoch 90: loss=0.0294
Epoch 100: loss=0.0275

Оценка модели на тестовых данных:
Test Accuracy: 95.61%
