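Сначала сгенерируем датасет, на котором будут обучаться наши модели: признаки дистанционных графов, построенных по выборкам из нормального распределения и распределения Лапласа.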

In [2]:
import sys
sys.path.append('../../')

import numpy as np
import pandas as pd
from src.graph_builders import build_distance_graph
from src.features import compute_feature
from src.simulation import simulate_sample
from tqdm import tqdm

In [None]:
# Experiment parameters.
D = 1.5  # fixed distance threshold used when building the distance graph
SAMPLE_SIZES = [25, 100, 500]  # sample sizes n for which datasets are generated
# Number of samples generated per class, keyed by sample size.
N_SAMPLES_PER_CLASS = {25: 2500, 100: 2500, 500: 100}

# Graph characteristics used as features; each name is passed to compute_feature.
FEATURES = [
    "chromatic_number",     # chromatic number
    "clique_number",        # clique number (size of the maximum clique)
    "max_independent_set",  # size of the maximum independent set
    "domination_number",    # domination number
    "clique_cover_number",  # size of the minimum clique cover
]

# Distribution parameters handed to simulate_sample.
NORMAL_PARAMS = dict(mu=0, sigma=1.0)
# NOTE(review): beta = sqrt(1/2) presumably gives the Laplace distribution unit
# variance (2 * beta**2 = 1) to match the normal one; this assumes `beta` is the
# Laplace scale parameter -- confirm against simulate_sample.
LAPLACE_PARAMS = dict(mu=0, beta=np.sqrt(0.5))

def compute_graph_features(sample: np.ndarray) -> list:
    """Return the values of all FEATURES for the distance graph of ``sample``.

    The graph is built by ``build_distance_graph`` with the module-level
    threshold ``D``; each feature is then evaluated by ``compute_feature``.
    The returned list follows the order of ``FEATURES``.
    """
    graph = build_distance_graph(sample, D)
    values = []
    for feature_name in FEATURES:
        values.append(compute_feature(graph, feature_name))
    return values



# Generate one labelled feature dataset per sample size and store it as CSV.
import os

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the (slow) generation starts.
os.makedirs("generated_data", exist_ok=True)

for n in tqdm(SAMPLE_SIZES):
    print(f"Генерация датасета для n = {n}...")

    n_per_class = N_SAMPLES_PER_CLASS[n]
    # Rows [0, n_per_class) hold class 0, rows [n_per_class, 2 * n_per_class) class 1.
    features_data = np.zeros((n_per_class * 2, len(FEATURES)))
    labels = np.zeros(n_per_class * 2)

    # Normal distribution (class 0); labels stay at their initial value 0.
    for i in range(n_per_class):
        sample = simulate_sample(n, "normal", NORMAL_PARAMS)
        features_data[i] = compute_graph_features(sample)

    # Laplace distribution (class 1).
    for i in range(n_per_class, n_per_class * 2):
        sample = simulate_sample(n, "laplace", LAPLACE_PARAMS)
        features_data[i] = compute_graph_features(sample)
        labels[i] = 1

    df = pd.DataFrame(features_data, columns=FEATURES)
    df['target'] = labels

    filename = f"generated_data/distance_graph_features_n{n}.csv"
    df.to_csv(filename, index=False)
    print(f"Сохранен датасет: {filename} (размер: {df.shape})")

Обучим несколько классификаторов, которые по признакам графа различают нормальное распределение и распределение Лапласа.

In [3]:
# Импортируем все нужное, чтобы учить классификаторы

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, confusion_matrix, classification_report
)

import time
import os

In [4]:
def load_and_prepare_data(n_size):
    """Load the feature CSV for sample size ``n_size`` and return a scaled split.

    Reads ``generated_data/distance_graph_features_n{n_size}.csv``, separates the
    ``target`` column from the feature columns, performs a stratified 80/20
    train/test split with ``random_state=42`` and standardizes the features
    with a ``StandardScaler`` fitted on the training part only.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler)``.

    Raises:
        FileNotFoundError: if the CSV file for ``n_size`` does not exist.
    """
    filename = f"generated_data/distance_graph_features_n{n_size}.csv"
    if not os.path.exists(filename):
        raise FileNotFoundError(f"Файл {filename} не найден! Сначала сгенерируйте данные.")

    dataset = pd.read_csv(filename)
    targets = dataset['target']
    features = dataset.drop(columns='target')

    split = train_test_split(
        features, targets, test_size=0.2, random_state=42, stratify=targets
    )
    train_features, test_features, y_train, y_test = split

    # Fit on the training part only so test statistics do not leak into scaling.
    scaler = StandardScaler().fit(train_features)
    train_scaled = scaler.transform(train_features)
    test_scaled = scaler.transform(test_features)

    return train_scaled, test_scaled, y_train, y_test, scaler

In [5]:
def train_and_evaluate_models(X_train, X_test, y_train, y_test, n_size):
    """Fit three classifiers and collect their test-set metrics.

    Logistic regression, a random forest and CatBoost are each fitted on
    ``(X_train, y_train)`` and evaluated on ``(X_test, y_test)``.

    Returns:
        ``pandas.DataFrame`` with one row per model and the columns
        ``Model``, ``Size`` (set to ``n_size``), ``Accuracy``, ``Precision``,
        ``Recall`` and ``F1``.
    """
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "CatBoost": CatBoostClassifier(iterations=500, learning_rate=0.05,
                                       depth=6, verbose=False, random_state=42),
    }
    # Column name -> metric function; insertion order fixes the column order.
    metric_functions = {
        'Accuracy': accuracy_score,
        'Precision': precision_score,
        'Recall': recall_score,
        'F1': f1_score,
    }

    rows = []
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        row = {'Model': model_name, 'Size': n_size}
        for metric_name, metric in metric_functions.items():
            row[metric_name] = metric(y_test, predictions)
        rows.append(row)

    return pd.DataFrame(rows)

In [7]:
def analyze_feature_importance(models, feature_names, n_size):
    """Plot normalized feature importances per model and save them as a PNG.

    Args:
        models: mapping of model name to a fitted model. Only the names
            "Logistic Regression" (uses ``|coef_[0]|``), "Random Forest"
            (uses ``feature_importances_``) and "CatBoost" (uses
            ``get_feature_importance()``) are plotted; other entries are
            skipped and leave their subplot slot empty.
        feature_names: feature names indexed like the importance vectors.
        n_size: sample size, used in the plot titles and the file name.

    The figure is written to
    ``feature_importances/feature_importance_n{n_size}.png``; the directory is
    created when it does not exist yet.
    """
    # savefig does not create missing directories.
    os.makedirs('feature_importances', exist_ok=True)

    # Keep the three-row layout for up to three models, but grow the grid
    # instead of failing when more models are passed.
    n_rows = max(3, len(models))

    plt.figure(figsize=(12, 8))
    try:
        for position, (model_name, model) in enumerate(models.items(), start=1):
            if model_name == "Logistic Regression":
                importances = np.abs(model.coef_[0])
            elif model_name == "Random Forest":
                importances = model.feature_importances_
            elif model_name == "CatBoost":
                importances = model.get_feature_importance()
            else:
                continue

            importances = np.asarray(importances, dtype=float)
            # Scale so the most important feature is 100 %; skip the scaling
            # when every importance is zero to avoid dividing by zero.
            peak = importances.max()
            if peak > 0:
                importances = 100.0 * (importances / peak)
            order = np.argsort(importances)

            plt.subplot(n_rows, 1, position)
            plt.barh(range(len(importances)), importances[order], align='center')
            plt.yticks(range(len(importances)), [feature_names[j] for j in order])
            plt.xlabel('Важность признака (%)')
            plt.title(f'{model_name} - Важность признаков (n={n_size})')

        plt.tight_layout()
        plt.savefig(f'feature_importances/feature_importance_n{n_size}.png', dpi=300)
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close()

In [8]:
# Sample sizes to analyze; the full list [25, 100, 500] was narrowed to the
# two smaller sizes.
sample_sizes = [25, 100]

all_results = []

# Feature names in the order used for the importance plots.
feature_names = [
    "chromatic_number",
    "clique_number",
    "max_independent_set",
    "domination_number",
    "clique_cover_number",
]
banner = '=' * 50

for n_size in sample_sizes:
    print(f"\n{banner}")
    print(f"Анализ для размера выборки n = {n_size}")
    print(banner)

    X_train, X_test, y_train, y_test, scaler = load_and_prepare_data(n_size)

    # Metrics table for this sample size.
    size_results = train_and_evaluate_models(X_train, X_test, y_train, y_test, n_size)
    all_results.append(size_results)

    print(f"\nРезультаты для n={n_size}:")
    print(size_results)

    # Fit a fresh set of models for the feature-importance plots.
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "CatBoost": CatBoostClassifier(iterations=500, learning_rate=0.05,
                                       depth=6, verbose=False, random_state=42),
    }
    for estimator in models.values():
        estimator.fit(X_train, y_train)
    analyze_feature_importance(models, feature_names, n_size)

final_results = pd.concat(all_results)
print("\nИтоговые результаты по всем моделям и размерам выборок:")
print(final_results)


Анализ для размера выборки n = 25

Результаты для n=25:
                 Model  Size  Accuracy  Precision  Recall        F1
0  Logistic Regression    25     0.759   0.751456   0.774  0.762562
1        Random Forest    25     0.774   0.795259   0.738  0.765560
2             CatBoost    25     0.775   0.796976   0.738  0.766355

Анализ для размера выборки n = 100

Результаты для n=100:
                 Model  Size  Accuracy  Precision  Recall        F1
0  Logistic Regression   100     0.911   0.915152   0.906  0.910553
1        Random Forest   100     0.909   0.914807   0.902  0.908359
2             CatBoost   100     0.907   0.904573   0.910  0.907278

Итоговые результаты по всем моделям и размерам выборок:
                 Model  Size  Accuracy  Precision  Recall        F1
0  Logistic Regression    25     0.759   0.751456   0.774  0.762562
1        Random Forest    25     0.774   0.795259   0.738  0.765560
2             CatBoost    25     0.775   0.796976   0.738  0.766355
0  Logistic