In [None]:
import random
import time
import numpy as np
import pandas as pd
from deap import base, creator, tools, algorithms
from sklearn.model_selection import cross_val_score
import lightgbm as lgb
from tqdm import tqdm
import gc
from pyDOE import lhs
import time

# Load the dataset and split it into the feature matrix X and the target y.
# The two commented-out lines below are kept for reference; presumably they
# show how the 300-row sample file was produced from a larger dataset.
# data = data.sample(n=300, random_state=42)
# data.to_csv('houston_short_300.csv', index=False)
FEATURE_COLUMNS = [
    'Latitude',
    'Longitude',
    'Year Built',
    'Beds',
    'Baths',
    'buildingSize',
    'lotSize',
    'PostalCode',
]
TARGET_COLUMN = 'Price'

data = pd.read_csv('houston_short_300.csv')
X = data[FEATURE_COLUMNS]
y = data[TARGET_COLUMN]

In [None]:
import gc
import os
import random
import time

import lightgbm as lgb
import numpy as np
import pandas as pd
import pygad
from sklearn.model_selection import cross_val_score
from tqdm import tqdm

# Helper that clamps gene values to their allowed ranges (meant for use after mutation)
def check_bounds(individual, param_ranges):
    """Clip every gene of ``individual`` into its allowed range, in place.

    Args:
        individual: Mutable, index-assignable sequence of gene values
            (e.g. a list or a 1-D NumPy array).
        param_ranges: Sequence indexable by gene position; element ``i``
            holds the lower bound at index 0 and the upper bound at index 1.

    Returns:
        The same ``individual`` object, after clipping.
    """
    for position, gene in enumerate(individual):
        bounds = param_ranges[position]
        individual[position] = np.clip(gene, bounds[0], bounds[1])
    return individual

# Fitness function: scores one LightGBM configuration by cross-validated MSE
def lgbm_function(solution, solution_idx):
    """Return the PyGAD fitness (negative mean MSE) of one candidate solution.

    The genes are read positionally: n_estimators, max_depth, num_leaves,
    learning_rate, max_bin, colsample_bytree. Genes 0, 1, 2 and 4 are
    truncated to ``int``; the other two are passed through unchanged.

    The model is evaluated on the module-level ``X`` and ``y`` with 3-fold
    cross-validation.

    NOTE(review): recent PyGAD releases (2.20.0 and later, as far as I know)
    expect fitness functions to take ``(ga_instance, solution, solution_idx)``;
    confirm which PyGAD version this notebook targets.

    Args:
        solution: Indexable sequence holding at least six gene values.
        solution_idx: Index of the solution in the population (unused).

    Returns:
        The mean of the ``neg_mean_squared_error`` fold scores, i.e. -MSE.
        PyGAD maximises fitness, so a higher value means a lower MSE.
    """
    regressor = lgb.LGBMRegressor(
        n_estimators=int(solution[0]),
        max_depth=int(solution[1]),
        num_leaves=int(solution[2]),
        learning_rate=solution[3],
        max_bin=int(solution[4]),
        colsample_bytree=solution[5],
    )
    fold_scores = cross_val_score(
        regressor, X, y, cv=3, scoring='neg_mean_squared_error'
    )
    # The scorer already yields -MSE per fold, which is exactly the value
    # PyGAD should maximise, so the mean is returned as is.
    return np.mean(fold_scores)

# Random sampling of the genetic-algorithm settings
def generate_random_hyperparameters():
    """Draw one random PyGAD configuration.

    The values are drawn from the ``random`` module in a fixed order, so the
    result is reproducible for a given ``random.seed``.

    Returns:
        A dict with the keys ``sol_per_pop``, ``num_parents_mating``,
        ``mutation_percent_genes``, ``crossover_type``, ``mutation_type``,
        ``mutation_by_replacement``, ``random_mutation_min_val`` and
        ``random_mutation_max_val``.
    """
    # Keep the draw order stable: it determines the values under a fixed seed.
    population_size = random.randint(100, 500)
    mating_parents = random.randint(2, 100)
    mutation_percent = random.uniform(1, 20)
    crossover = random.choice(["single_point", "two_points", "uniform"])
    mutation = random.choice(["random", "adaptive"])
    replace_on_mutation = random.choice([True, False])
    mutation_low = random.uniform(-0.1, 0.0)
    mutation_high = random.uniform(0.0, 0.1)

    return dict(
        sol_per_pop=population_size,
        num_parents_mating=mating_parents,
        mutation_percent_genes=mutation_percent,
        crossover_type=crossover,
        mutation_type=mutation,
        mutation_by_replacement=replace_on_mutation,
        random_mutation_min_val=mutation_low,
        random_mutation_max_val=mutation_high,
    )

# Callback invoked by PyGAD after every generation
def on_generation(ga_instance):
    """Print the number of completed generations and the current best fitness.

    ``best_solution()`` called without ``pop_fitness`` recomputes the fitness
    of the whole population, which here means a cross-validated LightGBM fit
    per individual. The fitness values PyGAD has already stored in
    ``last_generation_fitness`` are reused instead. If the attribute is
    missing, ``None`` is passed, which is the default and falls back to
    recomputation.

    Args:
        ga_instance: The running GA object; must provide ``best_solution``
            and ``generations_completed``.
    """
    cached_fitness = getattr(ga_instance, "last_generation_fitness", None)
    current_best_fitness = ga_instance.best_solution(pop_fitness=cached_fitness)[1]
    print(f"Поколение: {ga_instance.generations_completed}, Лучшая пригодность: {current_best_fitness}")

# Build and run one PyGAD search over the LightGBM hyperparameter space
def run_pygad_optimizer(param_ranges, hyperparams):
    """Run one PyGAD optimisation and return its best solution.

    Args:
        param_ranges: Sequence of ``(lower, upper)`` bounds, one per gene.
        hyperparams: Dict of GA settings with the keys ``sol_per_pop``,
            ``num_parents_mating``, ``mutation_percent_genes``,
            ``crossover_type``, ``mutation_type``,
            ``mutation_by_replacement``, ``random_mutation_min_val`` and
            ``random_mutation_max_val``.

    Returns:
        The result of ``ga_instance.best_solution()``. Per the PyGAD
        documentation this is a 3-tuple: solution, fitness, index.
    """
    lower_bounds = [bounds[0] for bounds in param_ranges]
    upper_bounds = [bounds[1] for bounds in param_ranges]

    # Initial population: sampled uniformly inside the bounds of each gene
    initial_population = np.random.uniform(
        lower_bounds,
        upper_bounds,
        (hyperparams['sol_per_pop'], len(param_ranges)),
    )

    # Bounds check on the sampled population
    initial_population = [check_bounds(ind, param_ranges) for ind in initial_population]

    def clip_offspring(ga_instance, offspring_mutation):
        # Mutation shifts or replaces genes without regard to param_ranges,
        # which can yield values LightGBM rejects (e.g. colsample_bytree > 1
        # or learning_rate <= 0). Clamp the mutated offspring in place
        # before they enter the population.
        for individual in offspring_mutation:
            check_bounds(individual, param_ranges)

    # Adaptive mutation in PyGAD expects two rates (one for low-fitness and
    # one for high-fitness solutions). A single number is expanded to a pair
    # so that hyperparameter sets with mutation_type='adaptive' are valid.
    mutation_percent_genes = hyperparams['mutation_percent_genes']
    if hyperparams['mutation_type'] == 'adaptive' and np.isscalar(mutation_percent_genes):
        mutation_percent_genes = (mutation_percent_genes, mutation_percent_genes / 2)

    ga_instance = pygad.GA(
        num_generations=50,
        sol_per_pop=hyperparams['sol_per_pop'],
        num_parents_mating=hyperparams['num_parents_mating'],
        fitness_func=lgbm_function,
        initial_population=initial_population,
        mutation_percent_genes=mutation_percent_genes,
        crossover_type=hyperparams['crossover_type'],
        mutation_type=hyperparams['mutation_type'],
        mutation_by_replacement=hyperparams['mutation_by_replacement'],
        random_mutation_min_val=hyperparams['random_mutation_min_val'],
        random_mutation_max_val=hyperparams['random_mutation_max_val'],
        on_mutation=clip_offspring,
        on_generation=on_generation
    )

    # Run the optimisation
    ga_instance.run()
    return ga_instance.best_solution()

# Outer and inner loops: every outer run draws one random GA configuration,
# every inner run repeats the optimisation with that configuration.
num_outer_runs = 20  # number of outer runs
num_inner_runs = 5   # number of inner runs

# LightGBM search space: one (lower, upper) pair per gene, in the order in
# which lgbm_function reads the genes.
param_ranges = [
    (50, 2000),        # n_estimators
    (1, 100),          # max_depth
    (2, 1000),         # num_leaves
    (0.01, 0.3),       # learning_rate
    (10, 1000),        # max_bin
    (0.5, 1.0)         # colsample_bytree
]

results_csv = 'pygad_results.csv'
summary_results = []

for outer_run in tqdm(range(num_outer_runs), desc="Внешние прогоны"):
    print(f"\n=== Внешний прогон {outer_run + 1} ===\n")

    # Draw a random set of GA hyperparameters for this outer run
    hyperparams = generate_random_hyperparameters()
    results = []

    for inner_run in range(num_inner_runs):
        # The optimiser returns PyGAD's best_solution() result, which has
        # three items (solution, fitness, index); only the first two are
        # needed here, so slice before unpacking.
        best_solution, best_fitness = run_pygad_optimizer(param_ranges, hyperparams)[:2]

        run_data = {
            'Best Fitness': best_fitness,
            'Best Solution': best_solution
        }
        results.append(run_data)

    # Aggregate the results of the inner runs
    df_results = pd.DataFrame(results)
    median_fitness = df_results['Best Fitness'].median()
    # Per-gene median across the inner runs. Taking the median of each
    # solution vector on its own would mix unrelated hyperparameters.
    best_solution_median = np.median(
        np.vstack(df_results['Best Solution'].tolist()), axis=0
    ).tolist()

    summary_row = {
        # Fitness is -MSE, so flip the sign to report an actual MSE.
        'Медиана MSE': -median_fitness,
        'Лучшая конфигурация (медиана)': best_solution_median,
        'Population Size': hyperparams["sol_per_pop"],
        'Mutation Percent Genes': hyperparams["mutation_percent_genes"],
        'Crossover Type': hyperparams["crossover_type"],
        'Mutation Type': hyperparams["mutation_type"]
    }
    summary_results.append(summary_row)

    # Append this outer run to the CSV right away so that partial results
    # survive an interruption; write the header only when creating the file.
    df_summary = pd.DataFrame([summary_row])
    df_summary.to_csv(results_csv, mode='a', header=not os.path.exists(results_csv), index=False)

    gc.collect()

# Final table with the results of all outer runs
df_summary = pd.DataFrame(summary_results)
print("\nИтоговая таблица с медианными значениями и гиперпараметрами:")
print(df_summary)
