In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
SEED = 2020
import numpy as np
import random
random.seed(SEED)
np.random.seed(SEED)

In [2]:
df = pd.read_csv('houston_clean.csv')
df.sample()

Unnamed: 0,Latitude,Longitude,Year Built,Beds,Baths,buildingSize,lotSize,PostalCode,Price
4467,29.744803,-95.555954,1975.0,3.0,3.0,2070.0,1407.0,77042,2150.0


In [3]:
# Выделяем признаки и целевую переменную
# Выбираем 1000 случайных объектов из датасета
# df = df.sample(n=2500, random_state=42)  # random_state для воспроизводимости

X = df.drop(columns=['Price'])
y = df['Price']
y_log = np.log1p(y)

In [13]:
import numpy as np
import pygad
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# Функция для оценки модели
def fitness_function(ga_instance, solution, solution_idx):
    # Приведение параметров к целым числам, где это необходимо
    max_depth = int(solution[0])
    learning_rate = solution[1]
    num_leaves = int(solution[2])
    max_bin = int(solution[3])
    n_estimators = int(solution[4])
    colsample_bytree = solution[5]

    params = {
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'num_leaves': num_leaves,
        'max_bin': max_bin,
        'n_estimators': n_estimators,
        'colsample_bytree': colsample_bytree
    }

    # Модель LightGBM
    model = LGBMRegressor(**params)

    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    mse_scores = []

    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        mse_scores.append(mse)

    mean_mse = np.mean(mse_scores)
    
    # Отрицательное значение для минимизации
    return -mean_mse

# Настройка параметров и их диапазонов
param_ranges = {
    "max_depth": (-1, 120),
    "learning_rate": (0.05, 0.51),
    "num_leaves": (10, 1000),
    "max_bin": (10, 1000),
    "n_estimators": (100, 1100),
    "colsample_bytree": (0.5, 1.0),
}

# Преобразуем диапазоны параметров в список для использования в pygad
gene_space = [
    {'low': param_ranges['max_depth'][0], 'high': param_ranges['max_depth'][1], 'step': 1},
    {'low': param_ranges['learning_rate'][0], 'high': param_ranges['learning_rate'][1]},
    {'low': param_ranges['num_leaves'][0], 'high': param_ranges['num_leaves'][1], 'step': 5},
    {'low': param_ranges['max_bin'][0], 'high': param_ranges['max_bin'][1], 'step': 5},
    {'low': param_ranges['n_estimators'][0], 'high': param_ranges['n_estimators'][1], 'step': 5},
    {'low': param_ranges['colsample_bytree'][0], 'high': param_ranges['colsample_bytree'][1]}
]

# Настройка параметров для pygad
num_generations = 100  # Количество поколений
num_parents_mating = 10  # Количество родителей на поколение
sol_per_pop = 20  # Размер популяции
num_genes = len(param_ranges)

# Создание экземпляра GA с использованием pygad
ga_instance = pygad.GA(
    num_generations=num_generations,
    num_parents_mating=num_parents_mating,
    fitness_func=fitness_function,
    sol_per_pop=sol_per_pop,
    num_genes=num_genes,
    gene_space=gene_space,
    mutation_percent_genes=10,  # Процент мутаций
    crossover_type="single_point",  # Тип кроссинговера
    mutation_type="random",  # Тип мутации
)

# Запуск оптимизации
ga_instance.run()

# Получение результатов
best_solution, best_solution_fitness, _ = ga_instance.best_solution()
print(f"Лучшие параметры: {best_solution}")
print(f"Лучший MSE: {-best_solution_fitness}")  # Печатаем MSE, инвертируя отрицание


If you do not want to mutate any gene, please set mutation_type=None.


Лучшие параметры: [2.70000000e+01 6.39502095e-02 2.50000000e+01 2.40000000e+02
 1.06000000e+03 6.56464509e-01]
Лучший MSE: 152225.89616708414


In [6]:
# финальный от 14/09/24

import random
from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict, KFold
from lightgbm import LGBMRegressor
import numpy as np
from scipy.stats import qmc

class EvolutionaryOptimizer:
    def __init__(self, param_ranges):
        """
        Инициализация оптимизатора.

        param_ranges: словарь с диапазонами параметров для оптимизации
        """
        self.param_ranges = param_ranges
        self.evaluated_params = set()  # Для хранения уже оцененных параметров
        self.cache = {}  # Для кеширования результатов

    def initialize_population(self, size, strategy='random'):
        """
        Инициализация популяции индивидуумами в зависимости от выбранной стратегии.

        strategy: 'random' для случайной инициализации, 'latin_hypercube' для латинского гиперкуба, 'sobol' для Sobol последовательности
        """
        if strategy == 'random':
            return self._initialize_random_population(size)
        elif strategy == 'latin_hypercube':
            return self._initialize_latin_hypercube_population(size)
        elif strategy == 'sobol':
            # Проверка и округление до ближайшей степени двойки
            if (size & (size - 1)) != 0:
                size = 2**int(np.ceil(np.log2(size)))
            return self._initialize_sobol_population(size)
        else:
            raise ValueError("Unknown initialization strategy")

    def _initialize_random_population(self, size):
        """
        Инициализация случайной популяции.
        """
        population = []
        while len(population) < size:
            individual = self.generate_individual()
            if tuple(sorted(individual.items())) not in self.evaluated_params:
                population.append(individual)
                self.evaluated_params.add(tuple(sorted(individual.items())))
        return population

    def _initialize_latin_hypercube_population(self, size):
        """
        Инициализация популяции с использованием латинского гиперкуба.
        """
        sampler = qmc.LatinHypercube(d=len(self.param_ranges))
        sample = sampler.random(n=size)
        sample = qmc.scale(sample, [v[0] for v in self.param_ranges.values()], [v[1] for v in self.param_ranges.values()])
        population = [self._sample_to_individual(s) for s in sample]
        return population

    def _initialize_sobol_population(self, size):
        """
        Инициализация популяции с использованием последовательности Sobol.
        """
        sampler = qmc.Sobol(d=len(self.param_ranges))
        sample = sampler.random(n=size)
        sample = qmc.scale(sample, [v[0] for v in self.param_ranges.values()], [v[1] for v in self.param_ranges.values()])
        population = [self._sample_to_individual(s) for s in sample]
        return population

    def _sample_to_individual(self, sample):
        """
        Преобразование выборки из пространства гиперпараметров в индивидуума.
        """
        individual = {}
        for i, param in enumerate(self.param_ranges):
            value = sample[i]
            if isinstance(self.param_ranges[param][0], int):
                individual[param] = int(value)
            else:
                individual[param] = value
        return individual

    def generate_individual(self):
        """
        Генерация случайного индивидуума на основе заданных диапазонов параметров.
        """
        return {param: (random.randint(self.param_ranges[param][0], self.param_ranges[param][1]) if isinstance(self.param_ranges[param][0], int)
                else random.uniform(self.param_ranges[param][0], self.param_ranges[param][1]))
                for param in self.param_ranges}

    def encode(self, individual):
        """
        Бинарное кодирование индивидуума.
        """
        binary_individual = {}
        for param, value in individual.items():
            if isinstance(value, int):
                binary_individual[param] = bin(value)[2:].zfill(10)
            else:
                binary_value = bin(int(value * (2**10 - 1)))[2:].zfill(10)
                binary_individual[param] = binary_value
        return binary_individual

    def decode(self, binary_individual):
        """
        Декодирование бинарного индивидуума в обычный вид.
        """
        individual = {}
        for param, binary_value in binary_individual.items():
            if isinstance(self.param_ranges[param][0], int):
                individual[param] = int(binary_value, 2)
            else:
                individual[param] = int(binary_value, 2) / (2**10 - 1)
        return individual

    def evaluate_params(self, params, X, y, seed, n_splits):
        """
        Оценка производительности модели с заданными параметрами.
        """
        key = tuple(sorted(params.items()))
        if key in self.cache:
            return self.cache[key]

        model = LGBMRegressor(**params, random_state=seed)
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
        y_pred = cross_val_predict(model, X, y, cv=kf, n_jobs=-1)
        mse = mean_squared_error(y, y_pred)  

        self.cache[key] = mse
        return mse

    def select_best_individuals(self, population, scores, num_best, method):
        """
        Выбор лучших индивидуумов в популяции на основе их оценок.
        """
        if method == 'proportional':
            return self._proportional_selection(population, scores, num_best)
        elif method == 'tournament':
            return self._tournament_selection(population, scores, num_best)
        elif method == 'elite':
            return self._elite_selection(population, scores, num_best)
        else:
            raise ValueError("Unknown selection method")

    def _proportional_selection(self, population, scores, num_best):
        """
        Пропорциональный отбор.
        """
        total_score = sum(scores)
        selection_probs = [score / total_score for score in scores]
        selected_indices = np.random.choice(range(len(population)), size=num_best, p=selection_probs)
        return [population[i] for i in selected_indices]

    def _tournament_selection(self, population, scores, num_best, tournament_size=3):
        """
        Турнирный отбор.
        """
        selected_individuals = []
        for _ in range(num_best):
            tournament_indices = np.random.choice(range(len(population)), size=tournament_size, replace=False)
            tournament_scores = [scores[i] for i in tournament_indices]
            winner_index = tournament_indices[np.argmin(tournament_scores)]
            selected_individuals.append(population[winner_index])
        return selected_individuals

    def _elite_selection(self, population, scores, num_best):
        """
        Элитный отбор.
        """
        best_indices = np.argsort(scores)[:num_best]
        return [population[i] for i in best_indices]

    def crossover(self, parent1, parent2):
        """
        Кроссинговер двух родительских индивидуумов для создания нового потомка.
        """
        return {param: (parent1[param] if random.random() > 0.5 else parent2[param]) for param in self.param_ranges}

    def mutate(self, individual, mutation_rate, mutation_ranges, mutation_percent_genes, mutation_by_replacement, mutation_strategy='uniform', encoding='real'):
        """
        Мутация индивидуума.
        mutation_percent_genes: процент генов, которые будут мутированы.
        mutation_by_replacement: если True, мутируемые гены будут полностью заменены новыми значениями.
        mutation_strategy: 
            'uniform' - равномерное распределение
            'gaussian' - нормальное распределение
        """
        if encoding == 'binary':
            individual = self.decode(individual)

        num_genes_to_mutate = int(len(self.param_ranges) * mutation_percent_genes)  # Определяем количество генов для мутации
        genes_to_mutate = random.sample(list(self.param_ranges.keys()), num_genes_to_mutate)  # Случайно выбираем гены для мутации

        for param in genes_to_mutate:
            if random.random() < mutation_rate:
                if mutation_by_replacement:
                    # Полная замена гена
                    individual[param] = random.randint(self.param_ranges[param][0], self.param_ranges[param][1]) if isinstance(self.param_ranges[param][0], int) else random.uniform(self.param_ranges[param][0], self.param_ranges[param][1])
                else:
                    # Мутация без полной замены
                    if mutation_strategy == 'uniform':
                        individual[param] = random.uniform(self.param_ranges[param][0], self.param_ranges[param][1]) if not isinstance(self.param_ranges[param][0], int) else random.randint(self.param_ranges[param][0], self.param_ranges[param][1])
                    elif mutation_strategy == 'gaussian':
                        individual[param] = np.clip(individual[param] + np.random.normal(0, mutation_ranges[param]), self.param_ranges[param][0], self.param_ranges[param][1])
                        if isinstance(self.param_ranges[param][0], int):
                            individual[param] = int(individual[param])

        if encoding == 'binary':
            individual = self.encode(individual)

        return individual

    def panmixia(self, population):
        """
        Панмектический выбор родителей.
        """
        return random.sample(population, 2)

    def outbreeding(self, population):
        """
        Аутбридинговый выбор родителей на основе максимального различия генотипов.
        """
        max_diff = -1
        parent1, parent2 = None, None
        for i in range(len(population)):
            for j in range(i + 1, len(population)):
                diff = sum(a != b for a, b in zip(population[i].values(), population[j].values()))
                if diff > max_diff:
                    max_diff = diff
                    parent1, parent2 = population[i], population[j]
        return parent1, parent2

    def optimize(self, X, y, settings):
        """
        Основной метод оптимизации.
        """
        population_size = settings['population_size']
        generations = settings['generations']
        elite_fraction = settings['elite_fraction']
        mutation_rate = settings['mutation_rate']
        mutation_range_factor = settings['mutation_range_factor']
        mutation_percent_genes = settings.get('mutation_percent_genes', 0.2)  # Добавили как гиперпараметр
        mutation_by_replacement = settings.get('mutation_by_replacement', True)  # Добавили как гиперпараметр
        max_no_improvement = settings['max_no_improvement']
        n_splits = settings['n_splits']
        seed = settings['seed']
        mutation_strategy = settings['mutation_strategy']
        init_strategy = settings['init_strategy']
        encoding = settings.get('encoding', 'real')  # 'real' или 'binary'
        mating_strategy = settings.get('mating_strategy', 'panmixia')  # 'panmixia' или 'outbreeding'
        selection_method = settings.get('selection_method', 'elite')  # Метод отбора ('proportional', 'tournament', 'elite')

        population = self.initialize_population(population_size, strategy=init_strategy)
        if encoding == 'binary':
            population = [self.encode(individual) for individual in population]

        best_mse = float('inf')
        best_params = None
        no_improvement_count = 0
        elite_size = int(population_size * elite_fraction)
        mutation_ranges = {param: (self.param_ranges[param][1] - self.param_ranges[param][0]) * mutation_range_factor for param in self.param_ranges}

        for generation in range(generations):
            with ThreadPoolExecutor() as executor:
                if encoding == 'binary':
                    decoded_population = [self.decode(individual) for individual in population]
                else:
                    decoded_population = population
                scores = list(executor.map(lambda ind: self.evaluate_params(ind, X, y, seed, n_splits), decoded_population))

            best_individuals = self.select_best_individuals(population, scores, population_size // 2, selection_method)

            min_score = min(scores)
            if min_score < best_mse:
                best_mse = min_score
                best_params = decoded_population[scores.index(min_score)]
                no_improvement_count = 0
            else:
                no_improvement_count += 1

            if no_improvement_count >= max_no_improvement:
                print(f'Алгоритм остановился после {generation+1} поколений из-за отсутствия улучшений')
                break

            # Добавление случайных новых индивидуумов
            new_population = best_individuals[:elite_size]
            while len(new_population) < population_size:
                if mating_strategy == 'panmixia':
                    parent1, parent2 = self.panmixia(best_individuals)
                elif mating_strategy == 'outbreeding':
                    parent1, parent2 = self.outbreeding(best_individuals)
                else:
                    raise ValueError("Unknown mating strategy")
                child = self.crossover(parent1, parent2)
                child = self.mutate(child, mutation_rate, mutation_ranges, mutation_percent_genes, mutation_by_replacement, mutation_strategy, encoding)
                if tuple(sorted(child.items())) not in self.evaluated_params:
                    new_population.append(child)
                    self.evaluated_params.add(tuple(sorted(child.items())))

            # Добавление случайных индивидуумов
            while len(new_population) < population_size:
                individual = self.generate_individual()
                if encoding == 'binary':
                    individual = self.encode(individual)
                if tuple(sorted(individual.items())) not in self.evaluated_params:
                    new_population.append(individual)
                    self.evaluated_params.add(tuple(sorted(individual.items())))

            # Динамическое изменение скорости мутации
            mutation_rate = max(0.1, mutation_rate * 0.95)

            population = new_population
            print(f'Generation {generation+1}, Best MSE: {best_mse}, Best Params: {best_params}, No Improvement Count: {no_improvement_count}')


        return best_params, best_mse

# Пример использования
param_ranges = {
    'n_estimators': (50, 2000),
    'max_depth': (1, 100),
    'num_leaves': (2, 1000),
    'learning_rate': (0.01, 0.3),
    'max_bin': (10, 1000),
    'colsample_bytree': (0.1, 1.0)
}


settings = {
    'population_size': 25,  # Размер популяции
    'generations': 50,  # Количество поколений
    'elite_fraction': 0.15,  # Доля элиты
    'mutation_rate': 0.015,  # Скорость мутации
    'mutation_percent_genes': 0.2,  # 20% генов подвергаются мутации
    'mutation_by_replacement': True,  # Полная замена генов
    'mutation_range_factor': 0.3,  # Фактор диапазона мутаций
    'max_no_improvement':10,  # Максимальное количество поколений без улучшений
    'n_splits': 3,  # Количество разбиений в кросс-валидации
    'seed': 42,  # Начальное значение для генератора случайных чисел
    'mutation_strategy': 'gaussian',  # Стратегия мутации gaussian/uniform
    'init_strategy': 'latin_hypercube',  # Стратегия инициализации начальной популяции ('random', 'latin_hypercube', 'sobol')
    'encoding': 'real',  # Способ кодирования: 'real' или 'binary'
    'mating_strategy': 'panmixia',  # Стратегия спаривания: 'panmixia' или 'outbreeding'
    'selection_method': 'elite'  # Метод отбора ('proportional', 'tournament', 'elite')
}


optimizer = EvolutionaryOptimizer(param_ranges)
best_params, best_mse = optimizer.optimize(X, y, settings)
print(f'Лучшие параметры: {best_params} с MSE: {best_mse}')

Generation 1, Best MSE: 159821.65974883208, Best Params: {'n_estimators': 876, 'max_depth': 93, 'num_leaves': 18, 'learning_rate': 0.14367375155224327, 'max_bin': 347, 'colsample_bytree': 0.7944537712230322}, No Improvement Count: 0
Generation 2, Best MSE: 156723.057918205, Best Params: {'n_estimators': 876, 'max_depth': 47, 'num_leaves': 18, 'learning_rate': 0.14367375155224327, 'max_bin': 477, 'colsample_bytree': 0.44126311581252464}, No Improvement Count: 0
Generation 3, Best MSE: 153831.57812231092, Best Params: {'n_estimators': 876, 'max_depth': 11, 'num_leaves': 18, 'learning_rate': 0.13576527617377898, 'max_bin': 163, 'colsample_bytree': 0.580924670412316}, No Improvement Count: 0
Generation 4, Best MSE: 153831.57812231092, Best Params: {'n_estimators': 876, 'max_depth': 11, 'num_leaves': 18, 'learning_rate': 0.13576527617377898, 'max_bin': 163, 'colsample_bytree': 0.580924670412316}, No Improvement Count: 1
Generation 5, Best MSE: 151378.08600539062, Best Params: {'n_estimators