In [1]:
import random

import numpy as np
import pandas as pd
from numba import boolean, njit

In [2]:
# Load data
data = pd.read_csv('data/family_data.csv')
submission = pd.read_csv('data/sample_submission.csv')

matrix = data[[f'choice_{i}' for i in range(10)]].to_numpy()
family_size = data['n_people'].to_numpy()

N_FAMILIES = len(family_size)
N_DAYS = 100
MAX_OCCUPANCY = 300
MIN_OCCUPANCY = 125

In [3]:
# Precompute penalties as a NumPy array
penalties_array = np.zeros((family_size.max() + 1, 11), dtype=np.float64)
for n in range(family_size.max() + 1):
    penalties_array[n] = [
        0,
        50,
        50 + 9 * n,
        100 + 9 * n,
        200 + 9 * n,
        200 + 18 * n,
        300 + 18 * n,
        300 + 36 * n,
        400 + 36 * n,
        500 + 36 * n + 199 * n,
        500 + 36 * n + 398 * n
    ]

In [4]:
# Convert matrix into choice ranking lookup:
choice_rank = -np.ones((N_FAMILIES, N_DAYS), dtype=np.int8)
for i in range(matrix.shape[0]):
    for rank, day in enumerate(matrix[i]):
        choice_rank[i, day - 1] = rank  # days are 1-based

In [5]:
@njit
def cost_function(prediction, family_size, choice_rank, penalties_array):
    daily_occupancy = np.zeros(N_DAYS, dtype=np.int32)
    penalty = 0.0

    for i in range(prediction.shape[0]):
        d = prediction[i] - 1  # adjust for 0-based index
        n = family_size[i]
        daily_occupancy[d] += n

        rank = choice_rank[i, d]
        if rank == -1:
            penalty += penalties_array[n, 10]
        else:
            penalty += penalties_array[n, rank]

    # Soft constraints
    for v in daily_occupancy:
        if v < MIN_OCCUPANCY or v > MAX_OCCUPANCY:
            penalty += 1e8

    # Accounting cost
    acc = max(0, ((daily_occupancy[N_DAYS-1] - 125.0) / 400.0) * (daily_occupancy[N_DAYS-1]**0.5))
    yesterday = daily_occupancy[N_DAYS-1]

    for i in range(N_DAYS-2, -1, -1):
        today = daily_occupancy[i]
        diff = abs(today - yesterday)
        acc += max(0, ((today - 125.0) / 400.0) * (today ** (0.5 + diff / 50.0)))
        yesterday = today

    penalty += acc
    return penalty

In [6]:
class Chromosome:
    def __init__(self, num_days, num_families):
        self.num_days = num_days
        self.num_families = num_families
        self.assigned_days = np.zeros(num_families, dtype=np.int32)
        self.daily_attendance = np.zeros(num_days, dtype=np.int32)

    def copy(self):
        new_chromo = Chromosome(self.num_days, self.num_families)
        new_chromo.assigned_days = self.assigned_days.copy()
        new_chromo.daily_attendance = self.daily_attendance.copy()
        return new_chromo

    def is_swap_valid(self, family_idx, new_day, family_size_arr) -> boolean:
        current_day = self.assigned_days[family_idx]
        family_size = family_size_arr[family_idx]

        reduced = self.daily_attendance[current_day - 1] - family_size
        increased = self.daily_attendance[new_day - 1] + family_size

        return (reduced >= MIN_OCCUPANCY and reduced <= MAX_OCCUPANCY and
                increased >= MIN_OCCUPANCY and increased <= MAX_OCCUPANCY)

    def update_attendance(self, family_idx, old_day, new_day, family_size_arr):
        self.daily_attendance[old_day - 1] -= family_size_arr[family_idx]
        self.daily_attendance[new_day - 1] += family_size_arr[family_idx]

In [7]:
cols = [f'choice_{i}' for i in range(10)]
choice_dict = data[cols].T.to_dict()
choice_dict_num = [{vv:i for i, vv in enumerate(di.values())} for di in choice_dict.values()]
family_size_dict = data[['n_people']].to_dict()['n_people']
family_size_ls = list(family_size_dict.values())

def initialize_chromosome() -> Chromosome:
    people_in_day = {day:0 for day in range(1, N_DAYS + 1)}
    chromosome = np.zeros(len(family_size_dict), dtype=np.int32)
    for i in range(len(chromosome)):
        items = list(choice_dict_num[i].keys()).copy()
        while True:
            new_number = random.randint(1, N_DAYS)
            if new_number not in items:
                items += [new_number]
                break

        weights = penalties_array[family_size_dict[i]]

        inverse_weights = []
        for j in range(len(weights)):
            c = 50
            if people_in_day[items[j]] in range(125, 301):
                c += (people_in_day[items[j]] - 125) * 1000
            elif people_in_day[items[j]] > 300:
                c = 1000000000
            inverse_weights.append(1 / (weights[j] + c))

        new_items = []
        new_inverse_weights = []
        for j in range(len(items)):
            if people_in_day[items[j]] + family_size_ls[i] <= 300:
                new_items.append(items[j])
                new_inverse_weights.append(inverse_weights[j])

        chromosome[i] = random.choices(new_items, new_inverse_weights, k=1)[0]
        people_in_day[chromosome[i]] += family_size_ls[i]

    chromosome_cls = Chromosome(N_DAYS, N_FAMILIES)
    chromosome_cls.assigned_days = chromosome

    for key, value in people_in_day.items():
        chromosome_cls.daily_attendance[key - 1] = value

    return chromosome_cls

In [8]:
def test_initialization(n_runs: int = 100):
    valid_count = 0
    costs = []

    for _ in range(n_runs):
        chromosome = initialize_chromosome()
        cost = cost_function(chromosome.assigned_days, family_size, choice_rank, penalties_array)
        costs.append(cost)

        # Check validity
        if all(MIN_OCCUPANCY <= count <= MAX_OCCUPANCY for count in chromosome.daily_attendance):
            valid_count += 1

    valid_rate = valid_count / n_runs * 100
    mean_cost = np.mean(costs)
    median_cost = np.median(costs)

    print(f"Valid Solution Rate: {valid_rate:.2f}%")
    print(f"Mean Cost: {mean_cost:.2f}")
    print(f"Median Cost: {median_cost:.2f}")

In [9]:
# test_initialization(100)

In [10]:
def selection(population, selection_size, tournament_size):
    pop_size = len(population)
    costs = np.array([cost_function(ind.assigned_days, family_size, choice_rank, penalties_array) for ind in population])
    selected = []

    for _ in range(selection_size):
        indices = np.random.choice(pop_size, tournament_size, replace=False)
        best_idx = indices[np.argmin(costs[indices])]
        selected.append(population[best_idx])

    return selected

In [11]:
def perform_swap(child1, child2, family_idx, day1, day2):
    child1.update_attendance(family_idx, day1, day2, family_size)
    child2.update_attendance(family_idx, day2, day1, family_size)
    child1.assigned_days[family_idx] = day2
    child2.assigned_days[family_idx] = day1

In [12]:
def crossover(parent1, parent2, p=1.0, allow_single_swap=False, random_order=False):
    child1 = parent1.copy()
    child2 = parent2.copy()

    indices = np.arange(len(parent1.assigned_days))
    if random_order:
        np.random.shuffle(indices)

    for family_idx in indices:
        day1 = parent1.assigned_days[family_idx]
        day2 = parent2.assigned_days[family_idx]

        valid1 = child1.is_swap_valid(family_idx, day2, family_size)
        valid2 = child2.is_swap_valid(family_idx, day1, family_size)

        if valid1 and valid2 and np.random.rand() < p:
            perform_swap(child1, child2, family_idx, day1, day2)
        elif allow_single_swap:
            if valid1 and np.random.rand() < p:
                child1.update_attendance(family_idx, day1, day2, family_size)
                child1.assigned_days[family_idx] = day2
            elif valid2 and np.random.rand() < p:
                child2.update_attendance(family_idx, day2, day1, family_size)
                child2.assigned_days[family_idx] = day1

    return child1, child2

In [13]:
def mutation(chromosome, mutation_rate=0.1):
    if np.random.rand() >= mutation_rate:
        return

    family_idx = np.random.randint(len(chromosome.assigned_days))
    current_day = chromosome.assigned_days[family_idx]

    # Random order of days (excluding current)
    possible_days = np.delete(np.arange(1, N_DAYS + 1), current_day - 1)
    np.random.shuffle(possible_days)

    for new_day in possible_days:
        if chromosome.is_swap_valid(family_idx, new_day, family_size):
            chromosome.update_attendance(family_idx, current_day, new_day, family_size)
            chromosome.assigned_days[family_idx] = new_day
            break

In [36]:
def mutation1(chromosome, mutation_rate=0.1):
    if np.random.rand() >= mutation_rate:
        return

    family_idx = np.random.randint(len(chromosome.assigned_days))
    current_day = chromosome.assigned_days[family_idx]

    # Get the family's preference mapping {day: rank}
    preference_ranks = choice_dict_num[family_idx]

    # Rank of the current assigned day (default to worst if not in prefs)
    current_rank = preference_ranks.get(current_day, 10)

    # Shuffle possible days excluding current
    possible_days = np.delete(np.arange(1, N_DAYS + 1), current_day - 1)
    np.random.shuffle(possible_days)

    for new_day in possible_days:
        new_rank = preference_ranks.get(new_day, 10)
        # Only mutate if it's valid AND strictly better in preference
        if new_rank < current_rank and chromosome.is_swap_valid(family_idx, new_day, family_size):
            chromosome.update_attendance(family_idx, current_day, new_day, family_size)
            chromosome.assigned_days[family_idx] = new_day
            break

In [14]:
def reproduction(
        mutation_func,
        parents,
        crossover_proba=1.0,
        allow_single_swap=False,
        random_order=False,
        mutation_rate=0.01
):
    next_generation = []
    for i in range(0, len(parents), 2):
        parent1, parent2 = parents[i], parents[i + 1]
        child1, child2 = crossover(parent1, parent2, p=crossover_proba, allow_single_swap=allow_single_swap, random_order=random_order)
        mutation_func(child1, mutation_rate)
        mutation_func(child2, mutation_rate)
        next_generation.extend([child1, child2])
    return next_generation

In [15]:
def epoch_optimal(population):
    costs = [cost_function(ind.assigned_days, family_size, choice_rank, penalties_array) for ind in population]
    best_idx = np.argmin(costs)
    return population[best_idx], costs[best_idx]

In [16]:
def genetic_algorithm(
        mutation_func,
        pop_size=100,
        num_generations=200,
        tournament_size=5,
        crossover_proba=1.0,
        allow_single_swap=False,
        random_order=False,
        mutation_rate=0.1,
        elitism_ratio=0.1
):

    # Initialize population
    population = [initialize_chromosome() for _ in range(pop_size)]
    costs = [cost_function(ind.assigned_days, family_size, choice_rank, penalties_array) for ind in population]

    # Sort population by cost
    sorted_indices = np.argsort(costs)
    population = [population[i] for i in sorted_indices]
    costs = [costs[i] for i in sorted_indices]

    best_chromosome = population[0].copy()
    best_cost = costs[0]

    for generation in range(num_generations):
        # Elitism: preserve top individuals
        elite_size = max(1, int(elitism_ratio * pop_size))
        elites = [population[i].copy() for i in range(elite_size)]

        # Selection
        parents = selection(population, pop_size - elite_size, tournament_size)

        # Reproduction
        offspring = reproduction(mutation_func, parents, crossover_proba, allow_single_swap, random_order, mutation_rate)

        # Combine elites and offspring
        population = elites + offspring

        # Find the best in the current generation
        current_best, current_cost = epoch_optimal(population)

        # Update the best solution
        if current_cost < best_cost:
            best_chromosome, best_cost = current_best.copy(), current_cost

        print(f"Generation {generation + 1}: Best Cost = {best_cost}")
    return best_chromosome, best_cost

In [37]:
best_ch, best_c = genetic_algorithm(
        pop_size=100,
        num_generations=4000,
        tournament_size=5,
        crossover_proba=0.5,
        allow_single_swap=True,
        random_order=True,
        mutation_func=mutation1,
        mutation_rate=0.3,
        elitism_ratio=0.1
    )

print("Best Chromosome:", best_ch)
print("Best Cost:", best_c)

Generation 1: Best Cost = 1868638.2853920883
Generation 2: Best Cost = 1860638.5771131455
Generation 3: Best Cost = 1829250.757542388
Generation 4: Best Cost = 1829250.757542388
Generation 5: Best Cost = 1829250.757542388
Generation 6: Best Cost = 1812206.6012110265
Generation 7: Best Cost = 1799824.3034857383
Generation 8: Best Cost = 1767578.2649515006
Generation 9: Best Cost = 1743060.8336982622
Generation 10: Best Cost = 1743060.8336982622
Generation 11: Best Cost = 1743060.8336982622
Generation 12: Best Cost = 1743060.8336982622
Generation 13: Best Cost = 1743060.8336982622
Generation 14: Best Cost = 1743060.8336982622
Generation 15: Best Cost = 1714249.1693021904
Generation 16: Best Cost = 1703596.7410701208
Generation 17: Best Cost = 1681562.5562743882
Generation 18: Best Cost = 1681562.5562743882
Generation 19: Best Cost = 1675257.2261413743
Generation 20: Best Cost = 1661355.3391915266
Generation 21: Best Cost = 1652635.602239817
Generation 22: Best Cost = 1642679.6887456481
G

In [38]:
submission['assigned_day'] = best_ch.assigned_days
submission.to_csv('data/submission.csv', index=False)