## Standard Genetic Algorithm Implementation

#### Libraries

In [33]:
import pandas as pd # For datasets
import utils.fitness_functions as ff
import utils.monoobjective_exhaustive_solver as es
import utils.restriction_checker as rc
import models.team_assignment as ta
import random # For randomness
import os # For output
from contextlib import redirect_stdout # For output
import datetime # For output
from copy import deepcopy # For crossover function
import math # For ceiling function for team partitions
from collections import defaultdict

#### Constants

In [34]:
# Data

PROJECTS = ['Shotmaniacs', 'actFact', 'Honours Programme', 'Voice', 'Topicus', 'Earnit', 'Inter-actief', 'Rosen', 'eMagiz', 'NoteBridge', 'Entweder'] # Names of the projects a team can be assigned to.
DATA = 'data/2022_23_arrangement.csv' # Path of the CSV file holding the student dataset.
DATASET = pd.read_csv(DATA) # Dataset loaded once, when this cell/module runs, as a dataframe.

# Tuning parameters

PARENT_RESET_THRESHOLD = 10 # Number of consecutive non-improving generations before both parents are replaced by fresh random arrangements.

# Safeguards

MAX_TEAM_CREATION_ATTEMPTS = 10 # Maximum number of attempts in initial population creation (avoiding infinite loop).
                                # NOTE(review): nothing in the visible code reads this constant -- confirm it is still needed.
MAX_CROSSOVER_ATTEMPTS = 10 # Maximum number of attempts in crossover algorithm (avoiding infinite loop).
MAX_MUTATION_ATTEMPTS = 10 # Maximum number of attempts in mutation algorithm (avoiding infinite loop).

# Experimental controls

NUMBER_OF_GENERATIONS = 100 # Upper bound on the generation counter of the GA loop.
OFFSPRING_PER_GENERATION = 8 # Number of offspring 'birthed' in each iteration of the GA.
COMPUTE_OPTIMAL_SOLUTION = False # (Only for small synthetic datasets), computes optimal solution through exhaustive solver.
EFFICIENCY_GOAL = None # (Only for small synthetic datasets), allows early finishing of the algorithm once a target efficiency
                     # is met. Efficiency is a percentage: (best score / score of best possible solution) * 100.
                     # None disables the early stop.

# Output constants

RUN_TIME = datetime.datetime.now().strftime("run_%Y%m%d_%H%M%S") # Timestamped run name; default output folder name for save_arrangement_batch.
BASE_DIR = "output/ega" # Base directory for outputs.

#### Output Configuration

In [35]:
def save_arrangement_batch(arrangements, iteration, run_time=RUN_TIME):
    """Score every arrangement of one generation, writing one report file each.

    Files are written to ``BASE_DIR/<run_time>/generation_<iteration>/`` as
    ``assignment_<k>.txt`` with ``k`` starting at 1. While an arrangement is
    being evaluated, stdout is redirected into its file, so anything printed
    during evaluation ends up there, followed by the final score line.

    Args:
        arrangements: Iterable of arrangements accepted by
            ``ff.evaluate_all_teams``.
        iteration: Generation number, used in the folder name.
        run_time: Name of the run folder (defaults to ``RUN_TIME``).

    Returns:
        List of scores, in the same order as ``arrangements``.
    """
    # Make sure both the base directory and this generation's folder exist.
    os.makedirs(BASE_DIR, exist_ok=True)
    generation_dir = os.path.join(BASE_DIR, f"{run_time}", f"generation_{iteration}")
    os.makedirs(generation_dir, exist_ok=True)

    collected_scores = []
    for number, candidate in enumerate(arrangements, start=1):
        report_path = os.path.join(generation_dir, f"assignment_{number}.txt")
        with open(report_path, "w") as report, redirect_stdout(report):
            value = ff.evaluate_all_teams(candidate)
            print(f"\nFinal score for arrangement {number}: {value:.4f}")
            collected_scores.append(value)

    return collected_scores

#### Initial Population Generation

In [36]:
def create_random_teams(df, min_size=5, max_size=6, project_pool=PROJECTS):
    """Build one random arrangement that places every student of ``df`` in a team.

    Students are shuffled and dealt into ``total % min_size`` teams of
    ``max_size`` members plus as many ``min_size`` teams as the rest allows
    (this split assumes ``max_size == min_size + 1``). Projects are spread as
    evenly as possible over the teams, in random order.

    While a team is being filled, two composition caps are enforced: at most
    4 members whose ``'Program'`` contains ``'TCS'``, and at most 3 members of
    any single non-Dutch ``'Nationality'``.

    Args:
        df: Dataframe with one row per student; the ``'Program'`` and
            ``'Nationality'`` columns are read here.
        min_size: Size of the smaller teams.
        max_size: Size of the larger teams.
        project_pool: Project names to distribute over the teams.

    Returns:
        A list of ``ta.TeamAssignment`` objects, or ``None`` when the random
        draw cannot be completed (no eligible student left for a team, or the
        head-count cannot be split into teams) or when
        ``rc.is_valid_arrangement`` rejects the result. Callers are expected
        to retry on ``None``.
    """
    max_tcs_per_team = 4
    max_per_foreign_nationality = 3

    students = df.to_dict(orient='records')
    random.shuffle(students)

    total_students = len(students)
    num_large_teams = total_students % min_size
    num_small_teams = (total_students - (max_size * num_large_teams)) // min_size
    if num_small_teams < 0:
        # Too few students to form the required number of large teams.
        return None
    total_teams = num_small_teams + num_large_teams

    def assign_projects_to_teams(num_teams, projects):
        """Return a shuffled list of ``num_teams`` projects, spread as evenly as possible."""
        base, extra = divmod(num_teams, len(projects))
        project_assignments = []
        for idx, project in enumerate(projects):
            count = base + (1 if idx < extra else 0)
            project_assignments.extend([project] * count)
        random.shuffle(project_assignments)
        return project_assignments

    def draw_team(size):
        """Move ``size`` eligible students from the pool into a new team, or return None."""
        team = []
        tcs_count = 0
        nationality_counts = {}

        while len(team) < size:
            valid_candidates = [
                s for s in students
                if ('TCS' not in s['Program'] or tcs_count < max_tcs_per_team)
                and (s['Nationality'] == 'Dutch'
                     or nationality_counts.get(s['Nationality'], 0) < max_per_foreign_nationality)
            ]
            if not valid_candidates:
                # Previously only the small-team loop had this guard; the
                # large-team loop crashed in random.choice on an empty list.
                return None

            selected = random.choice(valid_candidates)
            team.append(selected)
            students.remove(selected)

            if 'TCS' in selected['Program']:
                tcs_count += 1
            if selected['Nationality'] != 'Dutch':
                nationality_counts[selected['Nationality']] = nationality_counts.get(selected['Nationality'], 0) + 1

        return team

    project_assignments = assign_projects_to_teams(total_teams, project_pool)
    project_counters = {proj: 0 for proj in project_pool}

    teams = []

    # Large teams are built first, then the small ones.
    team_sizes = [max_size] * num_large_teams + [min_size] * num_small_teams
    for size in team_sizes:
        team = draw_team(size)
        if team is None:
            return None

        project = project_assignments[len(teams)]
        project_counters[project] += 1
        # Team IDs are "<project> <running number within that project>".
        team_id = f"{project} {project_counters[project]}"
        teams.append(ta.TeamAssignment(team_id, team, project, fitness=0.0))

    valid, _ = rc.is_valid_arrangement(teams, total_students, project_pool)

    return teams if valid else None


def create_random_teams_for_subset(df, current_project_counters, max_per_project, min_size=5, max_size=6):
    """Build random teams for a subset of students during crossover.

    Unlike a full random arrangement, project limits are checked against
    counters that describe the whole arrangement being assembled, so the
    restrictions hold for the full dataset and not just for this subset.

    The subset is split into ``total % min_size`` teams of ``max_size`` members
    plus as many ``min_size`` teams as the rest allows (this assumes
    ``max_size == min_size + 1``). While a team is being filled, at most 4
    members may have ``'TCS'`` in their ``'Program'`` and at most 3 may share
    the same non-Dutch ``'Nationality'``.

    Args:
        df: Dataframe with one row per student still to be placed.
        current_project_counters: Mapping ``project -> number of teams already
            using it``. It is updated in place for every team created here.
            Only projects present as keys can be chosen.
        max_per_project: A project is eligible while its counter is below
            this value.
        min_size: Size of the smaller teams.
        max_size: Size of the larger teams.

    Returns:
        A list of ``ta.TeamAssignment`` objects, or ``None`` when the subset
        cannot be split into teams, a team cannot be filled with eligible
        students, or no project has capacity left.
    """
    max_tcs_per_team = 4
    max_per_foreign_nationality = 3

    students = df.to_dict(orient='records')
    random.shuffle(students)

    total_students = len(students)
    num_large_teams = total_students % min_size
    # The division by min_size was missing: the count of small teams used to
    # equal the number of leftover *students*, so the loop always ran out of
    # students and returned None for any subset needing more than one team.
    num_small_teams = (total_students - (max_size * num_large_teams)) // min_size
    if num_small_teams < 0:
        # Too few students to form the required number of large teams.
        return None

    def draw_team(size):
        """Move ``size`` eligible students from the pool into a new team, or return None."""
        team = []
        tcs_count = 0
        nationality_counts = {}
        while len(team) < size:
            valid_candidates = [
                s for s in students
                if ('TCS' not in s['Program'] or tcs_count < max_tcs_per_team)
                and (s['Nationality'] == 'Dutch'
                     or nationality_counts.get(s['Nationality'], 0) < max_per_foreign_nationality)
            ]
            if not valid_candidates:
                return None
            selected = random.choice(valid_candidates)
            team.append(selected)
            students.remove(selected)
            if 'TCS' in selected['Program']:
                tcs_count += 1
            if selected['Nationality'] != 'Dutch':
                nationality_counts[selected['Nationality']] = nationality_counts.get(selected['Nationality'], 0) + 1
        return team

    teams = []

    # Large teams are built first, then the small ones.
    team_sizes = [max_size] * num_large_teams + [min_size] * num_small_teams
    for size in team_sizes:
        team = draw_team(size)
        if team is None:
            return None

        available_projects = [proj for proj, count in current_project_counters.items() if count < max_per_project]
        if not available_projects:
            return None
        project = random.choice(available_projects)
        current_project_counters[project] += 1
        # Team IDs are "<project> <running number within that project>".
        team_id = f"{project} {current_project_counters[project]}"
        teams.append(ta.TeamAssignment(team_id, team, project, fitness=0.0))

    return teams

    

#### Selection

In [37]:
def select_parents(arrangements, scores):
    """Return the two highest-scoring arrangements, best first.

    ``arrangements`` and ``scores`` are paired positionally. Ties keep their
    original relative order. Raises ``IndexError`` when fewer than two pairs
    are available.
    """
    def score_of(pair):
        return pair[1]

    ranked = sorted(zip(arrangements, scores), key=score_of, reverse=True)
    winner = ranked[0]
    runner_up = ranked[1]
    return [winner[0], runner_up[0]]

#### Crossover

In [38]:
def crossover(parent1, parent2, attempt_counter):
    """Combine two parent arrangements into one child arrangement.

    Each attempt copies a random, non-empty selection of teams from
    ``parent1``, adds every team of ``parent2`` that shares no member with the
    teams already taken, and places the students that are still unassigned
    into freshly drawn random teams. The child is returned as soon as
    ``rc.is_valid_arrangement`` accepts it.

    Args:
        parent1: List of team objects (``.members``, ``.project``).
        parent2: List of team objects (``.members``, ``.project``).
        attempt_counter: One-element list used as a shared counter; element 0
            is incremented once per attempt.

    Returns:
        The child arrangement (teams are deep copies, the parents are left
        untouched). After ``MAX_CROSSOVER_ATTEMPTS`` failed attempts the
        result of ``create_random_teams(DATASET)`` is returned instead, which
        may be ``None``.
    """
    from collections import Counter  # Only this function needs Counter.

    # Everything below depends only on the dataset, so compute it once
    # instead of once per attempt.
    total_teams_full = len(DATASET) // 5
    max_per_project = math.ceil(total_teams_full / len(PROJECTS))
    all_ids = DATASET['ID'].tolist()
    students_by_id = DATASET.set_index('ID').to_dict(orient='index')

    for _ in range(MAX_CROSSOVER_ATTEMPTS):
        attempt_counter[0] += 1

        # Take at least one team and, when possible, leave at least one out.
        # max() keeps randint's range valid for a single-team parent.
        num_selected = random.randint(1, max(1, len(parent1) - 1))
        selected_teams = random.sample(parent1, k=num_selected)

        # IDs of the students already placed through the selected teams.
        used_ids = set(member['ID'] for team in selected_teams for member in team.members)

        # Start building the child from the selected teams.
        child_teams = deepcopy(selected_teams)

        # Add the teams of parent2 that do not overlap with anything taken so far.
        for team in parent2:
            team_ids = [member['ID'] for member in team.members]
            if not used_ids.isdisjoint(team_ids):
                continue
            child_teams.append(deepcopy(team))
            used_ids.update(team_ids)

        # Count teams per project. Every known project starts at zero so that
        # projects not inherited from either parent remain selectable for the
        # refilled teams (a plain Counter over child_teams would omit them).
        used_project_counts = Counter({project: 0 for project in PROJECTS})
        used_project_counts.update(team.project for team in child_teams)

        # Place the remaining students while respecting the project limits.
        remaining_students = [
            dict(students_by_id[student_id], ID=student_id)
            for student_id in all_ids
            if student_id not in used_ids
        ]

        if remaining_students:
            remaining_df = pd.DataFrame(remaining_students)
            new_teams = create_random_teams_for_subset(
                remaining_df,
                current_project_counters=used_project_counts,
                max_per_project=max_per_project
            )
            if new_teams is None:
                continue
            child_teams += new_teams

        valid, _ = rc.is_valid_arrangement(child_teams, total_students=len(DATASET), projects=PROJECTS)

        if valid:
            return child_teams

    return create_random_teams(DATASET) # Fallback. Random arrangement.

#### Mutation

In [39]:
def mutate(arrangement, attempt_counter):
    """Return a mutated copy of ``arrangement``.

    Each attempt works on a fresh deep copy, picks two distinct teams and,
    with equal probability, either swaps their projects or swaps one randomly
    chosen member of each. The first copy accepted by
    ``rc.is_valid_arrangement`` is returned.

    Args:
        arrangement: Iterable of team objects (``.members``, ``.project``).
        attempt_counter: One-element list used as a shared counter; element 0
            is incremented once per attempt.

    Returns:
        The mutated arrangement, or the result of
        ``create_random_teams(DATASET)`` once ``MAX_MUTATION_ATTEMPTS``
        attempts have failed.
    """
    for _ in range(MAX_MUTATION_ATTEMPTS):
        attempt_counter[0] += 1

        # Work on a copy so a rejected mutation leaves the input untouched.
        candidate = deepcopy(arrangement)
        if not isinstance(candidate, list):
            candidate = list(candidate)

        first_team, second_team = random.sample(candidate, 2)

        swap_projects = random.random() < 0.5
        if swap_projects:
            # Exchange the projects of the two teams.
            first_project, second_project = first_team.project, second_team.project
            first_team.project, second_team.project = second_project, first_project
        else:
            # Exchange one random member between the two teams.
            leaving_first = random.choice(first_team.members)
            leaving_second = random.choice(second_team.members)

            first_team.members.remove(leaving_first)
            second_team.members.remove(leaving_second)
            first_team.members.append(leaving_second)
            second_team.members.append(leaving_first)

        # Keep the mutation only if the arrangement still satisfies the restrictions.
        accepted, _ = rc.is_valid_arrangement(candidate, total_students=len(DATASET), projects=PROJECTS)
        if accepted:
            return candidate

    return create_random_teams(DATASET) # Fallback. Random arrangement.

#### GA Execution

In [40]:
def _random_valid_arrangements(count):
    """Return ``count`` random arrangements, redrawing whenever creation yields None."""
    found = []
    while len(found) < count:
        candidate = create_random_teams(DATASET)
        if candidate is not None:
            found.append(candidate)
    return found


def execute():
    """Run the genetic algorithm and report its progress.

    Starts from 10 random arrangements, then repeats for generations
    ``0..NUMBER_OF_GENERATIONS`` (inclusive): evaluate and save the current
    arrangements, log the best score, select the two best as parents and breed
    ``OFFSPRING_PER_GENERATION`` children through crossover and mutation. The
    next generation consists of the two parents plus the offspring. When the
    generation's best score has not improved on the previous generation's for
    ``PARENT_RESET_THRESHOLD`` generations in a row, the parents are replaced
    by fresh random arrangements.

    A ``performance.txt`` log and the per-generation reports are written under
    ``BASE_DIR/<run timestamp>/``.

    Returns:
        Tuple ``(df, best_arrangement)``. ``df`` has one row per completed
        generation (index starting at 1) with the best score so far and the
        cumulative number of crossover/mutation attempts;
        ``best_arrangement`` is the highest-scoring arrangement evaluated.
    """
    df = pd.DataFrame({
        "score": pd.Series(dtype="float"),
        "number of computations": pd.Series(dtype="float")
    })

    # 0. Initial Population Generation
    arrangements = _random_valid_arrangements(10)

    # 0.5 Computation of Best Possible Arrangement for Performance Measurement. Only feasible in small synthetic datasets due to TFP.
    if COMPUTE_OPTIMAL_SOLUTION:
        optimal_arrangement, optimal_score, arrangements_computed = es.find_best_arrangement(DATASET, DATA)

    # 0.75 Performance Log (For debugging, and ensuring that score improves with each generation).
    run_time = datetime.datetime.now().strftime("run_%Y%m%d_%H%M%S")
    performance_log = os.path.join(BASE_DIR, f"{run_time}", f"performance.txt")
    os.makedirs(os.path.dirname(performance_log), exist_ok=True)

    with open(performance_log, "w") as f:
        f.write(f"Performance log for run: {run_time}\n")
        if COMPUTE_OPTIMAL_SOLUTION:
            f.write(f"Optimal score: {optimal_score:.4f}\n\n")

    iters = 0
    best_score = -1
    previous_score = -1
    best_arrangement = None
    efficiency = -1
    stagnation_count = 0 # Counter to introduce randomness upon stagnation.
    # One-element list so crossover() and mutate() can increment it in place.
    attempted__arrangements = [0]
    while iters <= NUMBER_OF_GENERATIONS and (EFFICIENCY_GOAL is None or efficiency < EFFICIENCY_GOAL):

        # 1. Fitness Evaluation
        scores = save_arrangement_batch(arrangements, iters, run_time) # Evaluates all teams, and outputs to output folder. Returns score of each arrangement.
        current_max = max(scores)

        # Track the best arrangement seen over the whole run.
        for i, score in enumerate(scores):
            if score > best_score:
                best_score = score
                best_arrangement = arrangements[i]

        if current_max > previous_score:
            stagnation_count = 0
        else:
            stagnation_count += 1  # No improvement, stagnation

        # Log the best score in the generation
        if COMPUTE_OPTIMAL_SOLUTION:
            efficiency = round((best_score / optimal_score) * 100, 2)

        with open(performance_log, "a") as f:
            f.write(
                f"Generation {iters}: Best score = {best_score:.4f}"
                + (f" | Efficiency = {efficiency:.2f}%" if COMPUTE_OPTIMAL_SOLUTION else "")
                # Log the counter's value, not the one-element list wrapping it.
                + (f" | Total Computations: {attempted__arrangements[0]}")
                + (f" of {arrangements_computed:.0f} Possible Combinations" if COMPUTE_OPTIMAL_SOLUTION else "")
                + "\n"
            )

        # Optional: stop early if efficiency goal is met
        if EFFICIENCY_GOAL is not None and efficiency >= EFFICIENCY_GOAL:
            break

        # 2. Selection
        parents = select_parents(arrangements, scores)

        # 3. Crossover & Mutation with fallback on stagnation
        if stagnation_count >= PARENT_RESET_THRESHOLD:
            # Full reset of parents after too much stagnation
            parents = _random_valid_arrangements(2)
            stagnation_count = 0  # Reset counter after fallback

        # Offspring creation
        offspring = []
        while len(offspring) < OFFSPRING_PER_GENERATION:
            child = crossover(parents[0], parents[1], attempted__arrangements)
            if child is None:
                continue
            mutated_child = mutate(child, attempted__arrangements)
            if mutated_child is None:
                # mutate() falls back to a random arrangement, which can
                # itself fail; never let None into the next generation.
                continue
            offspring.append(mutated_child)

        arrangements = parents + offspring  # Proceed to next generation

        previous_score = current_max
        iters += 1

        df.loc[iters] = [best_score, attempted__arrangements[0]]

    return df, best_arrangement
