## Standard Genetic Algorithm Implementation

Developed by Daniel Frutos Rodriguez for _"A Comparison of Optimization Techniques for Solving the Team Formation Problem"_.

Frutos-Rodriguez D., Barrios-Fleitas Y., and Lalla E. 2025. _A Comparison of Optimization Techniques for Solving the Team Formation Problem._ In _Proceedings Placeholder_. ACM, New York, NY, USA, 6 pages. [https://doi.org/10.1145/nnnnnnn.nnnnnnn](https://doi.org/10.1145/nnnnnnn.nnnnnnn)

#### Libraries

In [1]:
import pandas as pd # For datasets
import utils.fitness_functions as ff
import utils.exhaustive_solver as es
import utils.restriction_checker as rc
import models.team_assignment as ta
import random # For randomness
import os # For output
from contextlib import redirect_stdout # For output
import datetime # For output
from copy import deepcopy # For crossover function
import numpy # For doing repetitive runs and getting mean values

# Some imports that may be used later (for plotting or mathematical computations)
# import numpy as np
# import matplotlib.pyplot as plt
# import statsmodels.api as sm
# import seaborn as sns

#### Constants

In [None]:
PROJECTS = ['Shotmaniacs', 'actFact', 'Honours Programme', 'Voice', 'Topicus', 'Earnit', 'Inter-actief'] # Possible projects.
# Joined with os.path.join so the path resolves on Windows and POSIX alike
# (a hard-coded backslash separator is Windows-only and forms an invalid escape sequence).
DATA = os.path.join('data', 'synthetic_dataset_3.csv') # Dataset reference.
SYNTHETIC_DATASET = pd.read_csv(DATA) # Dataset stored as a dataframe.

NUMBER_OF_GENERATIONS = 100 # Number of generations to iterate in the GA loop.
NUMBER_OF_REPEATS = 10 # Number of generations without improvement of the best score before random offspring are injected.
PARENT_RESET_THRESHOLD = 10 # Number of random-offspring injections before the two parents are replaced by random arrangements too.
MAX_TEAM_CREATION_ATTEMPTS = 10 # Maximum number of attempts in initial population creation (avoiding infinite loop).
MAX_CROSSOVER_ATTEMPTS = 10 # Maximum number of attempts in crossover algorithm (avoiding infinite loop).
MAX_MUTATION_ATTEMPTS = 10 # Maximum number of attempts in mutation algorithm (avoiding infinite loop).

COMPUTE_OPTIMAL_SOLUTION = True # (Only for small synthetic datasets), computes optimal solution through exhaustive solver.
EFFICIENCY_GOAL = None # (Only for small synthetic datasets), allows early finishing of the algorithm if a certain efficiency
                     # (score / score of best possible solution, expressed as a percentage) is met. None disables it.

OFFSPRING_PER_GENERATION = 8 # Number of offspring 'birthed' in each iteration of the GA.

RUN_TIME = datetime.datetime.now().strftime("run_%Y%m%d_%H%M%S") # Constant for output folder naming.
BASE_DIR = "output" # Base directory for outputs.

#### Output Configuration

In [3]:
def save_arrangement_batch(arrangements, SYNTHETIC_DATASET, iteration, run_index=None):
    """Score every arrangement and write one report file per arrangement.

    Reports go to ``<BASE_DIR>/<run_index>/generation_<iteration>/assignment_<i>.txt``
    with ``i`` starting at 1. Stdout is redirected into the report file while an
    arrangement is evaluated, so anything printed during ``ff.evaluate_all_teams``
    lands in that file, followed by the final score line.

    Args:
        arrangements: Iterable of arrangements to evaluate.
        SYNTHETIC_DATASET: Dataset forwarded unchanged to ``ff.evaluate_all_teams``.
        iteration: Generation number used in the folder name.
        run_index: Name of the run folder; ``None`` selects the timestamped ``RUN_TIME``.

    Returns:
        The scores as a list, in the same order as ``arrangements``.
    """
    os.makedirs(BASE_DIR, exist_ok=True)

    # Fall back to the timestamped folder when the caller names no run.
    run_name = RUN_TIME if run_index is None else run_index
    generation_dir = os.path.join(BASE_DIR, f"{run_name}", f"generation_{iteration}")
    os.makedirs(generation_dir, exist_ok=True)

    scores = []
    for position, arrangement in enumerate(arrangements, start=1):
        report_path = os.path.join(generation_dir, f"assignment_{position}.txt")
        with open(report_path, "w") as report, redirect_stdout(report):
            score = ff.evaluate_all_teams(arrangement, SYNTHETIC_DATASET)
            print(f"\nFinal score for arrangement {position}: {score:.4f}")
            scores.append(score)

    return scores

#### Initial Population Generation

In [4]:
def create_random_teams(df, min_size=5, max_size=6, project_pool=PROJECTS):
    """Build a random arrangement of teams covering every student in ``df``.

    Up to ``MAX_TEAM_CREATION_ATTEMPTS`` times, the students are shuffled and cut
    into consecutive teams of 5 or 6; each team must pass ``rc.is_valid_team`` and
    receives a random project from ``project_pool``. An attempt is accepted only if
    every ID in ``df['ID']`` ended up in a team. If no attempt succeeds, the students
    are shuffled once more and split into chunks of 6 without any validity check.

    Args:
        df: DataFrame of students; must contain an ``ID`` column.
        min_size: Minimum number of unassigned students required to start another team.
        max_size: Currently unused; team sizes are fixed to 5 or 6.
        project_pool: Sequence of project names to draw from.

    Returns:
        A list of ``ta.TeamAssignment`` objects with ids of the form ``"<project> <n>"``.
    """
    for _ in range(MAX_TEAM_CREATION_ATTEMPTS):
        roster = df.to_dict(orient='records')
        random.shuffle(roster)

        headcount = len(roster)
        teams = []
        teams_per_project = {}  # How many teams each project has received so far
        cursor = 0
        while headcount - cursor >= min_size:
            left = headcount - cursor
            if left == 12:
                size = 6  # 6 + 6 rather than leaving a 1-member remainder
            elif 7 <= left <= 11:
                size = 5
            else:
                size = random.choice([5, 6])

            candidates = roster[cursor:cursor + size]

            if not rc.is_valid_team(candidates):
                cursor += 1  # Slide the window by a single student and retry
                continue

            project = random.choice(project_pool)
            teams_per_project[project] = teams_per_project.get(project, 0) + 1
            label = f"{project} {teams_per_project[project]}"
            teams.append(ta.TeamAssignment(label, candidates, project, fitness=0.0))

            cursor += size

        # Accept the attempt only when nobody was skipped or left over.
        placed_ids = {member['ID'] for team in teams for member in team.members}
        if placed_ids == set(df['ID']):
            return teams

    # Fallback: random chunks of up to 6 students, restrictions are not checked.
    roster = df.to_dict(orient='records')
    random.shuffle(roster)
    fallback_teams = []
    teams_per_project = {}
    for start in range(0, len(roster), 6):
        chunk = roster[start:start + 6]
        project = random.choice(project_pool)
        teams_per_project[project] = teams_per_project.get(project, 0) + 1
        label = f"{project} {teams_per_project[project]}"
        fallback_teams.append(ta.TeamAssignment(label, chunk, project, fitness=0.0))
    return fallback_teams
    

#### Selection

In [5]:
def select_parents(arrangements, scores):
    """Return the two highest-scoring arrangements, best first.

    ``arrangements`` and ``scores`` are paired positionally. Ties keep their
    original relative order because the sort is stable.

    Args:
        arrangements: Candidate arrangements.
        scores: Score of each arrangement, in the same order.

    Returns:
        A two-element list: the best arrangement followed by the runner-up.
    """
    ranked = sorted(zip(arrangements, scores), key=lambda pair: pair[1], reverse=True)
    best, runner_up = ranked[0], ranked[1]
    return [best[0], runner_up[0]]

#### Crossover

In [6]:
# TODO Improve crossover function

def crossover(parent1, parent2, attempt_counter):
    """Combine two parent arrangements into one child arrangement.

    Each attempt copies a random subset of teams from ``parent1``, adds every team
    of ``parent2`` that shares no student with the teams chosen so far, and groups
    the students still unassigned with ``create_random_teams``. The first child
    accepted by ``rc.is_valid_arrangement`` is returned.

    Args:
        parent1: List of teams; the random subset is drawn from it.
        parent2: List of teams; supplies the non-overlapping teams.
        attempt_counter: One-element list; element 0 is incremented once per attempt
            so the caller can track the total number of arrangements tried.

    Returns:
        A valid child arrangement, or a fresh random arrangement of the whole
        dataset if ``MAX_CROSSOVER_ATTEMPTS`` attempts all fail validation.
    """
    # These lookups depend only on the dataset, so build them once, not per attempt.
    all_ids = SYNTHETIC_DATASET['ID'].tolist()
    students_by_id = SYNTHETIC_DATASET.set_index('ID').to_dict(orient='index')

    # Normally a proper, non-empty subset of parent1 is taken. A parent with fewer
    # than two teams has no such subset (randint(1, 0) would raise), so take it whole.
    max_subset_size = len(parent1) - 1

    for _ in range(MAX_CROSSOVER_ATTEMPTS):
        attempt_counter[0] += 1
        if max_subset_size >= 1:
            subset_size = random.randint(1, max_subset_size)
        else:
            subset_size = len(parent1)
        selected_teams = random.sample(parent1, k=subset_size)

        # IDs of the students already placed through the selected teams
        used_ids = {member['ID'] for team in selected_teams for member in team.members}

        # Start building the child from copies of the selected teams
        child_teams = deepcopy(selected_teams)

        # Add the teams from parent2 that do not reuse any placed student
        for team in parent2:
            team_ids = [member['ID'] for member in team.members]
            if not used_ids.isdisjoint(team_ids):
                continue
            child_teams.append(deepcopy(team))
            used_ids.update(team_ids)

        # Group whoever is still unassigned into new random teams
        remaining_students = [
            dict(students_by_id[student_id], ID=student_id)
            for student_id in all_ids
            if student_id not in used_ids
        ]

        if remaining_students:
            remaining_df = pd.DataFrame(remaining_students)
            child_teams += create_random_teams(remaining_df)

        if rc.is_valid_arrangement(child_teams, total_students=len(SYNTHETIC_DATASET), projects=PROJECTS):
            return child_teams

    return create_random_teams(SYNTHETIC_DATASET) # Fallback. Random arrangement.

#### Mutation

In [7]:
def _swap_random_members(first_team, second_team):
    """Exchange one randomly chosen member between two teams, in place."""
    leaving_first = random.choice(first_team.members)
    leaving_second = random.choice(second_team.members)

    first_team.members.remove(leaving_first)
    second_team.members.remove(leaving_second)
    first_team.members.append(leaving_second)
    second_team.members.append(leaving_first)


def mutate(arrangement, attempt_counter):
    """Return a copy of ``arrangement`` with two students swapped between teams.

    Each attempt works on a fresh deep copy, so ``arrangement`` itself is never
    modified. Two distinct teams are drawn at random and exchange one random
    member each; the result is returned as soon as it passes
    ``rc.is_valid_arrangement``.

    Args:
        arrangement: List of teams to mutate.
        attempt_counter: One-element list; element 0 is incremented once per attempt.

    Returns:
        The mutated arrangement, or a fresh random arrangement of the whole
        dataset if ``MAX_MUTATION_ATTEMPTS`` attempts all fail validation.
    """
    for _ in range(MAX_MUTATION_ATTEMPTS):
        attempt_counter[0] += 1
        candidate = deepcopy(arrangement)

        first_team, second_team = random.sample(candidate, 2)
        _swap_random_members(first_team, second_team)

        if rc.is_valid_arrangement(candidate, total_students=len(SYNTHETIC_DATASET), projects=PROJECTS):
            return candidate

    return create_random_teams(SYNTHETIC_DATASET) # Fallback. Random arrangement.

#### GA Execution

In [8]:
# NUM_RUNS = 100
# all_generations = []
# all_computations = []

# for run in range(NUM_RUNS):

# 0. Initial Population Generation
# A generation always consists of the two selected parents plus their offspring.
POPULATION_SIZE = OFFSPRING_PER_GENERATION + 2
arrangements = [create_random_teams(SYNTHETIC_DATASET) for _ in range(POPULATION_SIZE)]

# 0.5 Computation of Best Possible Arrangement for Performance Measurement. Only feasible in small synthetic datasets due to TFP.
if COMPUTE_OPTIMAL_SOLUTION:
    optimal_arrangement, optimal_score, arrangements_computed = es.find_best_arrangement(SYNTHETIC_DATASET, DATA)

# 0.75 Performance Log (For debugging, and ensuring that score improves with each generation).
performance_log = os.path.join(BASE_DIR, f"{RUN_TIME}", "performance.txt")
os.makedirs(os.path.dirname(performance_log), exist_ok=True)

with open(performance_log, "w") as f:
    f.write(f"Performance log for run: {RUN_TIME}\n")
    if COMPUTE_OPTIMAL_SOLUTION:
        f.write(f"Optimal score: {optimal_score:.4f}\n\n")

iters = 0
best_score = -1
efficiency = -1 # Stays at -1 when no optimal solution is computed, so the efficiency goal is never reached.
repeat_count = 0 # Counter to introduce randomness upon stagnation.
parent_reset_count = 0 # Counts the number of random replacements, starting at an entirely new point if a better solution isn't found.
attempted__arrangements = [0] # One-element list so crossover() and mutate() can increment it in place.
# Generation 0 is the initial population, hence the inclusive upper bound.
while iters <= NUMBER_OF_GENERATIONS and (EFFICIENCY_GOAL is None or efficiency < EFFICIENCY_GOAL):

    # 1. Fitness Evaluation
    scores = save_arrangement_batch(arrangements, SYNTHETIC_DATASET, iters) # Evaluates all teams, and outputs to output folder. Returns score of each arrangement.
    current_max = max(scores)

    # Check for improvement: an equal or lower best score counts as stagnation.
    if current_max > best_score:
        repeat_count = 0
        best_score = current_max
    else:
        repeat_count += 1

    # Log the best score in the generation
    if COMPUTE_OPTIMAL_SOLUTION:
        efficiency = round((current_max / optimal_score) * 100, 2)

    with open(performance_log, "a") as f:
        f.write(
            f"Generation {iters}: Best score = {current_max:.4f}"
            + (f" | Efficiency = {efficiency:.2f}%" if COMPUTE_OPTIMAL_SOLUTION else "")
            # Write the counter value itself, not the one-element list wrapping it.
            + f" | Total Computations: {attempted__arrangements[0]}"
            + (f" of {arrangements_computed:.0f} Possible Combinations" if COMPUTE_OPTIMAL_SOLUTION else "")
            + "\n"
        )

    # Optional: stop early if efficiency goal is met
    if EFFICIENCY_GOAL is not None and efficiency >= EFFICIENCY_GOAL:
        break

    # 2. Selection
    parents = select_parents(arrangements, scores)

    # 3. Crossover & Mutation
    offspring = []
    if repeat_count >= NUMBER_OF_REPEATS:
        # Stagnation: replace all offspring with random arrangements.
        offspring = [create_random_teams(SYNTHETIC_DATASET) for _ in range(OFFSPRING_PER_GENERATION)]
        repeat_count = 0  # Reset the stagnation counter
        parent_reset_count += 1 # Log reset
        if parent_reset_count >= PARENT_RESET_THRESHOLD:
            # Too many restarts without progress: drop the parents as well.
            parents = [create_random_teams(SYNTHETIC_DATASET) for _ in range(2)]
            parent_reset_count = 0
    else:
        while len(offspring) < OFFSPRING_PER_GENERATION:
            child = crossover(parents[0], parents[1], attempted__arrangements)
            mutated_child = mutate(child, attempted__arrangements)
            offspring.append(mutated_child)

    # Parents stay in the next generation, as their scores will often be better than the children's
    # (after a parent reset these are the two new random parents instead).
    arrangements = parents + offspring

    iters += 1

    # all_generations.append(iters)
    # all_computations.append(attempted__arrangements[0])

# average_generations = numpy.mean(all_generations)
# average_computations = numpy.mean(all_computations)

# print(average_generations)
# print(average_computations)
