# Section Schedule

Schedule students to sections based on their preferences. This notebook also contains scripts for generating section control codes for a fully-integrated solution.

## Setup

Import the required packages into the namespace.

In [None]:
import os

import numpy as np
import pandas as pd

import itertools
from typing import NamedTuple

In [None]:
# Define the course folder where the input files can be found.
# NOTE: the name COURSE is rebound to the 'Course' column header in later cells;
# path() captures this folder as its default argument when it is defined, so
# path() must be defined while COURSE still holds the folder name.
COURSE = 'cs70'

In [None]:
def path(filename, directory=COURSE):
    """Return the location of ``filename`` inside ``directory``.

    ``directory`` defaults to the value ``COURSE`` held when this function was
    defined.
    """
    location = os.path.join(directory, filename)
    return location

In [None]:
# Fixed seed for the random restarts: the sum of the code points of the phrase.
SEED = sum(map(ord, 'Computer Science Mentors'))

## Generate control codes

In [None]:
# Column headers of the section schedule.
CODE = 'Code'
EMAIL = 'Email Address'
COURSE = 'Course'
ROOM = 'Room'
CAP = 'Capacity'
TIME = 'Time'

Import an existing schedule, if one exists.

Section data should be specified in the following format:

```
'Email Address', 'Course', 'Room', 'Capacity', 'Time'
```

Capacity is important because it determines how many students can enroll in a section. If a room has no capacity listed (or a non-integer capacity), the default capacity specified later is used instead.

In [None]:
# Load a previously exported schedule, keyed by control code. The codes are read
# as strings so that zero-padded codes (e.g. '012345') keep their padding and
# compare equal to the codes in the preferences, which are also read as strings.
section_schedule = pd.read_csv(path('schedule.csv'), dtype={CODE: str}).set_index(CODE)
section_schedule.head()

In [None]:
def generate_control_code(row, length=6):
    """Return a control code of the desired length, zero-padded as necessary.

    The code is derived from a SHA-256 digest of the row's values rather than
    from the built-in ``hash``: string hashes are salted per interpreter
    process, so ``hash``-based codes would change whenever the kernel restarts.

    Args:
        row: Iterable of the section's field values (tuple, list, or array).
        length: Number of characters in the returned code.

    Returns:
        A string of ``length`` decimal digits.
    """
    import hashlib  # local import keeps this cell self-contained

    # Join the stringified fields with a control character that is unlikely to
    # occur in the data, so ('ab', 'c') and ('a', 'bc') hash differently.
    payload = '\x1f'.join(str(value) for value in row).encode('utf-8')
    number = int(hashlib.sha256(payload).hexdigest(), 16)
    return str(number)[:length].zfill(length)

In [None]:
# The sections are listed inline here, one record per section, in the column
# order given below. (They could presumably be loaded from a CSV such as
# 'room-schedule.csv' instead.)
section_schedule = pd.DataFrame.from_records(
    data=[
        ('kevinlin1@berkeley.edu', 'CS 61A', 'Soda 283F', 4, 'Mon 11:00 AM'),
        ('kevinlin1@berkeley.edu', 'CS 61A', 'Soda 283F', 4, 'Mon 10:00 AM'),
    ],
    columns=[EMAIL, COURSE, ROOM, CAP, TIME],
)

# Derive each section's control code from its raw row values, then key the
# schedule by that code.
section_schedule = section_schedule.assign(
    **{CODE: section_schedule.apply(generate_control_code, axis=1, raw=True)}
).set_index(CODE)
section_schedule

### Export schedule

In [None]:
# The control codes are the schedule's index (set_index(CODE) removed the CODE
# column), so export them from the index under the CODE header.
section_schedule.index.to_series(name=CODE).to_csv(path('control-codes.csv'), index=False)

In [None]:
# Write the full schedule, including the control-code index, to the course folder.
section_schedule.to_csv(path('section-schedule.csv'))

## Input data

Load student preferences from a Google Form.

In [None]:
# Column headers of the student preference form responses.
EMAIL = 'Username'
COURSE = 'Course'
FIRST = 'First option'
SECOND = 'Second option'
THIRD = 'Third option'
BACKUP = 'Backup options'
# Ranked choices, in the order they are tried.
RANKS = [FIRST, SECOND, THIRD]

In [None]:
# Read the preferences, processing the section control codes as strings
preferences = pd.read_csv(path('preferences.csv'), dtype=str)
preferences = pd.concat([
    # Identity and ranked-choice columns, with the email column renamed.
    preferences[[EMAIL, COURSE] + RANKS].rename(columns={EMAIL: 'Email'}),
    # Backup options are one comma-separated string per student: split them
    # into one column per backup, blank out the missing ones, and strip
    # surrounding whitespace so that 'A, B' yields 'B' rather than ' B' (which
    # would never match a control code in the schedule).
    preferences[BACKUP]
    .str.split(',', expand=True)
    .fillna('')
    .astype(str)
    .apply(lambda column: column.str.strip()),
], axis=1).rename(columns=str)  # backup columns are labelled 0, 1, ...; make them strings

Give enrollment priority to a subset of the students.

In [None]:
# EMAIL now names the email column of the priority list; PREF names the
# True/False priority flag column added to the preferences.
EMAIL, PREF = 'Email Address', 'Preferred'

In [None]:
priority = pd.read_csv(path('priority.csv'), dtype=str)[EMAIL]
# Flag the prioritized students. The flag is inserted *before* the first ranked
# option: the scheduling functions treat every column from FIRST through the
# last column as a section preference, so appending the flag at the end would
# make the True/False value look like a (bogus) section code.
preferences = preferences.drop(columns=PREF, errors='ignore')  # allow re-running this cell
preferences.insert(preferences.columns.get_loc(FIRST), PREF,
                   preferences['Email'].isin(priority))

## Greedy algorithm

Solve the problem using a simple greedy algorithm with randomized restarts.

In [None]:
class Solution(NamedTuple):
    """Outcome of one attempt at the assignment problem.

    Fields:
        assignments: mapping from each assigned request to a section code.
        stats: mapping from rank to the number of requests granted at that rank.
    """
    assignments: dict
    stats: dict

    def metric(self, weights={FIRST: 3, SECOND: 2, THIRD: 1}):
        """Score the solution as the weighted count of granted ranks."""
        score = 0
        for rank in self.stats:
            score += self.stats[rank] * weights[rank]
        return score

class Assignment(NamedTuple):
    """Key identifying one enrollment request: a student in a course."""
    # Email of the student making the request.
    email: str
    # Course the student wants a section for.
    course: str

def generate_preference_slice(preferences, first=FIRST):
    """Return the slice of column positions from ``first`` to the last column.

    ``first`` is lower-cased before it is looked up in the column labels.
    """
    columns = preferences.columns
    start = pd.Index(columns).get_loc(first.lower())
    return slice(start, len(columns))
        
def validate(preferences, schedule, ranks=RANKS, preference_slice=None):
    """Validate the preferences to check for errors in student input.

    Every non-blank preference must be a section code in the schedule's index,
    and that section must belong to the course the student asked for. Each
    problem is printed and checking continues, so all problems are reported.

    Args:
        preferences: DataFrame with a 'course' column (any letter case) and the
            preference columns selected by ``preference_slice``.
        schedule: DataFrame indexed by section code with a 'course' column.
        ranks: Names of the ranked options. Kept for signature compatibility
            with ``greedy``; validation does not depend on it.
        preference_slice: Slice of column positions holding the preferences.
            Defaults to ``generate_preference_slice(preferences)``.

    Returns:
        True if no problems were found, otherwise False.
    """
    preferences = preferences.rename(columns=str.lower)
    schedule = schedule.rename(columns=str.lower)
    if preference_slice is None:
        preference_slice = generate_preference_slice(preferences)
    valid = True
    for row in preferences.itertuples(index=False):
        for preference in row[preference_slice]:
            # Blank entries are padding for unused backup slots, not mistakes.
            if preference is None or preference == '':
                continue
            if preference not in schedule.index:
                print(f'{row}: {preference} not found in schedule')
                valid = False
            # Only look the section up once we know it exists; otherwise
            # schedule.loc would raise KeyError.
            elif row.course != schedule.loc[preference].course:
                print(f'{row}: {preference} is not a {row.course} section')
                valid = False
    return valid

def greedy(preferences, schedule, ranks=RANKS, preference_slice=None):
    """Return a Solution from a naive greedy pass over the preferences.

    Students are processed in row order. Each one is given the first of their
    preferences that exists in the schedule, belongs to the requested course,
    and still has a free seat. Students with no such preference are left
    unassigned.

    Args:
        preferences: DataFrame with 'email' and 'course' columns (any letter
            case) and the preference columns selected by ``preference_slice``.
        schedule: DataFrame indexed by section code with 'course' and capacity
            (``CAP``) columns (any letter case).
        ranks: Names of the ranked options, paired positionally with the
            preference columns; preferences beyond them count toward no rank.
        preference_slice: Slice of column positions holding the preferences.
            Defaults to ``generate_preference_slice(preferences)``.

    Returns:
        A ``Solution`` holding the assignments and the per-rank counts.
    """
    preferences = preferences.rename(columns=str.lower)
    schedule = schedule.rename(columns=str.lower)
    if preference_slice is None:
        preference_slice = generate_preference_slice(preferences)
    # Seats still available per section. The columns were lower-cased above, so
    # the capacity column has to be looked up by its lower-cased name.
    remaining = dict(schedule[CAP.lower()].items())
    assignments = {}
    stats = {rank: 0 for rank in ranks}
    for row in preferences.itertuples(index=False):
        assignment = Assignment(row.email, row.course)
        if assignment in assignments:
            # A repeated (email, course) request keeps its first assignment.
            continue
        for rank, preference in itertools.zip_longest(ranks, row[preference_slice]):
            # Make an assignment if the preference exists, matches the course,
            # and there is still space left in the section.
            if (preference in schedule.index
                    and row.course == schedule.loc[preference].course
                    and remaining[preference] > 0):
                assignments[assignment] = preference
                if rank in stats:
                    stats[rank] += 1
                remaining[preference] -= 1
                break
    return Solution(assignments, stats)

In [None]:
# Check the student input against the schedule; validate() prints each problem
# it finds and returns whether the input is valid.
validate(preferences, section_schedule)

In [None]:
# Number of randomized restarts of the greedy algorithm.
LIMIT = 1000
# Random state seeded with SEED; sample() draws from it for every shuffle.
rand = np.random.RandomState(SEED)

In [None]:
def sample(preferences, priority=None):
    """Resample the preferences, prioritizing by True/False column value.

    Args:
        preferences: DataFrame of student preferences.
        priority: Name of a boolean column. Rows where it is True are shuffled
            and placed first, followed by the shuffled remaining rows. If None,
            all rows are shuffled together.

    Returns:
        A DataFrame with the same rows in shuffled order, drawn from the
        module-level ``rand`` random state.
    """
    if priority is None:
        return preferences.sample(frac=1, random_state=rand)
    is_priority = preferences[priority]
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0. The
    # prioritized rows are shuffled first so the random draws occur in the same
    # order as before.
    return pd.concat([
        preferences[is_priority].sample(frac=1, random_state=rand),
        preferences[~is_priority].sample(frac=1, random_state=rand),
    ])

# Run LIMIT randomized restarts of the greedy algorithm, each on a fresh
# priority-respecting shuffle, and keep the solution with the highest metric.
best = max(
    (
        greedy(sample(preferences, priority=PREF), section_schedule)
        for _restart in range(LIMIT)
    ),
    key=Solution.metric,
)
best.stats

In [None]:
# Number of (email, course) requests that received a section.
len(best.assignments)

In [None]:
# Mapping from each Assignment (email, course) to its assigned section code.
best.assignments

### Export schedule

In [None]:
# Column headers of the exported student schedule.
STUDENT_EMAIL = 'Student Email'
SECTION = 'Section'
MENTOR_EMAIL = 'Mentor Email'
COURSE = 'Course'
ROOM = 'Room'
CAP = 'Capacity'
TIME = 'Time'

In [None]:
# One row per assignment: the student's email, the section code, and then the
# section's own values from the schedule.
greedy_schedule = pd.DataFrame.from_records(
    (
        (student.email, code, *section_schedule.loc[code])
        for student, code in best.assignments.items()
    ),
    columns=[STUDENT_EMAIL, SECTION, MENTOR_EMAIL, COURSE, ROOM, CAP, TIME],
)

In [None]:
# Write the final assignments to 'greedy-schedule.csv' in path()'s default
# directory, without the row index.
greedy_schedule.to_csv(path('greedy-schedule.csv'), index=False)