In [218]:
import sys
from pathlib import Path

import pandas as pd

# Put the parent of the working directory on sys.path so that packages located
# there can be imported from this notebook (presumably where the `utils`
# package lives -- confirm against the repository layout).
current_dir = Path.cwd()
parent_dir = current_dir.resolve().parent
# Guard the append: this cell is re-run frequently, and an unconditional
# append would add one more duplicate entry to sys.path on every execution.
if str(parent_dir) not in sys.path:
    sys.path.append(str(parent_dir))

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [198]:
from utils.utils import (
    generate_data,
    generate_rejections,
    generate_roster,
    print_project_summary,
    process_applications,
    student_assignment,
)

In [199]:
# ---------------------------------------------------------------------------
# Configuration for this quarter's matching run
# ---------------------------------------------------------------------------
file_location = "../data/Spring 25 Matching.xlsx"

# Students that receive special handling in process_applications.
prioritized_students = ["ceden26@uchicago.edu"]
deprioritized_students = [
    "nalinb@uchicago.edu",
    "wanniyang@uchicago.edu",
    "garakishig@uchicago.edu",
    "xingyue@uchicago.edu",
]
# email -> project name; passed to process_applications as manual overrides.
override_assignments = {"kbarbarossa@uchicago.edu": "Data and Democracy"}

# Project lists.
technical_project_list = [
    "Argonne",
    "Center for Living Systems",
    "CMAP",
    "Satellite-Based Detection of Ancient Water Systems",
    "SPUN",
]
# Passed to the solver as `drop_projects`.
drop_project_list = [
    "Satellite-Based Detection of Ancient Water Systems",
    "SPUN",
]
# Passed to process_applications; a later cell asserts none of these is assigned.
not_running_anymore = [
    "Morningstar",
    "Rwanda - Landslides",
    "Rwanda - Climate",
    "BDC",
    "Invenergy",
    "IRC",
    "Fermi - Simulations",
    "RAFI - Poultry",
]

# Solver parameters.
# NOTE(review): presumably one weight per priority level 1-5, with the last
# one acting as a prohibitive penalty -- confirm against student_assignment.
# Note that 10e10 is 1e11, not 1e10.
priority_weights = [1, 5, 10, 20, 10e10]
max_students = {}

# ---------------------------------------------------------------------------
# Load applications and build the solver inputs
# ---------------------------------------------------------------------------
application_df, forced_assignments = process_applications(
    file_location,
    deprioritized_students,
    prioritized_students,
    not_running_anymore,
    override_assignments,
)

data_to_process_dict = generate_data(application_df, technical_project_list)
# The remaining entries of the dict are unpacked straight into
# student_assignment, so the project list is taken out first and kept for the
# summary and the later sanity checks.
all_project_list = data_to_process_dict.pop("all_project_list")

# ---------------------------------------------------------------------------
# Solve and report
# ---------------------------------------------------------------------------
assignment_df = student_assignment(
    **data_to_process_dict,
    priority_weights=priority_weights,
    max_students_dict=max_students,
    preassigned_students=forced_assignments,
    drop_projects=drop_project_list,
    number_of_projects_to_run=None,
    verbose=False,
)

print(f"\nAverage Ranking: {assignment_df.Ranking.mean():.2f}")
print(assignment_df["Ranking"].value_counts())
print_project_summary(assignment_df, all_project_list)

assignment_df.to_csv("../data/assignments.csv", index=False)

Dropping 0 students because they do not have a computer and not required
Total Students available for matching: 92
Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/hannifan/projects/the-clinic/.venv/lib/python3.10/site-packages/pulp/solverdir/cbc/osx/64/cbc /var/folders/sr/lrm99gp93rx47xspcx2z49gr0000gp/T/0612035fc4db44e0919dcceb40054b3d-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /var/folders/sr/lrm99gp93rx47xspcx2z49gr0000gp/T/0612035fc4db44e0919dcceb40054b3d-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 1087 COLUMNS
At line 10853 RHS
At line 11936 BOUNDS
At line 13144 ENDATA
Problem MODEL has 1082 rows, 1207 columns and 6161 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 348 - 0.00 seconds
Cgl0002I 966 variables fixed
Cgl0003I 12 fixed, 0 tightened bounds, 3 strengthened rows, 0 substitutions
Cgl0004I pr

Unnamed: 0,Project Name,Number,High Priority,High-Med Priority,Med Priority,Low Priority,Experienced,Rankings,Average Ranking
0,Data and Democracy,4,3,0,1,0,1,0.0,0.0
1,IDI,4,3,1,0,0,2,1.0,0.25
2,Center for Living Systems,4,2,1,1,0,2,3.0,0.75
3,CMAP,4,2,1,1,0,2,3.0,0.75
4,Argonne,4,2,1,1,0,1,111.0,0.75
5,College Financial Health,4,3,1,0,0,1,1111.0,1.0
6,UChicago Transportation,4,1,1,2,0,1,1111.0,1.0
7,Climate Cabinet,4,0,0,0,4,0,1111.0,1.0
8,Internet Equity Initiative,4,2,1,1,0,1,13.0,1.0
9,DSI - HPC,4,1,2,1,0,2,1112.0,1.25


In [None]:
# Sanity check: every applicant must show up in assignment_df, either with a
# project or explicitly without one.
has_project = assignment_df["Project Assigned"].notna()

applications = set(application_df["Email Address"])
assigned_project = set(assignment_df.loc[has_project, "Email Address"])
not_assigned_project = set(assignment_df.loc[~has_project, "Email Address"])

# Applicants that appear in neither group.
unaccounted = applications - (assigned_project | not_assigned_project)
assert not unaccounted, "Applications and assignments do not match"

print(
    f"Total Applications: {len(applications)}, Assigned Projects: {len(assigned_project)}, Not Assigned Projects: {len(not_assigned_project)}"
)

Total Applications: 92, Assigned Projects: 68, Not Assigned Projects: 24


In [201]:
# Neither the dropped projects nor the ones no longer running may have any
# student assigned to them.
assigned_projects = set(assignment_df["Project Assigned"].dropna())

assert assigned_projects.isdisjoint(
    drop_project_list
), "Dropped projects found in assigned projects"
assert assigned_projects.isdisjoint(
    not_running_anymore
), "Not running any more projects found in assigned projects"

In [202]:
# Build the roster for the matching spreadsheet and write it to disk.
roster = generate_roster(application_df, assignment_df)
roster.to_csv(path_or_buf="../data/roster.csv", index=False)
print(roster.shape)
# NOTE(review): the preview shows student names, emails and IDs -- clear the
# cell output before sharing or committing the notebook.
roster.head(n=5)

(68, 8)


Unnamed: 0,Project,Name,GitHub,Email,Chicago ID,Degree Program,Concentration,Returning
0,Argonne,Alexander Mosher,amosheruc,alexmosher@uchicago.edu,60954270X,Undergrad: 4th year,Data Science,0
1,Argonne,Mengyu Xu,rainxu0909,mxu09@uchicago.edu,12411055,MA or MS 2nd year,MS Computational and Applied Mathematics,0
2,Argonne,Qilin Zhou,QilinZhou56,qilin@uchicago.edu,31906783H,MA or MS 2nd year,MS Computational Analysis and Public Policy (M...,0
3,Argonne,Uday Malik,udaymalik12,umalik@uchicago.edu,41080320C,Undergrad: 4th year,Data Science,1
4,CMAP,Alan Cherman,alan-cherman,acherman@uchicago.edu,12282804,Undergrad: 4th year,Data Science,1


In [None]:
# Every project on the roster must be one of this quarter's projects;
# anything else is a leftover from a previous quarter.
assert all(
    project in all_project_list for project in roster["Project"].unique()
), "Found a project from previous quarter that should be dropped."

In [204]:
# Build the rejection list from the assignments and write it to disk.
rejections = generate_rejections(assignment_df)
rejections.to_csv(path_or_buf="../data/rejections.csv", index=False)
print(rejections.shape)

(24, 1)


In [None]:
# Acceptances plus rejections must add up to the number of applications.
n_accounted_for = len(roster) + len(rejections)
assert n_accounted_for == len(application_df), "Mismatched roster size"

In [None]:
# Inspect one project: who was placed on it, and which unplaced students had
# ranked it.
project_to_comp = "Kids First Chicago"

# Students without a recorded ranking, i.e. the ones left without a project.
list_of_not_assigned = assignment_df.loc[
    assignment_df["Ranking"].isna(), "Email Address"
].tolist()

prefs_df = data_to_process_dict["ranking"].copy()
is_this_project = prefs_df["Project Name"] == project_to_comp
is_unplaced_student = prefs_df["Email Address"].isin(list_of_not_assigned)
# NOTE(review): presumably rankings >= 100 mark projects the student did not
# really rank -- confirm against generate_data.
has_real_ranking = prefs_df["Ranking"] < 100  # noqa: PLR2004
unmatched_prefs = prefs_df.loc[
    is_this_project & is_unplaced_student & has_real_ranking, :
]

# Attach each unplaced student's experience flag and priority level.
unmatched_info = unmatched_prefs.merge(
    assignment_df[["Email Address", "Experienced", "Priority"]],
    on="Email Address",
    how="left",
)

matched = assignment_df.loc[
    assignment_df["Project Assigned"] == project_to_comp
]

# First the placed students (best ranking first), then the unplaced ones.
display(matched.sort_values("Ranking"), unmatched_info)

Unnamed: 0,Email Address,Priority,Experienced,Project Assigned,Ranking
18,ddanaie@uchicago.edu,1,False,Kids First Chicago,0.0
64,rdrodriguez@uchicago.edu,1,True,Kids First Chicago,0.0
56,mraheem@uchicago.edu,4,False,Kids First Chicago,1.0
63,rcho@uchicago.edu,1,False,Kids First Chicago,4.0


Unnamed: 0,Email Address,Project Name,Ranking,Experienced,Priority
0,dliu1719@uchicago.edu,Kids First Chicago,3.0,False,4
1,wanniyang@uchicago.edu,Kids First Chicago,1.0,False,5
2,xinwenh@uchicago.edu,Kids First Chicago,2.0,False,4
3,yufeil@uchicago.edu,Kids First Chicago,4.0,False,4
4,zekaishen@uchicago.edu,Kids First Chicago,5.0,False,4


In [207]:
# Priority students (Priority == 1) left without a project; expected to be empty.
assignment_df.loc[
    lambda df: df["Project Assigned"].isna() & (df["Priority"] == 1)
]

Unnamed: 0,Email Address,Priority,Experienced,Project Assigned,Ranking


In [208]:
# Students below top priority (Priority > 1) who still received a project.
assignment_df.loc[
    lambda df: (df["Priority"] > 1) & df["Project Assigned"].notna()
]

Unnamed: 0,Email Address,Priority,Experienced,Project Assigned,Ranking
4,arkadeep@uchicago.edu,2,False,RAFI - Grocery,1.0
6,beichiwu@uchicago.edu,2,True,Food System 6,3.0
8,caitlinp@uchicago.edu,2,True,CMAP,3.0
12,charliewang436@uchicago.edu,3,False,Internet Equity Initiative,1.0
14,cyang19@uchicago.edu,4,False,Climate Cabinet,1.0
15,daniel.li3002@gmail.com,2,True,Center for Living Systems,3.0
24,francyhsu@uchicago.edu,2,False,Groundwork Bridgeport,1.0
25,frankvasquez7@uchicago.edu,2,False,Internet Equity Initiative,3.0
26,ftani@uchicago.edu,2,True,Groundwork Bridgeport,1.0
28,gdiaz0618@uchicago.edu,3,False,DSI - Agents,1.0


In [None]:
# Excluded students (Priority == 5) who received a project; expected to be empty.
assignment_df.loc[
    lambda df: (df["Priority"] == 5)  # noqa: PLR2004
    & df["Project Assigned"].notna()
]

Unnamed: 0,Email Address,Priority,Experienced,Project Assigned,Ranking


In [210]:
# Headline counts for the matching.
is_placed = assignment_df["Project Assigned"].notna()

# A placed student with Ranking == 0 is counted here as returning.
n_return = int((is_placed & (assignment_df["Ranking"] == 0)).sum())
n_priority = int((assignment_df["Priority"] == 1).sum())
n_placed = int(is_placed.sum())

print(
    f"{n_return} returning students",
    f"{n_priority} priority students",
    f"{n_placed} total placed students",
    sep="\n",
)

22 returning students
37 priority students
68 total placed students


In [211]:
# Every placed student together with the project they were placed on.
assignment_df.loc[
    assignment_df["Project Assigned"].notna(),
    ["Email Address", "Project Assigned"],
]

Unnamed: 0,Email Address,Project Assigned
0,acherman@uchicago.edu,CMAP
1,akim03@uchicago.edu,IDI
3,alexmosher@uchicago.edu,Argonne
4,arkadeep@uchicago.edu,RAFI - Grocery
5,athuler@uchicago.edu,DSI - Agents
...,...,...
77,vikramr2025@uchicago.edu,IDI
78,vlois@uchicago.edu,DSI - HPC
86,yuchen2@uchicago.edu,Climate Cabinet
88,yurouli@uchicago.edu,Food System 6


In [212]:
# Look up the assignment rows of specific students by email.
emails_to_check = [
    "wanniyang@uchicago.edu",
    "kbarbarossa@uchicago.edu",
    "nalinb@uchicago.edu",
]
assignment_df.loc[assignment_df["Email Address"].isin(emails_to_check)]

Unnamed: 0,Email Address,Priority,Experienced,Project Assigned,Ranking
47,kbarbarossa@uchicago.edu,3,False,Data and Democracy,0.0
58,nalinb@uchicago.edu,5,True,,
79,wanniyang@uchicago.edu,5,False,,


In [213]:
# Number of rostered students per degree program.
roster.loc[:, "Degree Program"].value_counts()

Degree Program
Undergrad: 4th year                                   39
MA or MS 2nd year                                     20
Undergrad: 3rd year                                    6
MA or MS 1st year                                      2
Undergrad 4th Year Bx/MS - BS CAAM, BA Stats, MPCS     1
Name: count, dtype: int64

In [214]:
# Number of rostered students per concentration.
roster.loc[:, "Concentration"].value_counts()

Concentration
Data Science                                            41
MS Computational Analysis and Public Policy (MSCAPP)    11
MA Public Policy (MPP)                                   5
MA Computational Social Science (MACSS)                  3
MS Computer Science (MPCS)                               2
MS Computational and Applied Mathematics                 1
Computer Science                                         1
Cognitive Science                                        1
Physics, Data Science                                    1
MCAM                                                     1
Statistics                                               1
Name: count, dtype: int64

In [219]:
# Concentration (rows) against degree program (columns) for rostered students.
pd.crosstab(
    index=roster["Concentration"],
    columns=roster["Degree Program"],
)

Degree Program,MA or MS 1st year,MA or MS 2nd year,"Undergrad 4th Year Bx/MS - BS CAAM, BA Stats, MPCS",Undergrad: 3rd year,Undergrad: 4th year
Concentration,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Cognitive Science,0,0,0,0,1
Computer Science,0,0,0,0,1
Data Science,0,0,0,6,35
MA Computational Social Science (MACSS),0,3,0,0,0
MA Public Policy (MPP),0,5,0,0,0
MCAM,1,0,0,0,0
MS Computational Analysis and Public Policy (MSCAPP),0,11,0,0,0
MS Computational and Applied Mathematics,0,1,0,0,0
MS Computer Science (MPCS),1,0,1,0,0
"Physics, Data Science",0,0,0,0,1
