# Visualize combined student schedules for each advisor

Faculty need a quick way of summarizing busy and free times for the combination of students they advise

In [1]:
import polars as pl
import altair as alt
import pandas as pd
import re

## Read in CSVs

- 'Mtg Start' and 'Mtg Eng' are read in as strings
- [Chrono formats for `.to_time()` reference](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
(`%r` is a 12-hour time format with AM/PM)
- Polars really only supports UTF-8 text encoding for fast reads, so you don't need to specify that if that's what your file is
- Not doing a lazy read here since the files aren't too big
- **If the IDs were all integers, we would want to force them to be read as a string so we don't lose leading zeros**

### Student class schedule data

In [2]:
students_df = (pl.read_csv('./data/students_deidentified.csv')
          .with_columns(start=pl.col('Mtg Start').str.to_time('%r'),
                        end=pl.col('Mtg End').str.to_time('%r'))
          .drop(pl.col('Mtg Start','Mtg End'))
           )
students_df

StudentID,StudentName,Descr,Subject,Pat,start,end
str,str,str,str,str,time,time
"""062264a""","""Joseph Garner""","""INTRODUCTORY MECHANICS""","""PHYSICS""","""WF""",13:25:00,14:40:00
"""062264a""","""Joseph Garner""","""ACADEMIC WRITING""","""WRITING""","""WF""",15:05:00,16:20:00
"""062264a""","""Joseph Garner""","""MATRICES AND VECTORS""","""MATH""","""TTH""",10:05:00,11:20:00
"""062264a""","""Joseph Garner""","""INTRO TO SIGNALS AND SYSTEMS""","""ECE""","""F""",10:05:00,12:55:00
"""062264a""","""Joseph Garner""","""ADV TOPICS IN DEEP LEARNING""","""ECE""","""MW""",11:45:00,13:00:00
…,…,…,…,…,…,…
"""23a1c08""","""Judith Norris""","""INTRODUCTORY MECHANICS""","""PHYSICS""","""T""",13:30:00,15:30:00
"""23a1c08""","""Judith Norris""","""ACADEMIC WRITING""","""WRITING""","""TTH""",11:45:00,13:00:00
"""23a1c08""","""Judith Norris""","""ADV TOPICS IN DEEP LEARNING""","""ECE""","""MW""",15:05:00,16:20:00
"""23a1c08""","""Judith Norris""","""INTRO TO SIGNALS AND SYSTEMS""","""ECE""","""W""",10:05:00,12:55:00


### Student advisor assignments data

In [3]:
advisors_df = pl.read_csv('./data/advisors_deidentified.csv')
advisors_df

StudentID,StudentName,AdvisorName,AdvisorID
str,str,str,str
"""f1010e9""","""Misty Lee""","""Sheri Mosley""","""28db778"""
"""7d8a6c1""","""Rebecca Thomas""","""Sheri Mosley""","""28db778"""
"""3d66893""","""Karen Clark""","""Sheri Mosley""","""28db778"""
"""f151ba8""","""Maxwell Kirby""","""Sheri Mosley""","""28db778"""
"""d48c1d6""","""Maria Robertson""","""Sheri Mosley""","""28db778"""
…,…,…,…
"""fde1944""","""Gina Wolfe""","""Philip Gallagher""","""5cb893c"""
"""1315a19""","""Dalton Wu""","""Philip Gallagher""","""5cb893c"""
"""52fbc5f""","""Heidi Stafford""","""Philip Gallagher""","""5cb893c"""
"""13704ca""","""Neil Caldwell""","""Philip Gallagher""","""5cb893c"""


### Join the two dataframes

- Some students don't have an advisor – filling null AdvisorName with placeholder string
- StudentName shows up in both DataFrames so can drop one of them

In [4]:
students_advisors_df = (pl.DataFrame
          .join(students_df, advisors_df, on='StudentID', how='left')
          .with_columns(pl.col('AdvisorName').fill_null(pl.lit('No Advisor')))
          .drop(pl.col('StudentName_right'))
         )
students_advisors_df

StudentID,StudentName,Descr,Subject,Pat,start,end,AdvisorName,AdvisorID
str,str,str,str,str,time,time,str,str
"""062264a""","""Joseph Garner""","""INTRODUCTORY MECHANICS""","""PHYSICS""","""WF""",13:25:00,14:40:00,"""Henry Schroeder""","""e6d6592"""
"""062264a""","""Joseph Garner""","""ACADEMIC WRITING""","""WRITING""","""WF""",15:05:00,16:20:00,"""Henry Schroeder""","""e6d6592"""
"""062264a""","""Joseph Garner""","""MATRICES AND VECTORS""","""MATH""","""TTH""",10:05:00,11:20:00,"""Henry Schroeder""","""e6d6592"""
"""062264a""","""Joseph Garner""","""INTRO TO SIGNALS AND SYSTEMS""","""ECE""","""F""",10:05:00,12:55:00,"""Henry Schroeder""","""e6d6592"""
"""062264a""","""Joseph Garner""","""ADV TOPICS IN DEEP LEARNING""","""ECE""","""MW""",11:45:00,13:00:00,"""Henry Schroeder""","""e6d6592"""
…,…,…,…,…,…,…,…,…
"""23a1c08""","""Judith Norris""","""INTRODUCTORY MECHANICS""","""PHYSICS""","""T""",13:30:00,15:30:00,"""Angie Moon""","""e7e131b"""
"""23a1c08""","""Judith Norris""","""ACADEMIC WRITING""","""WRITING""","""TTH""",11:45:00,13:00:00,"""Angie Moon""","""e7e131b"""
"""23a1c08""","""Judith Norris""","""ADV TOPICS IN DEEP LEARNING""","""ECE""","""MW""",15:05:00,16:20:00,"""Angie Moon""","""e7e131b"""
"""23a1c08""","""Judith Norris""","""INTRO TO SIGNALS AND SYSTEMS""","""ECE""","""W""",10:05:00,12:55:00,"""Angie Moon""","""e7e131b"""


### Aside: these students don't have advisors

In [5]:
(students_advisors_df
 .filter(pl.col('AdvisorName') == "No Advisor")
 .select(pl.col('StudentID','StudentName')).unique()
)

StudentID,StudentName
str,str
"""a763934""","""Jim Fisher"""
"""5fe5193""","""Daniel Miller"""
"""9d04ed2""","""Caitlin Adams"""
"""a1964aa""","""Christian Stewart"""
"""1ef6826""","""Brendan Dodson"""
…,…
"""14aafd7""","""David Lewis"""
"""d8c13bf""","""Jacob Wells"""
"""6a2617f""","""Leslie Willis"""
"""55bd99b""","""Elizabeth Johnson"""


### Aside: These classes don't have start times

In [6]:
(students_advisors_df
 .filter(pl.col('start').is_null())
 .group_by('Subject','Descr')
 .agg(count = pl.col('StudentID').count())
 .sort('count', descending=True)
)

Subject,Descr,count
str,str,u32
"""EGR""","""RESEARCH PROJECTS IN EGR""",52
"""CEE""","""ENGINEERING THE PLANET""",10
"""MUSIC""","""INTRO GUITAR CLASS""",2
"""MUSIC""","""SYMPHONY ORCHESTRA""",2
"""MUSIC""","""FLUTE""",1
…,…,…
"""MUSIC""","""WIND SYMPHONY""",1
"""ISS""","""INFORMATION, SOCIETY & CULTURE""",1
"""MUSIC""","""SAXOPHONE""",1
"""MUSIC""","""MEET THE BEATLES AND THE 1960S""",1


### Replace Pat with weekday lists and explode the weekday lists into rows

- **Decide below whether you want the long or abbreviated forms of the weekdays in the vis**

### Construct dictionary to map day patterns to their corresponding lists of days

- *NOTE: Decide here whether you want the long or abbreviated forms of the weekdays in the vis*
- Going to use H instead of TH for Thursday
- Using Pandas Series since you can use slice notation on them for day ranges with dashes
- **The only advantage of this is that you are adapting to the real data and don't need to explicitly define the mappings ahead of time**

In [27]:
# days_version = "full"
days_version = "abbreviation"

days_letters = ['M', 'T', 'W', 'H', 'F']
days_list_full = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
days_list_abbrev = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri']

# Using Pandas Series since you can use slice notation on them for day ranges with dashes
if days_version.lower() == "full":
    classdays_series = pd.Series(dict(zip(days_letters, days_list_full)))
else:
    classdays_series = pd.Series(dict(zip(days_letters, days_list_abbrev)))

# Create a dictionary to map all day patterns to their corresponding lists of days
classdays_dict = {}
for day_pattern in students_advisors_df.get_column('Pat').unique().to_list():
    # There are some null day patterns
    if day_pattern is not None:
        # Thursday is the only day with a two-character abbreviation
        day_pattern_noTH = day_pattern.replace('TH','H')
        if '-' in day_pattern_noTH:
            # Handle patterns like 'M-W', 'T-H', etc.
            match = re.search(r'([A-Z])-([A-Z])', day_pattern_noTH)
            classdays_dict[day_pattern] = classdays_series[slice(match.group(1),match.group(2))].to_list()
        else:
            # Handle single day patterns like 'MWF', 'TTH', etc.
            classdays_dict[day_pattern] = classdays_series[list(day_pattern_noTH)].to_list()


classdays_dict

{'TH': ['Thurs'],
 'W': ['Wed'],
 'MF': ['Mon', 'Fri'],
 'T': ['Tues'],
 'MW': ['Mon', 'Wed'],
 'TTH': ['Tues', 'Thurs'],
 'F': ['Fri'],
 'M': ['Mon'],
 'MTTH': ['Mon', 'Tues', 'Thurs'],
 'WF': ['Wed', 'Fri'],
 'MWF': ['Mon', 'Wed', 'Fri']}

In [28]:
sa_w_weekdays = (students_advisors_df
           .with_columns(weekday=pl.col('Pat').replace_strict(classdays_dict))
           .explode('weekday')
         )
sa_w_weekdays

StudentID,StudentName,Descr,Subject,Pat,start,end,AdvisorName,AdvisorID,weekday
str,str,str,str,str,time,time,str,str,str
"""062264a""","""Joseph Garner""","""INTRODUCTORY MECHANICS""","""PHYSICS""","""WF""",13:25:00,14:40:00,"""Henry Schroeder""","""e6d6592""","""Wed"""
"""062264a""","""Joseph Garner""","""INTRODUCTORY MECHANICS""","""PHYSICS""","""WF""",13:25:00,14:40:00,"""Henry Schroeder""","""e6d6592""","""Fri"""
"""062264a""","""Joseph Garner""","""ACADEMIC WRITING""","""WRITING""","""WF""",15:05:00,16:20:00,"""Henry Schroeder""","""e6d6592""","""Wed"""
"""062264a""","""Joseph Garner""","""ACADEMIC WRITING""","""WRITING""","""WF""",15:05:00,16:20:00,"""Henry Schroeder""","""e6d6592""","""Fri"""
"""062264a""","""Joseph Garner""","""MATRICES AND VECTORS""","""MATH""","""TTH""",10:05:00,11:20:00,"""Henry Schroeder""","""e6d6592""","""Tues"""
…,…,…,…,…,…,…,…,…,…
"""23a1c08""","""Judith Norris""","""ADV TOPICS IN DEEP LEARNING""","""ECE""","""MW""",15:05:00,16:20:00,"""Angie Moon""","""e7e131b""","""Mon"""
"""23a1c08""","""Judith Norris""","""ADV TOPICS IN DEEP LEARNING""","""ECE""","""MW""",15:05:00,16:20:00,"""Angie Moon""","""e7e131b""","""Wed"""
"""23a1c08""","""Judith Norris""","""INTRO TO SIGNALS AND SYSTEMS""","""ECE""","""W""",10:05:00,12:55:00,"""Angie Moon""","""e7e131b""","""Wed"""
"""23a1c08""","""Judith Norris""","""MATRICES AND VECTORS""","""MATH""","""TTH""",08:30:00,09:45:00,"""Angie Moon""","""e7e131b""","""Tues"""


### Make time ranges in 5 min intervals out of start-end

- Also turn times into strings
- Just keep the rows we'll need for the visualization
- Decided to set `closed='left'` so don't include the end time in list

In [29]:
(sa_w_weekdays
 .with_columns(Time=pl.time_ranges("start", "end", interval='5m', closed='left'))
 .select(pl.col('StudentName','weekday','start','end','Time'))
 .head(3)
)

StudentName,weekday,start,end,Time
str,str,time,time,list[time]
"""Joseph Garner""","""Wed""",13:25:00,14:40:00,"[13:25:00, 13:30:00, … 14:35:00]"
"""Joseph Garner""","""Fri""",13:25:00,14:40:00,"[13:25:00, 13:30:00, … 14:35:00]"
"""Joseph Garner""","""Wed""",15:05:00,16:20:00,"[15:05:00, 15:10:00, … 16:15:00]"


### Explode takes lists and makes each element into one row

- Other columns get repeated
- Turn times into strings for visualization
    - Note: the "time" data type is not supported in Altair so we convert to string
    - It may seem a little strange, but the only datatype supported by Altair is a real datetime. It's kind of nice that Polars has a time datatype, but even Polars doesn't have dataframe.time methods, only dataframe.dt methods (which can't act on "time" type data), so it's perhaps not surprising that Altair doesn't support it
- Using the `.alias()` form in `.select()` since prefer this column order and can't put "positional" arguments after "keyword" ones

In [30]:
class_day_time_df = (
    sa_w_weekdays
     .with_columns(Time=pl.time_ranges("start", "end", interval='5m', closed='left'))
     .explode('Time')
     .select(pl.col('Time').dt.strftime("%H:%M"),
             pl.col('weekday').alias('Day'),
             pl.col('Descr').alias('Class'),
             pl.col('StudentName'),
             pl.col('AdvisorName')
            )
)

class_day_time_df

Time,Day,Class,StudentName,AdvisorName
str,str,str,str,str
"""13:25""","""Wed""","""INTRODUCTORY MECHANICS""","""Joseph Garner""","""Henry Schroeder"""
"""13:30""","""Wed""","""INTRODUCTORY MECHANICS""","""Joseph Garner""","""Henry Schroeder"""
"""13:35""","""Wed""","""INTRODUCTORY MECHANICS""","""Joseph Garner""","""Henry Schroeder"""
"""13:40""","""Wed""","""INTRODUCTORY MECHANICS""","""Joseph Garner""","""Henry Schroeder"""
"""13:45""","""Wed""","""INTRODUCTORY MECHANICS""","""Joseph Garner""","""Henry Schroeder"""
…,…,…,…,…
"""09:20""","""Thurs""","""MATRICES AND VECTORS""","""Judith Norris""","""Angie Moon"""
"""09:25""","""Thurs""","""MATRICES AND VECTORS""","""Judith Norris""","""Angie Moon"""
"""09:30""","""Thurs""","""MATRICES AND VECTORS""","""Judith Norris""","""Angie Moon"""
"""09:35""","""Thurs""","""MATRICES AND VECTORS""","""Judith Norris""","""Angie Moon"""


### Get a unique list of the times for visualization sorting

- The list of weekday names sets the column order in the visualization

In [31]:
times_list = (class_day_time_df
 .get_column('Time')
 .unique()
 .sort()
 .to_list()
)
print(times_list[:5])

if days_version.lower()=="abbreviation":
    days_list = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri']
else:
    days_list = ['Monday','Tuesday','Wednesday','Thursday','Friday']

[None, '08:30', '08:35', '08:40', '08:45']


### Get a unique list of advisors for choosing and/or looping

In [32]:
advisors_list = (class_day_time_df
 .get_column('AdvisorName')
 .unique()
 .sort()
 .to_list()
)
print(advisors_list[:5])
print('There are', len(advisors_list), 'advisors')

['Angie Moon', 'Beth Mcconnell', 'Brett Mejia', 'Bridget Holloway', 'Brittney Nichols']
There are 27 advisors


## Visualize

- I'm doing the grouping and aggregation in Polars so we won't run into the problem of too many data rows in Altair

In [33]:
advisor = advisors_list[1]

onlyHeat = (class_day_time_df
            .filter(pl.col('AdvisorName') == advisor)
            .group_by('AdvisorName','Day','Time')
            .agg(pl.col('Class').count().alias('Count'),
                 pl.col('StudentName').str.join('; ').alias('Students'))
            .plot.rect(
                x=alt.X('Day:O', axis = alt.Axis(labelAngle=0), # for some reason vertical labels by default
                          sort=days_list, 
                          scale=alt.Scale(domain=days_list), title=''),
                y=alt.Y('Time:O', 
                          title='time of day', 
                          scale=alt.Scale(domain=times_list),
                          axis=alt.Axis(labelOverlap='parity')), # default labelOverlap=false for this type of axis
                color = alt.Color('Count:Q', 
                                  scale=alt.Scale(scheme='blues'), 
                                  legend=alt.Legend(title='# students')),
                tooltip = ['Day','Time','Count','Students']
            ).properties(
                width=200,
                height=600,
                title=advisor
            )
           )
onlyHeat

## Faceted by advisor

- **Need Vegafusion for this since it's a lot more rows**
- Consider whether to filter out "No Advisor" since that gets the most counts, so the rest get lighter in their heatmap colors

In [34]:
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [35]:
onlyHeat = (class_day_time_df
            .filter(pl.col('AdvisorName')!='No Advisor')
            .group_by('AdvisorName','Day','Time')
            .agg(pl.col('Class').count().alias('Count'),
                 pl.col('StudentName').str.join('; ').alias('Students'))
            .plot.rect(
                x=alt.X('Day:O', axis = alt.Axis(labelAngle=0), 
                          sort=days_list, 
                          scale=alt.Scale(domain=tuple(days_list)), title=''),
                y=alt.Y('Time:O', 
                          title='time of day', 
                          scale=alt.Scale(domain=times_list),
                         axis=alt.Axis(labelOverlap='parity')),
                color = alt.Color('Count:Q', 
                                  scale=alt.Scale(scheme='blues'), 
                                  legend=alt.Legend(title='# students')),
                tooltip = ['Day','Time','Count','Students']
            ).properties(
                width=120,
                height=200,
                title=advisor
            ).facet(
                facet="AdvisorName",
                columns=5
            )
           )
onlyHeat

### At one point had trouble with doubled-up students in the tooltip

- Saw that there were multiple classes for one person during that time! What would lead to that...? There must be some issue during the class swap. I feel like that's the most likely...
- Fixed it by joining class descriptions for permuted ones on both Subject and Descr. Still not completely sure why that fixed it...

In [36]:
advisor = advisors_list[2]

(class_day_time_df
    .filter(pl.col('AdvisorName')==advisor)
    .group_by('AdvisorName','Day','Time','StudentName','Class')
    .agg(pl.col('Class').count().alias('Count'),
         pl.col('StudentName').str.join('; ').alias('Students')
        )
    .filter((pl.col('Time')=='12:10') & (pl.col('Day')=="Wed"))
    .sort('StudentName')
)

AdvisorName,Day,Time,StudentName,Class,Count,Students
str,str,str,str,str,u32,str
"""Brett Mejia""","""Wed""","""12:10""","""Justin Young""","""INTERMEDIATE MECHANICS""",1,"""Justin Young"""
"""Brett Mejia""","""Wed""","""12:10""","""Laura Howe""","""INTERMEDIATE MECHANICS""",1,"""Laura Howe"""
"""Brett Mejia""","""Wed""","""12:10""","""Melissa Hurley""","""INTERMEDIATE MECHANICS""",1,"""Melissa Hurley"""
"""Brett Mejia""","""Wed""","""12:10""","""Shawna Gregory""","""INTERMEDIATE MECHANICS""",1,"""Shawna Gregory"""
"""Brett Mejia""","""Wed""","""12:10""","""Sherry Garcia""","""INTERMEDIATE MECHANICS""",1,"""Sherry Garcia"""
"""Brett Mejia""","""Wed""","""12:10""","""Tammy Martin""","""INTERMEDIATE MECHANICS""",1,"""Tammy Martin"""
"""Brett Mejia""","""Wed""","""12:10""","""Tony Salas""","""INTERMEDIATE MECHANICS""",1,"""Tony Salas"""
"""Brett Mejia""","""Wed""","""12:10""","""Valerie White""","""ACADEMIC WRITING""",1,"""Valerie White"""
