# Determine Reference Classes

This notebook implements Section 4.5 of the report. It only illustrates what the file `determine_reference_classes.py` does; no data is saved here.

For each UT (unit test) sequence in each target class, all possible reference classes are determined. A reference class rc for a target class cid and a UT sequence ts has to fulfill the following two requirements (IU = in-unit):
- At least one student in rc must have completed ts.
- At least one student in rc must have completed at least one IU sequence that belongs to ts and that was also completed by at least one student in cid.

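The possible reference classes are stored together with some further information in a dictionary. The keys are the class_ids (cid) from df for which at least one reference class candidate was found; classes without any candidate are left out of the dictionary (the loop below collects them in `classes_wo_reference_class`). The value for each key is another dictionary with the following items: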
- students: list of students (student_id's) in cid
- problems: list of all problems (problem_id's) completed by students in this cid
- details: dictionary whose keys are the UT sequences that have been completed within the cid; each value is a further dictionary with the following items:
    - reference_classes: list of possible reference classes (class_ids) for this cid and UT sequence, or None if no other class qualifies
    - iu_seq: the IU sequences (sequence_ids) that are linked to the UT sequence through assignments of this cid
- ref_classes_complete: list of all reference class candidates for all ts in cid

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd

from itertools import chain

import sys
import os
sys.path.append(os.path.abspath('../../sources'))

import utils
from data_preparation import determine_reference_classes

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the prepared dataset into df; every later cell derives its lookups from it.
# utils.read_data_file is a project helper -- judging by the warning echoed
# below this cell it wraps pd.read_csv(config.DATA_FOLDER / filename).
df = utils.read_data_file("final_data_main_approach.csv")

  return pd.read_csv(config.DATA_FOLDER / filename)


In [3]:
# Load the UT <-> IU assignment links and shorten the two id column names.
column_aliases = {
    "unit_test_assignment_log_id": "ut_id",
    "in_unit_assignment_log_id": "iu_id",
}
assignment_relationships = utils.read_assignment_relationships().rename(
    columns=column_aliases
)

# Assignment ids that actually occur in df, split by assignment type.
ut_ass = df.loc[df["unit_test"] == 1, "assignment_log_id"].unique()
iu_ass = df.loc[df["unit_test"] == 0, "assignment_log_id"].unique()

# Keep a link only if BOTH of its ends are present in df, then renumber the rows.
both_ends_in_df = assignment_relationships["ut_id"].isin(
    ut_ass
) & assignment_relationships["iu_id"].isin(iu_ass)
assignment_relationships = assignment_relationships.loc[both_ends_in_df].reset_index(
    drop=True
)

In [4]:
# One row per assignment, keyed by assignment_log_id, so that sequence, class
# and student can be looked up for any assignment id.
ass_info = df.drop_duplicates(subset="assignment_log_id").set_index("assignment_log_id")

# Enrich every UT/IU link in a single chain:
#   1. attach sequence, class and student of the UT assignment,
#   2. attach the sequence of the IU assignment (the suffixes keep the two
#      sequence_id columns apart),
#   3. switch to the short column names used in the rest of the notebook.
# Both merges are left joins against the unique assignment_log_id index, so no
# relationship row is dropped or duplicated.
ass_seq = (
    assignment_relationships.merge(
        ass_info[["sequence_id", "class_id", "student_id"]],
        how="left",
        left_on="ut_id",
        right_index=True,
    )
    .merge(
        ass_info[["sequence_id"]],
        how="left",
        left_on="iu_id",
        right_index=True,
        suffixes=["_ut", "_iu"],
    )
    .rename(
        columns={
            "ut_id": "ut_ass",
            "iu_id": "iu_ass",
            "sequence_id_ut": "ut_seq",
            "sequence_id_iu": "iu_seq",
            "class_id": "ut_class",
        }
    )
)
ass_seq

Unnamed: 0,ut_ass,iu_ass,ut_seq,ut_class,student_id,iu_seq
0,38M6IA4SS,2DQG3SWWLS,CD76U7XEG,2JFV80TTBO,1VUKTJH0DS,2JJ7KO37DN
1,15XW17EHLW,Y3G0XTLMF,CD76U7XEG,C4EIV9P0E,PI712SWCZ,2JJ7KO37DN
2,2C5IG7FC12,1HLYER60XW,CD76U7XEG,C4EIV9P0E,70FNFVSRP,2JJ7KO37DN
3,F9OJCBCRM,1XB8H1OIF8,CD76U7XEG,EGEHUE9HG,11WHZOTNO5,2JJ7KO37DN
4,K6IE7PFC7,RIFQE6J73,CD76U7XEG,1FN3UGSKCC,40KA444QZ,2JJ7KO37DN
...,...,...,...,...,...,...
355232,2PNRH0FF5C,FKXJ6DJSD,2OMFIAVLSP,1QV504I70A,20WNQ1L59I,L1JX9SVN8
355233,2PNRH0FF5C,221C5JLHVL,2OMFIAVLSP,1QV504I70A,20WNQ1L59I,2H7PPQFXD1
355234,2PNRH0FF5C,5TT39WSHM,2OMFIAVLSP,1QV504I70A,20WNQ1L59I,1Q7ENQHCE1
355235,2PNRH0FF5C,1XVZ3E4Z91,2OMFIAVLSP,1QV504I70A,20WNQ1L59I,6XV73PRE3


Reference classes are restricted to relevant classes by considering only "ut_class", i.e. the class that the UT assignment belongs to. The class of the IU assignment plays no role.

In [5]:
# prepare helper lookups
# ass_seq is re-indexed by (ut_seq, iu_seq) at the end of this cell. If the cell
# has already been executed, undo that first so the groupbys below always see
# "ut_seq" and "iu_seq" as columns; without this, a second run of the cell
# fails with a KeyError.
ass_seq_flat = (
    ass_seq.reset_index()
    if list(ass_seq.index.names) == ["ut_seq", "iu_seq"]
    else ass_seq
)

# UT sequences per class, and IU sequences per (class, UT sequence)
ut_seq_per_class = ass_seq_flat.groupby("ut_class")["ut_seq"].apply(pd.unique)
iu_seq_per_cid_ts = ass_seq_flat.groupby(["ut_class", "ut_seq"])["iu_seq"].apply(
    pd.unique
)

# index by (ut_seq, iu_seq) for the lookups in the next cell
ass_seq = ass_seq_flat.set_index(["ut_seq", "iu_seq"])

# list of students per class (only rows flagged as unit test are considered)
stud_per_class = (
    df.loc[df["unit_test"] == 1].groupby("class_id")["student_id"].unique().apply(list)
)

# array of distinct problems for each student (all rows of df)
prob_per_stud = df.groupby("student_id")["problem_id"].unique()

In [6]:
do_print = False
class_to_reference_class = {}
classes_wo_reference_class = []


def _trace(*parts):
    """Print a progress message, but only while do_print is switched on."""
    if do_print:
        print(*parts)


for cid in ass_seq["ut_class"].unique():
    _trace("Class ID:", cid)
    # per-UT-sequence results and the union of all candidates for this class
    per_ts_details = {}
    all_candidates = set()

    # UT sequences completed within this class
    completed_ts = ut_seq_per_class.loc[cid]
    _trace("Completed test sequences:", completed_ts)

    for ts in completed_ts:
        _trace("Test Sequence:", ts)
        # IU sequences linked to ts through assignments of this class
        iu_seq = iu_seq_per_cid_ts[cid, ts]
        _trace("Relevant In Unit Sequence IDs:", len(iu_seq), iu_seq)

        # every class that worked on ts and on at least one of those IU sequences;
        # cid's own rows are part of this lookup, hence the "more than one" test
        classes_on_ts = ass_seq.loc[ts].loc[iu_seq, "ut_class"].unique()
        if len(classes_on_ts) <= 1:
            reference_classes = None
            _trace("Class has no reference classes")
        else:
            reference_classes = [other for other in classes_on_ts if other != cid]
            all_candidates.update(reference_classes)
            _trace("Reference Classes:", len(reference_classes), reference_classes)

        # record the outcome for this test sequence
        per_ts_details[ts] = {"reference_classes": reference_classes, "iu_seq": iu_seq}

    # students of the class and the distinct problems those students worked on
    cid_studs = stud_per_class.loc[cid]
    cid_probs = set(chain.from_iterable(prob_per_stud.loc[cid_studs]))

    # a class without a single candidate is only remembered by id
    if not all_candidates:
        classes_wo_reference_class.append(cid)
    else:
        class_to_reference_class[cid] = {
            "students": cid_studs,
            "problems": list(cid_probs),
            "ref_classes_complete": list(all_candidates),
            "details": per_ts_details,
        }

In [7]:
# running totals over all classes stored in class_to_reference_class
count_ts_total = count_ts_rc = count_ts_no_rc = count_cid_ts_no_rc = 0

for class_entry in class_to_reference_class.values():
    # one flag per UT sequence: True when reference classes are stored for it
    has_rc = [
        bool(ts_entry["reference_classes"])
        for ts_entry in class_entry["details"].values()
    ]
    n_with_rc = sum(has_rc)
    n_without_rc = len(has_rc) - n_with_rc

    count_ts_total += len(has_rc)
    count_ts_rc += n_with_rc
    count_ts_no_rc += n_without_rc
    # a class is counted once, however many of its UT sequences lack reference classes
    if n_without_rc > 0:
        count_cid_ts_no_rc += 1

print("Num classes in class_to_reference_class:", len(class_to_reference_class))
print("Num classes without any reference class:", len(classes_wo_reference_class))
print("Num classes with at least one ts without reference class:", count_cid_ts_no_rc)
print("Num test sequences total:", count_ts_total)
print("Num test sequences with reference class:", count_ts_rc)
print("Num test sequences without reference class:", count_ts_no_rc)

Num classes in class_to_reference_class: 1523
Num classes without any reference class: 9
Num classes with at least one ts without reference class: 8
Num test sequences total: 2351
Num test sequences with reference class: 2340
Num test sequences without reference class: 11


### Getting Reference Classes via Function

In [8]:
# Reload the dataset and build the mapping with the packaged function instead
# of the step-by-step cells above. Both names are rebound here, so from this
# point on class_to_reference_class holds the function's result.
df = utils.read_data_file("final_data_main_approach.csv")
class_to_reference_class = determine_reference_classes.get_reference_classes(df)

  return pd.read_csv(config.DATA_FOLDER / filename)


In [9]:
# same summary as for the manual computation, now for the function's result
count_ts_total = 0
count_ts_rc = 0
count_ts_no_rc = 0
count_cid_ts_no_rc = 0

for cid_dict in class_to_reference_class.values():
    ts_entries = cid_dict["details"].values()
    # UT sequences of this class for which no reference classes are stored
    n_missing = sum(1 for ts_dict in ts_entries if not ts_dict["reference_classes"])

    count_ts_total += len(ts_entries)
    count_ts_no_rc += n_missing
    count_ts_rc += len(ts_entries) - n_missing
    # count the class once if any of its UT sequences has no reference class
    if n_missing:
        count_cid_ts_no_rc += 1

print("Num classes in class_to_reference_class:", len(class_to_reference_class))
# The number of classes without any reference class is not reported here: the
# function call above is bound to class_to_reference_class only, so this cell
# has no list of excluded classes coming from the function.
print("Num classes with at least one ts without reference class:", count_cid_ts_no_rc)
print("Num test sequences total:", count_ts_total)
print("Num test sequences with reference class:", count_ts_rc)
print("Num test sequences without reference class:", count_ts_no_rc)

Num classes in class_to_reference_class: 1523
Num classes with at least one ts without reference class: 8
Num test sequences total: 2351
Num test sequences with reference class: 2340
Num test sequences without reference class: 11
