# Restrict Reference Classes based on CF

In the file `determine_reference_classes.py` a function is provided that delivers all possible reference classes for a class and UT sequence. Some experiments are performed for all of those classes. For computational reasons, this cannot be done for all approaches.

Hence, in this notebook we restricted the possible reference classes to one reference class per class and UT sequence. A reference class is selected randomly (as described in Section 4.5 of the report). The corresponding class - UT sequence - reference class combinations are stored in the file `ref_classes_restricted_random.csv` in the `data` folder. We refer to those reference classes as restricted random reference classes.

Furthermore, other choices of reference classes might be interesting to analyze. Therefore, we tested some approaches on a restricted set of reference classes. For each target class and UT sequence the following choices of reference classes are determined:
- a random reference class (the same as is stored in `ref_classes_restricted_random.csv`)
- the reference classes that performed best and worst in terms of accuracy and f1 score, respectively: We consider the experimental results from the CF approach "knn_sim_manhattan_pred_resnick_k_3" which was tested for all possible reference classes. We grouped by class, UT sequence and reference class and computed the mean of accuracy or f1 score across all students. Then for each class and UT sequence we chose the reference class that achieved the best or worst mean performance.
- a random reference class with 5-10 students, with 11-20 students and with more than 20 students

Those extended restricted reference classes are stored in `ref_classes_restricted.csv` in the `data` folder. 

There is an additional restriction to ensure that the reference class works for as many students in the target class as possible: for each class - UT sequence combination, we count for how many students each reference class can produce predictions, and then keep only those reference classes that reach the maximum count for that combination.

The dictionary containing all possible reference classes can then be transformed to a dictionary only containing the reference classes specified in a `csv` file by using the function `restrict_c2rc_dict()` in `determine_reference_classes.py`.

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

import sys
import os
sys.path.append(os.path.abspath("../../sources"))

import config
import utils
from data_preparation import determine_reference_classes

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set to True to (re-)write the main output files
# `ref_classes_restricted.csv` and `ref_classes_restricted_random.csv`.
SAVE_PRIMARY_FILES = False
# Set to True to additionally write one csv file per selection strategy
# (best/worst f1, best/worst accuracy, number of students).
SAVE_SECONDARY_FILES = False

In [3]:
def read_cf_eval(
    method: str, pred: str, k: int, parts: tuple[int, ...] = (1, 2, 3, 4)
) -> pd.DataFrame:
    """Read and combine the evaluation results of one CF configuration.

    The results are split over several files, one per part. For every part the
    file ``{method}_sim_manhattan_pred_{pred}_k_{k}_part{part}`` is read from
    the "collaborative_filtering" results via ``utils.read_evaluation_df``
    (called with ``latest=True``; presumably this selects the most recent
    timestamped file -- confirm in ``utils``).

    Args:
        method: Method name as used in the file name (e.g. "knn").
        pred: Prediction function name as used in the file name
            (e.g. "resnick").
        k: Value of ``k`` as used in the file name.
        parts: Part numbers to read. Defaults to the four parts 1-4.

    Returns:
        One DataFrame with the rows of all parts, indexed by
        (class_id, ut_id, student_id, ref_class), rounded to
        ``config.ROUND_DECIMALS`` decimals and without the columns
        ``y_true`` and ``y_pred``.
    """
    index_cols = ["class_id", "ut_id", "student_id", "ref_class"]
    part_frames = []
    for part in parts:
        filename = f"{method}_sim_manhattan_pred_{pred}_k_{k}_part{part}"
        eval_df = utils.read_evaluation_df(
            "collaborative_filtering", filename, latest=True
        )
        part_frames.append(
            eval_df.set_index(index_cols).round(config.ROUND_DECIMALS)
        )
    # Concatenate once at the end: growing a frame inside the loop re-copies
    # all previously read rows on every iteration and needs an empty seed
    # frame, whose handling in pd.concat is deprecated.
    eval_comp = pd.concat(part_frames)
    return eval_comp.drop(columns=["y_true", "y_pred"])

In [4]:
# evaluation results of the CF approach "knn_sim_manhattan_pred_resnick_k_3"
eval_df = read_cf_eval(method="knn", pred="resnick", k=3)

Read file knn_sim_manhattan_pred_resnick_k_3_part1_20240731_003317.csv
Read file knn_sim_manhattan_pred_resnick_k_3_part2_20240731_071418.csv
Read file knn_sim_manhattan_pred_resnick_k_3_part3_20240801_102209.csv
Read file knn_sim_manhattan_pred_resnick_k_3_part4_20240801_181733.csv


In [5]:
# Restrict eval_df to the candidate reference classes, i.e. to those reference
# classes that have the maximum number of rows within their
# class - UT sequence combination.
# Number of rows per (class, UT sequence, reference class); presumably one row
# per student for whom the reference class produced predictions -- this assumes
# student_id is unique within each such combination.
num_stud_per_rc = eval_df.groupby(["class_id", "ut_id", "ref_class"]).size()
# Maximum of those counts per (class, UT sequence), aligned with num_stud_per_rc.
max_num_stud = num_stud_per_rc.groupby(["class_id", "ut_id"]).transform("max")
# Keep every (class, UT sequence, reference class) that reaches the maximum;
# ties are all kept.
rc_cand = num_stud_per_rc[num_stud_per_rc == max_num_stud].index
# Move student_id out of the index so that rows can be selected by rc_cand.
eval_df = eval_df.reset_index().set_index(["class_id", "ut_id", "ref_class"])
# Select the candidate rows and restore student_id as the last index level.
# NOTE: the resulting row order is what the seeded random sampling further
# below operates on.
eval_df = eval_df.loc[rc_cand].reset_index().set_index(["class_id", "ut_id", "ref_class", "student_id"])
eval_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,num_ut_probs,num_iu_probs,mean_ut_perf,mean_iu_perf,num_stud_rc,max_num_iu_probs_rc,mean_iu_perf_rc,mean_ut_perf_rc,mae,mse,...,precision_lim_50,recall_lim_50,acc_lim_70,f1_lim_70,precision_lim_70,recall_lim_70,acc_lim_dynamic,f1_lim_dynamic,precision_lim_dynamic,recall_lim_dynamic
class_id,ut_id,ref_class,student_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
100VH25818,1JP23ZDV6R,1M6QPX9MCM,14P82837Z2,23,1,0.7391,0.0000,4,1,0.5000,0.7609,0.4130,0.2935,...,0.8750,0.4118,0.5217,0.5600,0.8750,0.4118,0.6522,0.7778,0.7368,0.8235
100VH25818,1JP23ZDV6R,1M6QPX9MCM,1D3TNZ3SWY,23,7,0.7391,0.8571,5,7,0.7714,0.7609,0.3292,0.1503,...,0.7727,1.0000,0.6957,0.7586,0.9167,0.6471,0.2609,0.0000,0.0000,0.0000
100VH25818,1JP23ZDV6R,1M6QPX9MCM,1GCNU4RVTR,5,7,0.4000,0.8571,5,7,0.7714,0.6333,0.3696,0.1653,...,0.5000,1.0000,0.8000,0.6667,1.0000,0.5000,0.6000,0.0000,0.0000,0.0000
100VH25818,1JP23ZDV6R,1M6QPX9MCM,3AFAKYP4D,23,21,1.0000,0.9524,6,21,0.7937,0.7609,0.0660,0.0288,...,1.0000,0.9565,0.9565,0.9778,1.0000,0.9565,0.8261,0.9048,1.0000,0.8261
100VH25818,1JP23ZDV6R,1M6QPX9MCM,UAMN5USA,23,1,1.0000,0.0000,4,1,0.5000,0.7609,0.4130,0.2935,...,1.0000,0.3478,0.3478,0.5161,1.0000,0.3478,0.8261,0.9048,1.0000,0.8261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZY24QW3NN,2DRIUYX4MA,QPLQTOX8B,G7X3X8X97,17,23,0.5882,0.6957,8,9,0.2778,0.5706,0.3692,0.2878,...,0.6250,1.0000,0.5882,0.6957,0.6154,0.8000,0.5882,0.6957,0.6154,0.8000
ZY24QW3NN,2DRIUYX4MA,QPLQTOX8B,GKHQ3B0QN,17,23,0.5882,0.4783,8,9,0.2778,0.5706,0.3675,0.2319,...,0.6000,0.9000,0.7059,0.7368,0.7778,0.7000,0.5882,0.7200,0.6000,0.9000
ZY24QW3NN,2DRIUYX4MA,QPLQTOX8B,TYR869IGO,17,14,0.5882,0.4286,8,6,0.3125,0.5706,0.3149,0.1759,...,0.7692,1.0000,0.8235,0.8571,0.8182,0.9000,0.7059,0.8000,0.6667,1.0000
ZY24QW3NN,2DRIUYX4MA,QPLQTOX8B,U9SDH1Q5D,17,21,0.7647,0.3333,8,7,0.2589,0.5706,0.3386,0.1737,...,1.0000,0.5385,0.6471,0.7000,1.0000,0.5385,0.8235,0.8889,0.8571,0.9231


In [6]:
# Draw one random reference class per class - UT sequence combination.
# The seed is fixed so that the selection is reproducible.
np.random.seed(10)
rows_per_combi = eval_df.groupby(["class_id", "ut_id"], group_keys=False)
# sample one row per combination ...
sampled_rows = rows_per_combi.apply(lambda grp: grp.sample(n=1))
# ... and keep only the (class_id, ut_id, ref_class) part of its index
random_rc = sampled_rows.index.droplevel("student_id")

In [7]:
def get_best_and_worst_rc(
    eval_df: pd.DataFrame, met_col: str
) -> tuple[pd.Series, pd.Series]:
    """Determine the best and worst reference class per class and UT sequence.

    The metric is first averaged over all rows of each
    (class_id, ut_id, ref_class) combination. Within each (class_id, ut_id)
    group the combination with the highest and the lowest mean is then looked
    up.

    Args:
        eval_df: Evaluation results in which class_id, ut_id and ref_class are
            available as grouping keys (index levels or columns).
        met_col: Name of the metric column to average.

    Returns:
        Two Series indexed by (class_id, ut_id): the first holds the
        (class_id, ut_id, ref_class) label with the highest mean metric, the
        second the label with the lowest mean metric.
    """
    combi_means = eval_df.groupby(["class_id", "ut_id", "ref_class"])[met_col].mean()
    per_target = combi_means.groupby(["class_id", "ut_id"])
    best_rc = per_target.idxmax()
    worst_rc = per_target.idxmin()
    return best_rc, worst_rc

# reference classes with the best and worst mean f1 score and the best and
# worst mean accuracy per class - UT sequence combination
(best_f1, worst_f1), (best_acc, worst_acc) = (
    get_best_and_worst_rc(eval_df, metric)
    for metric in ("f1_lim_50", "acc_lim_50")
)

In [8]:
# Get one unique number of students per reference class. The value of
# num_stud_rc can differ between the student rows of the same
# (class, UT sequence, reference class) combination, so the maximum is used.
eval_df2 = eval_df.copy()
eval_df2["num_stud_rc"] = eval_df2.groupby(["class_id", "ut_id", "ref_class"])["num_stud_rc"].transform("max")
# keep a single row per (class, UT sequence, reference class)
eval_df2 = eval_df2.reset_index().drop_duplicates(subset=["class_id", "ut_id", "ref_class"]).set_index(["class_id", "ut_id", "ref_class"])

# Size categories: [0, 4], (4, 10], (10, 20] and (20, inf).
# The last bin is open-ended instead of ending at the observed maximum, so the
# bin edges stay strictly increasing even if no reference class has more than
# 20 students (pd.cut raises for non-increasing edges).
num_rc_cut = [0, 4, 10, 20, np.inf]
eval_df2["num_stud_rc_cut"] = pd.cut(
    eval_df2["num_stud_rc"],
    bins=num_rc_cut,
    include_lowest=True,
)
# Discard reference classes of the smallest category (at most 4 students).
# Filtering on the count itself avoids comparing against a hard-coded interval
# label, which depends on how pd.cut adjusts the lowest bin edge.
eval_rest = eval_df2[eval_df2["num_stud_rc"] > 4]

# Draw one random reference class per class, UT sequence and size category.
# The seed is fixed so that the selection is reproducible.
np.random.seed(11)
rc_num_stud = (
    eval_rest.groupby(
        ["class_id", "ut_id", "num_stud_rc_cut"],
        observed=True,  # skip size categories that do not occur
        group_keys=False,  # keep (class_id, ut_id, ref_class) as result index
    )
    .apply(lambda x: x.sample(n=1), include_groups=False)
    .index
)

In [9]:
# Union of the (class_id, ut_id, ref_class) combinations of all selection
# strategies, without duplicates and in sorted order.
rc_selections = (best_f1, best_acc, worst_f1, worst_acc, random_rc, rc_num_stud)
all_combis = sorted(set().union(*rc_selections))
all_combis = pd.DataFrame(data=all_combis, columns=["class_id", "ut_id", "ref_class"])
len(all_combis)

13311

In [10]:
# Store the extended restricted reference classes (only if saving is enabled).
if SAVE_PRIMARY_FILES:
    utils.save_as_csv(all_combis, "ref_classes_restricted.csv", save_idx=False)

In [11]:
# Restricted random reference classes: turn the sampled
# (class_id, ut_id, ref_class) tuples into a DataFrame with one column each.
# NOTE: this rebinds `random_rc` to a DataFrame, so this cell must not be
# re-run on its own without re-running the sampling cell first.
random_rc = pd.DataFrame(data=list(random_rc), columns=["class_id", "ut_id", "ref_class"])
# Store the restricted random reference classes (only if saving is enabled).
if SAVE_PRIMARY_FILES:
    utils.save_as_csv(random_rc, "ref_classes_restricted_random.csv", save_idx=False)

In [12]:
# Turn every selection (best/worst f1, best/worst accuracy, number of
# students) into a DataFrame with one column per part of the
# (class_id, ut_id, ref_class) combination.
rc_columns = ["class_id", "ut_id", "ref_class"]
best_f1, worst_f1, best_acc, worst_acc, rc_num_stud = (
    pd.DataFrame(data=list(selection), columns=rc_columns)
    for selection in (best_f1, worst_f1, best_acc, worst_acc, rc_num_stud)
)

# Store one file per selection strategy (only if saving is enabled).
if SAVE_SECONDARY_FILES:
    secondary_outputs = (
        (best_f1, "ref_classes_restricted_best_f1.csv"),
        (worst_f1, "ref_classes_restricted_worst_f1.csv"),
        (best_acc, "ref_classes_restricted_best_acc.csv"),
        (worst_acc, "ref_classes_restricted_worst_acc.csv"),
        (rc_num_stud, "ref_classes_restricted_num_stud.csv"),
    )
    for selection_df, out_name in secondary_outputs:
        utils.save_as_csv(selection_df, out_name, save_idx=False)

#### Test function for restricting c2rc dict

In [13]:
# read the main data file and report its shape as a quick sanity check
df = utils.read_data_file("final_data_main_approach.csv")
print(df.shape)

# get dictionary with all possible reference classes
class_to_reference_class = determine_reference_classes.get_reference_classes(df)

# restrict the dictionary to the reference classes listed in
# `ref_classes_restricted.csv`
c2rc = determine_reference_classes.restrict_c2rc_dict(class_to_reference_class, "ref_classes_restricted.csv")

  return pd.read_csv(config.DATA_FOLDER / filename)


(2664573, 22)


In [14]:
# Summarize the restricted dictionary:
# - num_rc_comp_total: summed length of "ref_classes_complete" over all classes
# - num_rc_total: summed number of reference classes over all "details" entries
# - num_no_rc: number of "details" entries without any reference class
num_rc_comp_total = 0
num_rc_total = 0
num_no_rc = 0

for class_entry in c2rc.values():
    num_rc_comp_total += len(class_entry["ref_classes_complete"])
    for detail_entry in class_entry["details"].values():
        ref_classes = detail_entry["reference_classes"]
        if ref_classes:
            num_rc_total += len(ref_classes)
        else:
            num_no_rc += 1
num_rc_comp_total, num_rc_total, num_no_rc

(12691, 13342, 11)