# Test Running Code in this Repo

This notebook tests whether the code in this repo can be run, in particular whether all data required for the individual steps is available.

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import datetime

import sys
import os
# add the `sources` folder (one level up) to the module search path
# so that the project modules imported below can be found
sys.path.append(os.path.abspath('../sources'))

import config
import training_general
import training_without_rc
import utils
from data_preparation import determine_reference_classes

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


### Test General Functionality

Only the data already stored in the `data` folder is necessary.

In [2]:
# test if data for executing the code in general is available:
# read a file that is already part of the `data` folder and print its number of rows
expert_bkt_probs = utils.read_data_file("expert_data_bkt_probs.csv")
print(len(expert_bkt_probs))

29813


### Test Files for Data Preparation

For executing the notebooks in `data_preparation`, all data files should be downloaded from https://www.kaggle.com/competitions/edm-cup-2023/data and stored in the `data` folder.

In [3]:
# test if the data needed for the data preparation notebooks is available
# (reads the problem details and prints the number of entries)
df = utils.read_problem_details()
print(len(df))

132738


### Test Executing Experiments

The file `final_data_main_approach.csv` should be downloaded from the Google Drive folder and stored in the `data` folder.

This additionally requires the following files from https://www.kaggle.com/competitions/edm-cup-2023/data to be stored in the `data` folder:
- `assignment_relationships.csv`
- `problem_details.csv`
- `sequence_details.csv`

In addition, make sure to create a folder `results` at the same level as the `data` folder and inside this folder another folder `content_based_recommendation`.

This step takes approx. 1 minute.

In [4]:
# read data
df = utils.read_data_file("final_data_main_approach.csv")
# keep an untouched copy: the experiment cell below restarts from `df_orig`,
# so the large file does not have to be read again when that cell is re-run
df_orig = df.copy()
print(df.shape)

  return pd.read_csv(config.DATA_FOLDER / filename)


(2664573, 22)


In [5]:
def get_conf_version1(filename_suffix: str) -> dict:
    """Build the configuration dictionary used for the test experiment.

    The configuration selects content-based recommendation
    (``config.RecMethod.CB``) without reference classes and a single model
    entry (``config.CBModelType.DTC`` with ``used_columns`` set to ``"v1"``).
    Classification metrics and info columns are evaluated; no regression
    metrics are requested.

    Args:
        filename_suffix: Value stored under ``saving_file["filename_suffix"]``.

    Returns:
        A freshly created configuration dictionary.
    """
    evaluated_class_metrics = [
        config.ClassMetrics.ACC,
        config.ClassMetrics.F1,
    ]
    reported_info_cols = [
        config.InfoCols.NUM_UT_PROBS,
        config.InfoCols.NUM_IU_PROBS,
        config.InfoCols.MEAN_UT_PERF,
        config.InfoCols.MEAN_IU_PERF,
    ]
    tested_models = [
        {"model_type": config.CBModelType.DTC, "used_columns": "v1"},
    ]
    saving_target = {
        "folder": "content_based_recommendation",
        "filename": "test_file",
        "filename_suffix": filename_suffix,
    }

    # assemble the final dictionary (same keys and key order as before)
    conf = {
        "lim": [0.5],
        "eval_groups": ["info_cols", "class_metrics"],
        "reg_metrics": [],
        "class_metrics": evaluated_class_metrics,
        "info_cols": reported_info_cols,
        "method": config.RecMethod.CB,
        "with_ref_class": False,
        "models": tested_models,
        "saving_file": saving_target,
    }
    return conf

# flag that is forwarded to training_general.check_conf when the configuration is validated
save_file = True

In [6]:
conf = get_conf_version1(filename_suffix="test_code")

# start from the unmodified data, because `df` is overwritten below
df = df_orig.copy()

# check validity of conf dictionary
training_general.check_conf(conf, save_file=save_file)

# whether reference classes are used (False in the configuration built above)
with_rc = conf["with_ref_class"]

# prepare df
df = training_general.prepare_df(conf, df)

# get dictionary with reference classes
# it is not used for the reference classes but to know which classes and test sequences are evaluated
class_to_reference_class = determine_reference_classes.get_reference_classes(df)
print(len(class_to_reference_class))

# create dataframes
df, ass_seq, _ = training_general.create_dataframes(df)

1523


In [7]:
# This test only covers the variant without reference classes. The check is
# done once up front; the former second check inside the loop could never be
# reached because this one raises first.
if with_rc:
    raise NotImplementedError

# create empty predictions dataframe for complete training
index = training_without_rc.get_idx_pred_df(class_to_reference_class)
pred_df = training_general.initialize_pred_df(index=index, conf=conf)

# a handful of classes is enough to check that the pipeline runs
num_test_classes = 5
test_classes = list(class_to_reference_class.items())[:num_test_classes]

for count, (cid, cid_dict) in enumerate(test_classes, start=1):
    print(f"----------- Class {cid} ------------")
    # make predictions for cid and write them into the rows of pred_df
    # belonging to this class (aligned to the row order of pred_df)
    cid_predictions = training_without_rc.perform_predictions_for_cid(
        conf, cid, cid_dict, df, ass_seq
    )
    pred_df.loc[cid] = cid_predictions.reindex(pred_df.loc[cid].index).to_numpy()

    # progress message after every 10th class
    if count % 10 == 0:
        now = datetime.datetime.now()
        print(f"{count} classes completed, last cid: {cid}, time: {now}")

----------- Class 2JFV80TTBO ------------
----------- Class C4EIV9P0E ------------
----------- Class EGEHUE9HG ------------
----------- Class 1FN3UGSKCC ------------
----------- Class D3EXBNF3N ------------


In [8]:
# drop rows without a value in "y_true" (rows for which no prediction was stored)
# only necessary if part of classes is trained
pred_df = pred_df.dropna(subset=["y_true"])
print(len(pred_df))

# save predictions (disabled for this test)
#utils.save_predictions(pred_df, conf, save_idx=True)

# evaluate predictions and save
training_general.evaluate_predictions_and_save(pred_df, conf)

76
Start evaluating dtc_used_columns_v1 (2024-09-02 09:59:54.056966)
Saved evaluation df with filename dtc_used_columns_v1_test_code_20240902_095954.csv in folder content_based_recommendation


### Test Reading Results Files

For testing this part, you have two options:
1. You can run the preceding section, `Test Executing Experiments`, which automatically stores a results file in the `results` folder.
In this case, run the cell belonging to "Option 1".
2. Alternatively, download the content of the results folder from Google Drive and store it in a folder `results` at the same level as the `data` folder. The `results` folder should have the same structure as the respective folder on Google Drive, containing five folders, one for each method type.
For executing the cells below it is particularly required to store the file `results\content_based_recommendation\dtc_used_columns_v1_20240804_194156.csv`.
In this case, run the cell belonging to "Option 2".

In [9]:
# test reading results - Option 1
# reads the evaluation file written by the experiment section of this notebook
folder = "content_based_recommendation"
model = {"model_type": config.CBModelType.DTC, "used_columns": "v1"}
model_name = training_general.build_model_name(model)
print(model_name)

# must match the `filename_suffix` passed to get_conf_version1 when the results were created
suff = "test_code"

# NOTE(review): `latest=True` presumably selects the most recent matching file - confirm in utils
eval_df = utils.read_evaluation_df(folder, f"{model_name}_{suff}", latest=True)
len(eval_df)

dtc_used_columns_v1
Read file dtc_used_columns_v1_test_code_20240902_095954.csv


76

In [10]:
# test reading results - Option 2
# reads a results file that was downloaded into the `results` folder (no filename suffix)
folder = "content_based_recommendation"
model = {"model_type": config.CBModelType.DTC, "used_columns": "v1"}
model_name = training_general.build_model_name(model)
print(model_name)

# NOTE(review): `latest=True` presumably selects the most recent matching file - confirm in utils
eval_df = utils.read_evaluation_df(folder, model_name, latest=True)
len(eval_df)

dtc_used_columns_v1
Read file dtc_used_columns_v1_20240804_194156.csv


30580