# Content-based Recommendation

This notebook executes the experiments for Content-based Recommendation, which is only performed without reference classes.

The experiments are separated into 4 versions.

Note: The code for the different versions is identical except for the conf dictionary that is used.

In [None]:
%load_ext autoreload
%autoreload 2

import datetime
import numpy as np
import pandas as pd

import sys
import os
sys.path.append(os.path.abspath('../../sources'))

import config
import training_general
import training_without_rc
import utils
from data_preparation import determine_reference_classes

In [None]:
# Load the data once. `df_orig` stays untouched so that every version below
# can start from a fresh copy of it; `df` is the working dataframe.
df_orig = utils.read_data_file("final_data_main_approach.csv")
df = df_orig.copy()
print(df.shape)

### Version 1: DTC

In [None]:
def get_conf_version1(filename_suffix: str) -> dict:
    """Return the conf dictionary for version 1 of the experiments.

    Version 1 uses `config.CBModelType.DTC` models on the column sets "v1"
    and "v2", each once without and once with "max_depth" set to 3.

    Args:
        filename_suffix: Value stored under conf["saving_file"]["filename_suffix"].

    Returns:
        A dictionary with the keys "lim", "eval_groups", "reg_metrics",
        "class_metrics", "info_cols", "method", "with_ref_class", "models"
        and "saving_file".
    """
    metric = config.ClassMetrics
    info = config.InfoCols
    dtc = config.CBModelType.DTC

    # Resulting order: v1, v1 + max_depth, v2, v2 + max_depth.
    models = [
        {"model_type": dtc, "used_columns": used_columns, **extra_params}
        for used_columns in ("v1", "v2")
        for extra_params in ({}, {"max_depth": 3})
    ]

    saving_file = {
        "folder": "content_based_recommendation",
        "filename": "version1",
        "filename_suffix": filename_suffix,
    }

    return {
        "lim": [0.3, 0.5, 0.7, config.LimType.DYNAMIC],
        "eval_groups": ["info_cols", "reg_metrics", "class_metrics"],
        "reg_metrics": [config.RegMetrics.MAE, config.RegMetrics.MSE],
        "class_metrics": [metric.ACC, metric.F1, metric.PREC, metric.REC],
        "info_cols": [
            info.NUM_UT_PROBS,
            info.NUM_IU_PROBS,
            info.MEAN_UT_PERF,
            info.MEAN_IU_PERF,
        ],
        "method": config.RecMethod.CB,
        "with_ref_class": False,
        "models": models,
        "saving_file": saving_file,
    }


# Forwarded to training_general.check_conf when the configuration is validated.
save_file = True

In [None]:
# Build the configuration of this version and validate it before doing any work.
conf = get_conf_version1(filename_suffix="")
training_general.check_conf(conf, save_file=save_file)
with_rc = conf["with_ref_class"]

# Prepare a fresh copy of the raw data, so that re-running this cell never
# operates on a dataframe that an earlier run has already processed.
df = training_general.prepare_df(conf, df_orig.copy())

# The dictionary is not used for its reference classes here; it is only needed
# to know which classes and test sequences are evaluated.
class_to_reference_class = determine_reference_classes.get_reference_classes(df)
print(len(class_to_reference_class))

# Create the dataframes handed to the prediction step; the third returned
# object is not used in this notebook.
df, ass_seq, _ = training_general.create_dataframes(df)

In [None]:
# Only the variant without reference classes is implemented for this method,
# so stop here before the long-running loop starts.
if with_rc:
    raise NotImplementedError

# create empty predictions dataframe for complete training
index = training_without_rc.get_idx_pred_df(class_to_reference_class)
pred_df = training_general.initialize_pred_df(index=index, conf=conf)

# To debug on a subset, iterate over e.g.
# `list(class_to_reference_class.items())[:3]` instead of all items.
for count, (cid, cid_dict) in enumerate(class_to_reference_class.items(), start=1):
    # make the predictions for this class
    cid_predictions = training_without_rc.perform_predictions_for_cid(
        conf, cid, cid_dict, df, ass_seq
    )
    # bring the returned rows into the row order pred_df uses for this class,
    # then assign the raw values positionally
    pred_df.loc[cid] = cid_predictions.reindex(pred_df.loc[cid].index).to_numpy()

    # progress message after every 10th class
    if count % 10 == 0:
        now = datetime.datetime.now()
        print(f"{count} classes completed, last cid: {cid}, time: {now}")

In [None]:
# Keep only the rows that have a value in "y_true" (rows with NaN there are dropped).
# This only matters when just a subset of the classes was trained.
# NOTE(review): presumably the rows of classes that were skipped were never filled — confirm.
pred_df = pred_df.dropna(subset=["y_true"])
print(len(pred_df))  # number of remaining prediction rows

# save predictions
# NOTE(review): this is called regardless of the `save_file` flag — confirm that is intended.
utils.save_predictions(pred_df, conf, save_idx=True)

# evaluate predictions and save
training_general.evaluate_predictions_and_save(pred_df, conf)

### Version 2: KNN, LinReg, SVC

In [None]:
def get_conf_version2(filename_suffix: str) -> dict:
    """Return the conf dictionary for version 2 of the experiments.

    Version 2 uses `config.CBModelType.KNN` (k = 3 and k = 5 on "v2"),
    `config.CBModelType.LINREG` (on "v1" and "v2") and
    `config.CBModelType.SVC` (on "v2") models.

    Args:
        filename_suffix: Value stored under conf["saving_file"]["filename_suffix"].

    Returns:
        A dictionary with the keys "lim", "eval_groups", "reg_metrics",
        "class_metrics", "info_cols", "method", "with_ref_class", "models"
        and "saving_file".
    """
    metric = config.ClassMetrics
    info = config.InfoCols
    model = config.CBModelType

    knn_models = [
        {"model_type": model.KNN, "used_columns": "v2", "k": k} for k in (3, 5)
    ]
    linreg_models = [
        {"model_type": model.LINREG, "used_columns": used_columns}
        for used_columns in ("v1", "v2")
    ]
    svc_models = [{"model_type": model.SVC, "used_columns": "v2"}]

    saving_file = {
        "folder": "content_based_recommendation",
        "filename": "version2",
        "filename_suffix": filename_suffix,
    }

    return {
        "lim": [0.3, 0.5, 0.7, config.LimType.DYNAMIC],
        "eval_groups": ["info_cols", "reg_metrics", "class_metrics"],
        "reg_metrics": [config.RegMetrics.MAE, config.RegMetrics.MSE],
        "class_metrics": [metric.ACC, metric.F1, metric.PREC, metric.REC],
        "info_cols": [
            info.NUM_UT_PROBS,
            info.NUM_IU_PROBS,
            info.MEAN_UT_PERF,
            info.MEAN_IU_PERF,
        ],
        "method": config.RecMethod.CB,
        "with_ref_class": False,
        "models": knn_models + linreg_models + svc_models,
        "saving_file": saving_file,
    }


# Forwarded to training_general.check_conf when the configuration is validated.
save_file = True

In [None]:
# Build the configuration of this version and validate it before doing any work.
conf = get_conf_version2(filename_suffix="")
training_general.check_conf(conf, save_file=save_file)
with_rc = conf["with_ref_class"]

# Prepare a fresh copy of the raw data, so that re-running this cell never
# operates on a dataframe that an earlier run has already processed.
df = training_general.prepare_df(conf, df_orig.copy())

# The dictionary is not used for its reference classes here; it is only needed
# to know which classes and test sequences are evaluated.
class_to_reference_class = determine_reference_classes.get_reference_classes(df)
print(len(class_to_reference_class))

# Create the dataframes handed to the prediction step; the third returned
# object is not used in this notebook.
df, ass_seq, _ = training_general.create_dataframes(df)

In [None]:
# Only the variant without reference classes is implemented for this method,
# so stop here before the long-running loop starts.
if with_rc:
    raise NotImplementedError

# create empty predictions dataframe for complete training
index = training_without_rc.get_idx_pred_df(class_to_reference_class)
pred_df = training_general.initialize_pred_df(index=index, conf=conf)

# To debug on a subset, iterate over e.g.
# `list(class_to_reference_class.items())[:30]` instead of all items.
for count, (cid, cid_dict) in enumerate(class_to_reference_class.items(), start=1):
    # make the predictions for this class
    cid_predictions = training_without_rc.perform_predictions_for_cid(
        conf, cid, cid_dict, df, ass_seq
    )
    # bring the returned rows into the row order pred_df uses for this class,
    # then assign the raw values positionally
    pred_df.loc[cid] = cid_predictions.reindex(pred_df.loc[cid].index).to_numpy()

    # progress message after every 10th class
    if count % 10 == 0:
        now = datetime.datetime.now()
        print(f"{count} classes completed, last cid: {cid}, time: {now}")


In [None]:
# Keep only the rows that have a value in "y_true" (rows with NaN there are dropped).
# This only matters when just a subset of the classes was trained.
# NOTE(review): presumably the rows of classes that were skipped were never filled — confirm.
pred_df = pred_df.dropna(subset=["y_true"])
print(len(pred_df))  # number of remaining prediction rows

# save predictions
# NOTE(review): this is called regardless of the `save_file` flag — confirm that is intended.
utils.save_predictions(pred_df, conf, save_idx=True)

# evaluate predictions and save
training_general.evaluate_predictions_and_save(pred_df, conf)

### Version 3: RFC

In [None]:
def get_conf_version3(filename_suffix: str) -> dict:
    """Return the conf dictionary for version 3 of the experiments.

    Version 3 uses `config.CBModelType.RFC` models: "n_estimators" 10 and 25
    on the column sets "v1" and "v2", plus the two "v2" variants again with
    "max_depth" set to 3.

    Args:
        filename_suffix: Value stored under conf["saving_file"]["filename_suffix"].

    Returns:
        A dictionary with the keys "lim", "eval_groups", "reg_metrics",
        "class_metrics", "info_cols", "method", "with_ref_class", "models"
        and "saving_file".
    """
    metric = config.ClassMetrics
    info = config.InfoCols
    rfc = config.CBModelType.RFC

    # Resulting order: v1/10, v1/25, v2/10, v2/25.
    without_max_depth = [
        {"model_type": rfc, "used_columns": used_columns, "n_estimators": n_estimators}
        for used_columns in ("v1", "v2")
        for n_estimators in (10, 25)
    ]
    with_max_depth = [
        {"model_type": rfc, "used_columns": "v2", "n_estimators": n_estimators, "max_depth": 3}
        for n_estimators in (10, 25)
    ]

    saving_file = {
        "folder": "content_based_recommendation",
        "filename": "version3",
        "filename_suffix": filename_suffix,
    }

    return {
        "lim": [0.3, 0.5, 0.7, config.LimType.DYNAMIC],
        "eval_groups": ["info_cols", "reg_metrics", "class_metrics"],
        "reg_metrics": [config.RegMetrics.MAE, config.RegMetrics.MSE],
        "class_metrics": [metric.ACC, metric.F1, metric.PREC, metric.REC],
        "info_cols": [
            info.NUM_UT_PROBS,
            info.NUM_IU_PROBS,
            info.MEAN_UT_PERF,
            info.MEAN_IU_PERF,
        ],
        "method": config.RecMethod.CB,
        "with_ref_class": False,
        "models": without_max_depth + with_max_depth,
        "saving_file": saving_file,
    }


# Forwarded to training_general.check_conf when the configuration is validated.
save_file = True

In [None]:
# Build the configuration of this version and validate it before doing any work.
conf = get_conf_version3(filename_suffix="")
training_general.check_conf(conf, save_file=save_file)
with_rc = conf["with_ref_class"]

# Prepare a fresh copy of the raw data, so that re-running this cell never
# operates on a dataframe that an earlier run has already processed.
df = training_general.prepare_df(conf, df_orig.copy())

# The dictionary is not used for its reference classes here; it is only needed
# to know which classes and test sequences are evaluated.
class_to_reference_class = determine_reference_classes.get_reference_classes(df)
print(len(class_to_reference_class))

# Create the dataframes handed to the prediction step; the third returned
# object is not used in this notebook.
df, ass_seq, _ = training_general.create_dataframes(df)

In [None]:
# Only the variant without reference classes is implemented for this method,
# so stop here before the long-running loop starts.
if with_rc:
    raise NotImplementedError

# create empty predictions dataframe for complete training
index = training_without_rc.get_idx_pred_df(class_to_reference_class)
pred_df = training_general.initialize_pred_df(index=index, conf=conf)

# To debug on a subset, iterate over e.g.
# `list(class_to_reference_class.items())[:3]` instead of all items.
for count, (cid, cid_dict) in enumerate(class_to_reference_class.items(), start=1):
    # make the predictions for this class
    cid_predictions = training_without_rc.perform_predictions_for_cid(
        conf, cid, cid_dict, df, ass_seq
    )
    # bring the returned rows into the row order pred_df uses for this class,
    # then assign the raw values positionally
    pred_df.loc[cid] = cid_predictions.reindex(pred_df.loc[cid].index).to_numpy()

    # progress message after every 10th class
    if count % 10 == 0:
        now = datetime.datetime.now()
        print(f"{count} classes completed, last cid: {cid}, time: {now}")

In [None]:
# Keep only the rows that have a value in "y_true" (rows with NaN there are dropped).
# This only matters when just a subset of the classes was trained.
# NOTE(review): presumably the rows of classes that were skipped were never filled — confirm.
pred_df = pred_df.dropna(subset=["y_true"])
print(len(pred_df))  # number of remaining prediction rows

# save predictions
# NOTE(review): this is called regardless of the `save_file` flag — confirm that is intended.
utils.save_predictions(pred_df, conf, save_idx=True)

# evaluate predictions and save
training_general.evaluate_predictions_and_save(pred_df, conf)

### Version 4: LogReg and XGBoost

In [None]:
def get_conf_version4(filename_suffix: str) -> dict:
    """Return the conf dictionary for version 4 of the experiments.

    Version 4 uses two `config.CBModelType.LOGREG` models ("max_iter" 100 and
    300 on "v2") and four `config.CBModelType.XGB` models that differ in
    "used_columns", "n_estimators", "max_depth" and "lr".

    Args:
        filename_suffix: Value stored under conf["saving_file"]["filename_suffix"].

    Returns:
        A dictionary with the keys "lim", "eval_groups", "reg_metrics",
        "class_metrics", "info_cols", "method", "with_ref_class", "models"
        and "saving_file".
    """
    metric = config.ClassMetrics
    info = config.InfoCols
    model = config.CBModelType

    logreg_models = [
        {"model_type": model.LOGREG, "used_columns": "v2", "max_iter": max_iter}
        for max_iter in (100, 300)
    ]
    # One tuple per XGB model: (used_columns, n_estimators, max_depth, lr).
    xgb_models = [
        {
            "model_type": model.XGB,
            "used_columns": used_columns,
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "lr": lr,
        }
        for used_columns, n_estimators, max_depth, lr in (
            ("v1", 50, 5, 0.1),
            ("v1", 25, 3, 0.1),
            ("v2", 50, 3, 0.1),
            ("v2", 25, 3, 0.01),
        )
    ]

    saving_file = {
        "folder": "content_based_recommendation",
        "filename": "version4",
        "filename_suffix": filename_suffix,
    }

    return {
        "lim": [0.3, 0.5, 0.7, config.LimType.DYNAMIC],
        "eval_groups": ["info_cols", "reg_metrics", "class_metrics"],
        "reg_metrics": [config.RegMetrics.MAE, config.RegMetrics.MSE],
        "class_metrics": [metric.ACC, metric.F1, metric.PREC, metric.REC],
        "info_cols": [
            info.NUM_UT_PROBS,
            info.NUM_IU_PROBS,
            info.MEAN_UT_PERF,
            info.MEAN_IU_PERF,
        ],
        "method": config.RecMethod.CB,
        "with_ref_class": False,
        "models": logreg_models + xgb_models,
        "saving_file": saving_file,
    }


# Forwarded to training_general.check_conf when the configuration is validated.
save_file = True

In [None]:
# Build the configuration of this version and validate it before doing any work.
conf = get_conf_version4(filename_suffix="")
training_general.check_conf(conf, save_file=save_file)
with_rc = conf["with_ref_class"]

# Prepare a fresh copy of the raw data, so that re-running this cell never
# operates on a dataframe that an earlier run has already processed.
df = training_general.prepare_df(conf, df_orig.copy())

# The dictionary is not used for its reference classes here; it is only needed
# to know which classes and test sequences are evaluated.
class_to_reference_class = determine_reference_classes.get_reference_classes(df)
print(len(class_to_reference_class))

# Create the dataframes handed to the prediction step; the third returned
# object is not used in this notebook.
df, ass_seq, _ = training_general.create_dataframes(df)

In [None]:
# Only the variant without reference classes is implemented for this method,
# so stop here before the long-running loop starts.
if with_rc:
    raise NotImplementedError

# create empty predictions dataframe for complete training
index = training_without_rc.get_idx_pred_df(class_to_reference_class)
pred_df = training_general.initialize_pred_df(index=index, conf=conf)

# To debug on a subset, iterate over e.g.
# `list(class_to_reference_class.items())[:5]` instead of all items.
for count, (cid, cid_dict) in enumerate(class_to_reference_class.items(), start=1):
    # make the predictions for this class
    cid_predictions = training_without_rc.perform_predictions_for_cid(
        conf, cid, cid_dict, df, ass_seq
    )
    # bring the returned rows into the row order pred_df uses for this class,
    # then assign the raw values positionally
    pred_df.loc[cid] = cid_predictions.reindex(pred_df.loc[cid].index).to_numpy()

    # progress message after every 10th class
    if count % 10 == 0:
        now = datetime.datetime.now()
        print(f"{count} classes completed, last cid: {cid}, time: {now}")

In [None]:
# Keep only the rows that have a value in "y_true" (rows with NaN there are dropped).
# This only matters when just a subset of the classes was trained.
# NOTE(review): presumably the rows of classes that were skipped were never filled — confirm.
pred_df = pred_df.dropna(subset=["y_true"])
print(len(pred_df))  # number of remaining prediction rows

# save predictions
# NOTE(review): this is called regardless of the `save_file` flag — confirm that is intended.
utils.save_predictions(pred_df, conf, save_idx=True)

# evaluate predictions and save
training_general.evaluate_predictions_and_save(pred_df, conf)