In [1]:
import pandas as pd
import numpy as np 


# Root directory of the dataset. The local relative path is the active one;
# when running on Colab with Google Drive mounted, switch to:
#   DATA_DIR = '/content/gdrive/MyDrive/data'
# (Previously both assignments were live and the first was silently
# overwritten by the second.)
DATA_DIR = '../data'
# Reduced "study" subset of the data, stored under the data root.
STUDY_DIR = DATA_DIR + '/study'

# Full-size reference tables.
users = pd.read_csv(f'{DATA_DIR}/users.csv.gz')
topics = pd.read_csv(f'{DATA_DIR}/topics_translated.csv')
documents = pd.read_csv(f'{DATA_DIR}/documents.csv.gz')

# use study for less data, for testing
events = pd.read_csv(f'{STUDY_DIR}/events.csv.gz')
transactions = pd.read_csv(f'{STUDY_DIR}/transactions.csv.gz')

# Edge list of the topic hierarchy (parent_id / child_id columns are used below).
topic_trees = pd.read_csv(f'{DATA_DIR}/topic_trees.csv.gz')

In [2]:
import matplotlib.pyplot as plt
import networkx as nx

# Topic ids split on the `math` flag of the topics table.
math_topics = set(topics[topics['math']==1]['id'])
german_topics = set(topics[topics['math']==0]['id'])

# Keep only edges that actually have a parent.
topic_trees = topic_trees[~topic_trees['parent_id'].isna()]

# Synthetic edges attaching nodes 1 and 109 to a common node 0, so that a
# single node (0) can serve as the root when measuring distances (PATHS[0]).
german_row = {'topic_id':0, 'parent_id':0, 'child_id':1, 'sibling_rank':0, 'displayed_on_dashboard': 0}
math_row = {'topic_id':1, 'parent_id':0, 'child_id':109, 'sibling_rank':0, 'displayed_on_dashboard': 0}

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported replacement. Row order (math first, then german) is kept.
topic_trees = pd.concat(
    [topic_trees, pd.DataFrame([math_row, german_row])],
    ignore_index=True,
)

# Per-subject edge lists: an edge belongs to a subject if either endpoint does.
math_topic_tree = topic_trees[(topic_trees['parent_id'].isin(math_topics)) | (topic_trees['child_id'].isin(math_topics))]
german_topic_tree = topic_trees[(topic_trees['parent_id'].isin(german_topics)) | (topic_trees['child_id'].isin(german_topics))]


# def draw_topic_tree(topic_tree, title, node_color, figsize):
#   G = nx.from_pandas_edgelist(topic_tree, source='child_id', target='parent_id', edge_attr=['topic_id'])
#   plt.figure(figsize=figsize)
#   options = {"edge_color": "tab:gray", "node_color": node_color, "node_size": 100, "alpha": 0.8, "font_size": 7}
#   nx.draw_networkx(G, pos=nx.spring_layout(G), **options)
#   plt.title(title)
#   plt.show()

# draw_topic_tree(german_topic_tree, "Math topic tree", 'tab:red', (13, 12))


# Undirected graph over all topic edges (child -> parent).
G = nx.from_pandas_edgelist(topic_trees, source='child_id', target='parent_id', edge_attr=['topic_id'])

# PATHS[a][b] = number of edges on the shortest path between nodes a and b.
# Pairs further apart than 20 edges are omitted (cutoff).
PATHS = dict(nx.all_pairs_shortest_path_length(G, cutoff=20))

  topic_trees = topic_trees.append(math_row, ignore_index=True)
  topic_trees = topic_trees.append(german_row, ignore_index=True)


In [3]:


# Keep only transactions attributed to both a topic and a user, then store the
# id columns as integers. Building a fresh frame in one chain (instead of
# assigning columns on a boolean-filtered slice) avoids pandas'
# SettingWithCopyWarning and makes the cell safe to re-run.
transactions = (
    transactions
    .dropna(subset=['topic_id', 'user_id'])
    .astype({'transaction_id': int, 'user_id': int})
)

def compute_percentage_correct(transactions_df):
    """
    Compute a per-topic correctness score in the range [0, 100].

    For every topic the score is

        100 * (n_correct + 0.5 * n_partial) / (n_correct + n_partial + n_wrong)

    where the counts are the number of rows (with a non-null
    ``transaction_id``) whose ``evaluation`` is 'CORRECT', 'PARTIAL' or
    'WRONG'. Rows with any other evaluation are ignored. A topic without any
    graded row gets a score of 0.

    Parameters
    ----------
    transactions_df : pandas.DataFrame
        Must contain the columns 'topic_id', 'evaluation' and
        'transaction_id'.

    Returns
    -------
    pandas.Series
        Scores indexed by topic id (index name 'topic_id'), one entry per
        distinct topic id in ``transactions_df``, in order of first
        appearance.
    """
    # Only the frame passed in is used. The previous implementation also read
    # the global `transactions` here, which broke for any other input frame.
    topic_ids = pd.Index(transactions_df['topic_id'].unique(), name='topic_id')

    evaluation = transactions_df['evaluation']
    has_transaction_id = transactions_df['transaction_id'].notna()

    def count_per_topic(label):
        # 1 for each row with the requested evaluation, summed per topic.
        hits = ((evaluation == label) & has_transaction_id).astype(int)
        per_topic = hits.groupby(transactions_df['topic_id']).sum()
        # Align on every topic id so topics without such rows count as 0.
        return per_topic.reindex(topic_ids, fill_value=0)

    n_correct = count_per_topic('CORRECT')
    n_partial = count_per_topic('PARTIAL')
    n_wrong = count_per_topic('WRONG')

    total = n_correct + n_partial + n_wrong

    # Mask zero totals to NaN before dividing, then map those topics to 0.
    score = 100 * (n_correct + 0.5 * n_partial) / total.where(total > 0)
    return score.fillna(0.0).rename(None)

# Per-topic correctness score (0-100) computed over the filtered transactions;
# looked up by topic id in difficulty().
TOPIC_CORRECTNESS = compute_percentage_correct(transactions)


In [4]:
###########################################################################
# NOTE: param 'topic' should be the topic_id as found in the topics table #
###########################################################################


def generality(topic, user):
    """
    Generality of a topic: the reciprocal of its shortest-path distance from
    the root node (id 0) of the topic graph, so topics close to the root score
    higher than deeply nested ones.

    `user` is accepted but not used by this metric.

    Fails an assertion when `topic` is the root itself or has no entry in
    PATHS[0].
    """
    distance_from_root = PATHS[0]

    is_known_non_root = topic != 0 and topic in distance_from_root
    assert is_known_non_root, "topic not found, make sure the topic id is in the original topics table"

    return 1 / distance_from_root[topic]

    

def difficulty(topic, user):
    """
    Difficulty of a topic as a fraction: (100 - correctness score) / 100,
    using the score stored for the topic in TOPIC_CORRECTNESS.

    Topics without a stored score get the neutral value 0.5.
    `user` is accepted but not used by this metric.
    """
    # No score recorded for this topic: fall back to the midpoint.
    if topic not in TOPIC_CORRECTNESS.keys():
        return 0.5

    correctness = TOPIC_CORRECTNESS[topic]
    return (100 - correctness) / 100

def novelty(topic, user):
    """
    Novelty of a topic for a given user.

    Returns 1 when the user has no event on the topic (new) and 0 when the
    topic already appears among the user's events (old).

    Fails an assertion when `topic` is not an id of the topics table or when
    `user` does not appear in the events table.
    """

    assert topic in topics['id'].unique() and user in events['user_id'].unique(), (
        "unknown topic or user: topic must be an id from the topics table "
        "and user must appear in the events table"
    )

    # Topics this user has already interacted with.
    user_topics = events.loc[events['user_id'] == user, 'topic_id'].unique()

    # A topic the user has already seen is NOT novel. The previous version
    # returned the opposite of what the docstring (1 = new) promises.
    return int(topic not in user_topics)


In [None]:
# TODO: vary the 'gender' feature for all the datapoints in the test set and compare recommended topics

# Number of recommended topics evaluated per user and variant.
n = 5

# TODO: replace this with the output from the NCF feature model
# Placeholder shape: user id -> gender variant -> list of n recommended topic ids.
# NOTE: the placeholder ids are not real recommendations; e.g. topic 0 is
# rejected by generality(), so this cell only runs once real output is plugged in.
example_output = {
    0: {'Male': [0,1,2,3,4], 'Female': [4,5,6,7,8], 'Other': [5,6,7,8,9]},
    1: {'Male': [0,1,2,3,4], 'Female': [4,5,6,7,8], 'Other': [5,6,7,8,9]},
    2: {'Male': [0,1,2,3,4], 'Female': [4,5,6,7,8], 'Other': [5,6,7,8,9]},
}

# Total number of (user, recommendation) pairs per variant. Dividing every
# term by L turns the running sums below into means.
L = n * len(example_output.keys())


metrics = ['generality', 'difficulty', 'novelty']


func = {'generality': generality, 'difficulty': difficulty, 'novelty': novelty}

# Mean of each metric over all recommendations, one accumulator per variant.
male = {'generality': 0, 'difficulty': 0, 'novelty': 0}
female = {'generality': 0, 'difficulty': 0, 'novelty': 0}
other = {'generality': 0, 'difficulty': 0, 'novelty': 0}

# Variant label in example_output -> its accumulator.
accumulators = {'Male': male, 'Female': female, 'Other': other}

for uid, outputs in example_output.items():
    for variant, totals in accumulators.items():
        for i in range(n):
            for k in metrics:
                totals[k] += func[k](outputs[variant][i], uid)/L


import matplotlib.pyplot as plt
import numpy as np

metrics_labels = ["Generality", "Difficulty", "Novelty"]
data = {
    'Female': [female[k] for k in metrics],
    'Male': [male[k] for k in metrics],
    'Other': [other[k] for k in metrics],
}

x = np.arange(len(metrics_labels))  # the label locations
width = 0.25  # the width of the bars
multiplier = 0

fig, ax = plt.subplots(layout='constrained')

# One group of bars per metric, one bar per variant inside each group.
for attribute, measurement in data.items():
    offset = width * multiplier
    rects = ax.bar(x + offset, measurement, width, label=attribute)
    ax.bar_label(rects, padding=3, fmt='%.2f')
    multiplier += 1

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Value')
ax.set_title('Recommended topic metrics, variation based on gender')
ax.set_xticks(x + width, metrics_labels)
ax.legend(loc='upper left', ncols=3)
# Every metric is a mean of values in [0, 1]; the former upper limit of 250
# squashed all bars to nothing. 1.2 leaves headroom for labels and legend.
ax.set_ylim(0, 1.2)

plt.show()


In [None]:
# TODO: for all models, compute those metrics for the predictions on the test set
# compare the models, prediction of novelty, difficulty, generality


# Number of recommended topics evaluated per user and model.
n = 5

# TODO: replace this with the output from the different models
# Placeholder shape: user id -> model name -> list of n recommended topic ids.
# NOTE: the placeholder ids are not real recommendations; e.g. topic 0 is
# rejected by generality(), so this cell only runs once real output is plugged in.
example_output = {
    0: {'NCF': [0,1,2,3,4], 'NCF with features': [4,5,6,7,8], 'GRU4Rec': [5,6,7,8,9]},
    1: {'NCF': [0,1,2,3,4], 'NCF with features': [4,5,6,7,8], 'GRU4Rec': [5,6,7,8,9]},
    2: {'NCF': [0,1,2,3,4], 'NCF with features': [4,5,6,7,8], 'GRU4Rec': [5,6,7,8,9]},
}

# Total number of (user, recommendation) pairs per model. Dividing every
# term by L turns the running sums below into means.
L = n * len(example_output.keys())


metrics = ['generality', 'difficulty', 'novelty']


func = {'generality': generality, 'difficulty': difficulty, 'novelty': novelty}

# Mean of each metric over all recommendations, one accumulator per model.
ncf = {'generality': 0, 'difficulty': 0, 'novelty': 0}
ncf_feature = {'generality': 0, 'difficulty': 0, 'novelty': 0}
gru4rec = {'generality': 0, 'difficulty': 0, 'novelty': 0}

# Model name in example_output -> its accumulator.
accumulators = {'NCF': ncf, 'NCF with features': ncf_feature, 'GRU4Rec': gru4rec}

for uid, outputs in example_output.items():
    for model_name, totals in accumulators.items():
        for i in range(n):
            for k in metrics:
                totals[k] += func[k](outputs[model_name][i], uid)/L


import matplotlib.pyplot as plt
import numpy as np

metrics_labels = ["Generality", "Difficulty", "Novelty"]
data = {
    'NCF': [ncf[k] for k in metrics],
    'NCF with features': [ncf_feature[k] for k in metrics],
    'GRU4Rec': [gru4rec[k] for k in metrics],
}

x = np.arange(len(metrics_labels))  # the label locations
width = 0.25  # the width of the bars
multiplier = 0

fig, ax = plt.subplots(layout='constrained')

# One group of bars per metric, one bar per model inside each group.
for attribute, measurement in data.items():
    offset = width * multiplier
    rects = ax.bar(x + offset, measurement, width, label=attribute)
    ax.bar_label(rects, padding=3, fmt='%.2f')
    multiplier += 1

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Value')
ax.set_title('Recommended topic metrics, comparison of different models')
ax.set_xticks(x + width, metrics_labels)
ax.legend(loc='upper left', ncols=3)
# Every metric is a mean of values in [0, 1]; the former upper limit of 250
# squashed all bars to nothing. 1.2 leaves headroom for labels and legend.
ax.set_ylim(0, 1.2)

plt.show()


In [None]:
# TODO: implement LIME-RS for the models, look at some specific test examples and compare