In [1]:
import pandas as pd
import numpy as np
import bcubed

# Load TSVs

In [7]:
#user_emb_2d = pd.read_csv("./data/users_2d.tsv", sep='\t', header=None)
#user_emb_2d_labels = pd.read_csv("./data/users_by_title_embeddings_source_labels.tsv", sep='\t', header=None)

# Calculate BCubed for Each Clustering

In [20]:
def calc_bcubed(clusters, real, trials=10, seed=None):
    """Print BCubed precision, recall and F-score, plus a random-assignment baseline.

    Items are identified by the Series index: ``clusters`` and ``real`` are
    each turned into ``{index: {value}}`` and handed to ``bcubed``. After the
    real scores are printed, ``trials`` random clusterings over the same items
    are scored and their mean precision / recall / F-score printed as a
    baseline.

    Args:
        clusters: pandas Series of predicted cluster labels, one per item.
        real: pandas Series of gold-standard labels, indexed like ``clusters``.
        trials: number of random clusterings averaged for the baseline.
        seed: optional seed for the baseline draws. ``None`` (the default)
            draws from NumPy's global random state.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: if ``trials`` is less than 1.
    """
    if trials < 1:
        raise ValueError(f"trials must be >= 1, got {trials}")

    # bcubed takes {item: set_of_labels}; every item here carries exactly one label.
    cdict = {item: {label} for item, label in clusters.to_dict().items()}
    ldict = {item: {label} for item, label in real.to_dict().items()}

    print("Calculating precision...")
    precision = bcubed.precision(cdict, ldict)
    print("Calculating recall...")
    recall = bcubed.recall(cdict, ldict)
    print("Calculating f-score...")
    fscore = bcubed.fscore(precision, recall)
    print(f"Precision: {round(precision, 4)}")
    print(f"Recall: {round(recall, 4)}")
    print(f"F-Score: {round(fscore, 4)}")

    # The baseline uses as many clusters as there are distinct predicted labels.
    num_c = clusters.nunique()

    print(f"Randomly generating {num_c} clusters for {trials} trials...")
    # Keep drawing from the global state unless the caller asked for a
    # reproducible baseline; both objects expose the same randint().
    rng = np.random if seed is None else np.random.RandomState(seed)

    bp_sum = 0.0
    br_sum = 0.0
    bf_sum = 0.0
    for _ in range(trials):
        # Key the random clustering by the real item keys (not 0..n-1) so it
        # lines up with ldict whatever index the input Series carry.
        cdict2 = {item: {rng.randint(num_c)} for item in cdict}
        bp = bcubed.precision(cdict2, ldict)
        br = bcubed.recall(cdict2, ldict)
        bf = bcubed.fscore(bp, br)

        bp_sum += bp
        br_sum += br
        bf_sum += bf

    print(f"Baseline precision: {round(bp_sum/trials, 4)}")
    print(f"Baseline recall: {round(br_sum/trials, 4)}")
    print(f"Baseline F-Score: {round(bf_sum/trials, 4)}")

In [21]:
def load_and_calc(cluster_filename, gold_standard_filename, cluster_col="0", gold_col="source"):
    """Load predicted and gold labels from two TSV files and print their BCubed scores.

    Both files are read with a header row and a default positional index, so
    row i of the cluster file is compared with row i of the gold file.

    Args:
        cluster_filename: path to a tab-separated file of predicted cluster labels.
        gold_standard_filename: path to a tab-separated file of gold-standard labels.
        cluster_col: column of the cluster file holding the labels (default ``"0"``).
        gold_col: column of the gold file holding the labels (default ``"source"``).

    Returns:
        None. Results are printed by ``calc_bcubed``.

    Raises:
        ValueError: if the two files do not have the same number of rows.
        KeyError: if a requested column is missing from its file.
    """
    clusters = pd.read_csv(cluster_filename, sep='\t')
    gold = pd.read_csv(gold_standard_filename, sep='\t')

    # Rows are paired purely by position, so a length mismatch means the
    # labels would be compared against the wrong (or missing) items.
    if len(clusters) != len(gold):
        raise ValueError(
            f"Row count mismatch: {cluster_filename} has {len(clusters)} rows, "
            f"{gold_standard_filename} has {len(gold)} rows"
        )

    calc_bcubed(clusters[cluster_col], gold[gold_col])

## 3D User Embedding Leaf

In [22]:
# Score the HDBSCAN labels stored in the "3d_leaf" file against the source labels.
# NOTE(review): "mc15_ms15" presumably encodes min_cluster_size / min_samples = 15 - confirm.
load_and_calc("./data/users_hdbscan_labels_3d_leaf_mc15_ms15.tsv", "./data/users_by_title_embeddings_source_labels.tsv")

Calculating precision...
Calculating recall...
Calculating f-score...
Precision: 0.9172
Recall: 0.2926
F-Score: 0.4436
Randomly generating 17 clusters for 10 trials...
Baseline precision: 0.865
Baseline recall: 0.0599
Baseline F-Score: 0.112


## 3D User Embedding EOM

In [23]:
# Score the HDBSCAN labels stored in the "3d_eom" file against the source labels.
# NOTE(review): "eom" presumably means excess-of-mass cluster selection - confirm.
load_and_calc("./data/users_hdbscan_labels_3d_eom_mc15_ms15.tsv", "./data/users_by_title_embeddings_source_labels.tsv")

Calculating precision...
Calculating recall...
Calculating f-score...
Precision: 0.8753
Recall: 0.9804
F-Score: 0.9249
Randomly generating 2 clusters for 10 trials...
Baseline precision: 0.8638
Baseline recall: 0.5006
Baseline F-Score: 0.6339


## 2D User Embedding Leaf

In [24]:
# Score the HDBSCAN labels stored in the "2d_leaf" file against the source labels.
# NOTE(review): "mc15_ms15" presumably encodes min_cluster_size / min_samples = 15 - confirm.
load_and_calc("./data/users_hdbscan_labels_2d_leaf_mc15_ms15.tsv", "./data/users_by_title_embeddings_source_labels.tsv")

Calculating precision...
Calculating recall...
Calculating f-score...
Precision: 0.8932
Recall: 0.3945
F-Score: 0.5472
Randomly generating 22 clusters for 10 trials...
Baseline precision: 0.8652
Baseline recall: 0.0465
Baseline F-Score: 0.0883


## 2D User Embedding EOM

In [25]:
# Score the HDBSCAN labels stored in the "2d_eom" file against the source labels.
# NOTE(review): "eom" presumably means excess-of-mass cluster selection - confirm.
load_and_calc("./data/users_hdbscan_labels_2d_eom_mc15_ms15.tsv", "./data/users_by_title_embeddings_source_labels.tsv")

Calculating precision...
Calculating recall...
Calculating f-score...
Precision: 0.8754
Recall: 0.9549
F-Score: 0.9134
Randomly generating 4 clusters for 10 trials...
Baseline precision: 0.8641
Baseline recall: 0.2509
Baseline F-Score: 0.3889
