In [None]:
from pathlib import Path

from dataclasses import asdict
import pandas as pd
import numpy as np

from tqdm.autonotebook import tqdm
import plotly.express as px

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

from recsys4daos.utils.notebooks import DaoToRun, isCompleted

import paths

In [None]:
# Directory under which each DAO's executed notebooks are looked up.
OUTPUT_PATH: str = '../nbout/'
# Default metric shown in the per-model bar plots.
TARGET_METRIC: str = 'map@10'
# Suffix of the model result sets to load (e.g. 'itemknn-best-test').
RESULTS_SUFFIX: str = 'best-test'

# Display name of every model; the key order is also the canonical model order.
MODELS_NICE_NAME = {
    'openpop': 'OpenPop',
    'userknn': 'UserKNN',
    'itemknn': 'ItemKNN',
    'plnsim': 'PLNsim',
    'lightgcn': 'LightGCN',
}
MODELS_ORDER = list(MODELS_NICE_NAME)

In [None]:
# Run configurations (DaoToRun) and the extra per-DAO data, both provided by `paths`.
ALL_DAOS = paths.load_daos_to_run()
DAOS_INFO = pd.DataFrame.from_dict(paths.load_daos_data(), orient='index')

# One row per run configuration, enriched with DAOS_INFO through the org name.
# validate='1:1' makes the join fail loudly if an org name is duplicated on either side.
DAOS_DF = (
    pd.DataFrame([asdict(dao) for dao in ALL_DAOS])
    .join(DAOS_INFO, on='org_name', validate='1:1')
)
print(DAOS_DF.columns)
DAOS_DF

In [None]:
def index_to_hparams(df: pd.DataFrame):
    """Flatten the index of a results frame.

    The first index level must be named 'fold'; it becomes a regular column.
    Every other index level is removed and packed into a single 'hparams'
    column that holds one ``{level_name: value}`` dict per row.

    Raises:
        AssertionError: if the first index level is not called 'fold'.
    """
    fold_level, *hparam_levels = df.index.names
    assert fold_level == 'fold'

    flat = df.reset_index()
    flat['hparams'] = flat[hparam_levels].apply(lambda row: row.to_dict(), axis=1)
    return flat.drop(columns=hparam_levels)

def is_nb_ok(d: DaoToRun, nbname: str) -> bool:
    """Tell whether the notebook `nbname` of DAO `d` exists and is completed.

    The notebook is looked up at ``OUTPUT_PATH/<d.org_name>/<nbname>``; a
    missing file yields False, otherwise the verdict of `isCompleted` is returned.
    """
    nb_path = Path(OUTPUT_PATH, d.org_name, nbname)
    return isCompleted(nb_path) if nb_path.is_file() else False

## Loading baseline

In [None]:
# DAOs whose '10_baseline.ipynb' notebook exists and passes is_nb_ok.
baseline_daos = [ d for d in ALL_DAOS if is_nb_ok(d, '10_baseline.ipynb') ]
baseline_daos

In [None]:
def _load_openpop_results(d: DaoToRun):
    """Load the OpenPop results of DAO `d`, restricted to the folds under study.

    The results come from `paths.load_openpop` and are sorted by index. When
    `d.last_fold_date` is set, rows whose index is greater than it are dropped;
    afterwards only the last ``d.last_folds - 1`` rows are kept (as a copy).
    """
    results = paths.load_openpop(d.org_name, d.splits_freq, d.splits_normalize).sort_index()
    if d.last_fold_date:
        # presumably the index holds the fold dates — TODO confirm
        results = results.loc[results.index <= d.last_fold_date]

    # NOTE(review): one row fewer than `last_folds` is kept on purpose by the original code — confirm why
    return results.tail(d.last_folds - 1).copy()
    # Disabled alternative: also attach the per-fold information.
    # dfi = paths.load_folds_info(d.org_name, d.splits_freq, d.splits_normalize).sort_index()
    # return df.join(dfi, how='left', validate='1:1')

# OpenPop results of every baseline DAO (sorted case-insensitively by name);
# each row is tagged with the fields of its DaoToRun.
openpop_baselines = pd.concat([
    _load_openpop_results(dao).assign(**asdict(dao))
    for dao in sorted(baseline_daos, key=lambda dao: dao.org_name.lower())
])
openpop_baselines

In [None]:
openpop_baselines.groupby('org_name').describe()

## ItemKNN results

In [None]:
# DAOs whose '11_CF_KNN.ipynb' notebook passes is_nb_ok; used for both the ItemKNN and the UserKNN results.
cfknn_daos = [ d for d in ALL_DAOS if is_nb_ok(d, '11_CF_KNN.ipynb') ]
cfknn_daos

In [None]:
# ItemKNN results of every CF-KNN DAO (sorted case-insensitively by name);
# each row is tagged with the fields of its DaoToRun.
itemknn_results = pd.concat([
    paths.get_model_results(
        f'itemknn-{RESULTS_SUFFIX}', dao.org_name, dao.splits_freq, dao.splits_normalize,
    ).assign(**asdict(dao))
    for dao in sorted(cfknn_daos, key=lambda dao: dao.org_name.lower())
])
itemknn_results

In [None]:
sns.barplot(itemknn_results, x=TARGET_METRIC, y='org_name')

## UserKNN results

In [None]:
# UserKNN results of every CF-KNN DAO (sorted case-insensitively by name);
# each row is tagged with the fields of its DaoToRun.
userknn_results = pd.concat([
    paths.get_model_results(
        f'userknn-{RESULTS_SUFFIX}', dao.org_name, dao.splits_freq, dao.splits_normalize,
    ).assign(**asdict(dao))
    for dao in sorted(cfknn_daos, key=lambda dao: dao.org_name.lower())
])
userknn_results

In [None]:
sns.barplot(userknn_results, x=TARGET_METRIC, y='org_name')

### Comparison between UserKNN and ItemKNN

In [None]:
# Stack the OpenPop baseline with both KNN models, labelling every row with its model
# and packing the model-specific index levels into the 'hparams' column.
cfknn_results = pd.concat([
    index_to_hparams(results.assign(model=model_name))
    for model_name, results in [
        ('openpop', openpop_baselines),
        ('userknn', userknn_results),
        ('itemknn', itemknn_results),
    ]
])
sns.barplot(cfknn_results, x=TARGET_METRIC, y='org_name', hue='model', errorbar=None)

In [None]:
cfknn_results[cfknn_results['org_name'] == 'Balancer'].groupby(['fold', 'model']).size()

In [None]:
DAOS_DF[DAOS_DF['last_fold_date'].isna()]

In [None]:
# Sanity check: inside each DAO, every fold must hold the same number of rows
# (i.e. no model is missing from, or duplicated in, only some of the folds).
# Computed with plain groupbys instead of `groupby.apply` on the grouping column,
# which recent pandas versions deprecate.
models_ok = (
    cfknn_results
    .groupby(['org_name', 'fold'])
    .size()
    .groupby(level='org_name')
    .nunique()
    == 1
)
if not models_ok.all():
    print(models_ok[~models_ok])
    # Raised explicitly: `assert False` would be stripped when running with -O
    raise AssertionError('Some DAOs have different models in different folds')

## PLN results

In [None]:
# DAOs whose '12_PLN_Similarity.ipynb' notebook exists and passes is_nb_ok.
pln_daos = [ d for d in ALL_DAOS if is_nb_ok(d, '12_PLN_Similarity.ipynb') ]
pln_daos

In [None]:
# PLN-similarity results of every PLN DAO (sorted case-insensitively by name);
# each row is tagged with the fields of its DaoToRun.
pln_results = pd.concat([
    paths.get_model_results(
        f'plnsim-{RESULTS_SUFFIX}', dao.org_name, dao.splits_freq, dao.splits_normalize,
    ).assign(**asdict(dao))
    for dao in sorted(pln_daos, key=lambda dao: dao.org_name.lower())
])
pln_results

In [None]:
sns.barplot(pln_results, x=TARGET_METRIC, y='org_name')

### Comparison with previous models

In [None]:
# Append the PLN-similarity results to the OpenPop/KNN comparison table.
cfknnpln_results = pd.concat([
    cfknn_results,
    index_to_hparams(pln_results.assign(model='plnsim')),
])
# Static alternative:
# sns.barplot(cfknnpln_results, x=TARGET_METRIC, y='org_name', hue='model', errorbar=None)
# Interactive grouped bars of the mean metric per (DAO, model).
px.bar(
    cfknnpln_results.groupby(['org_name', 'model']).mean(numeric_only=True).reset_index(),
    x='org_name',
    y=TARGET_METRIC,
    color='model',
    barmode='group',
)

## LightGCN results

In [None]:
# DAOs whose '21_microsoft_results.ipynb' notebook exists and passes is_nb_ok.
gnn_daos = [ d for d in ALL_DAOS if is_nb_ok(d, '21_microsoft_results.ipynb') ]
gnn_daos

In [None]:
# LightGCN results of every GNN DAO (sorted case-insensitively by name);
# each row is tagged with the fields of its DaoToRun.
gnn_results = pd.concat([
    paths.get_model_results(
        f'lightgcn-{RESULTS_SUFFIX}', dao.org_name, dao.splits_freq, dao.splits_normalize,
    ).assign(**asdict(dao))
    for dao in sorted(gnn_daos, key=lambda dao: dao.org_name.lower())
])
gnn_results

In [None]:
from dataclasses import fields

# Order of the model bars in the per-DAO plots below.
MODELS = ['openpop', 'itemknn', 'userknn', 'plnsim', 'lightgcn']

# Results of every model: the previous comparison table plus LightGCN.
all_results = pd.concat([
    cfknnpln_results,
    index_to_hparams(gnn_results.assign(model='lightgcn')),
])

# Mean of every numeric column per (DAO, model), with the DAO information attached
# so that it can be shown in the hover box.
_toplot = (
    all_results
    .groupby(['org_name', 'model'])
    .mean(numeric_only=True)
    .reset_index()
    .merge(DAOS_DF, on='org_name')
)
print(DAOS_DF.columns)
px.bar(
    _toplot,
    x='org_name',
    y='map@5',
    color='model',
    barmode='group',
    category_orders={'model': MODELS},
    hover_data=[
        'last_fold_date',
        'comment',
        'folds_avg_open_proposals',
        'folds_avg_test_users',
        'folds_avg_test_votes',
        'folds_avg_test_vpp',
    ],
)

In [None]:
# Same per-DAO comparison as the previous cell, now for precision@1.
# `MODELS` and `all_results` are defined by the previous cell; the copy-pasted
# re-import, redefinition and identical recomputation were removed.
_toplot = (
    all_results
    .groupby(['org_name', 'model'])
    .mean(numeric_only=True)
    .reset_index()
    .merge(DAOS_DF, on='org_name')
)
print(DAOS_DF.columns)
px.bar(
    _toplot,
    x='org_name',
    y='precision@1',
    color='model',
    barmode='group',
    category_orders={'model': MODELS},
    hover_data=['last_fold_date', 'comment', 'folds_avg_open_proposals', 'folds_avg_test_users', 'folds_avg_test_votes', 'folds_avg_test_vpp'],
)

In [None]:
# Same per-DAO comparison as the previous cells, now for r-precision@10.
# `MODELS` and `all_results` are defined by the first comparison cell; the copy-pasted
# re-import, redefinition and identical recomputation were removed.
_toplot = (
    all_results
    .groupby(['org_name', 'model'])
    .mean(numeric_only=True)
    .reset_index()
    .merge(DAOS_DF, on='org_name')
)
print(DAOS_DF.columns)
px.bar(
    _toplot,
    x='org_name',
    y='r-precision@10',
    color='model',
    barmode='group',
    category_orders={'model': MODELS},
    hover_data=['last_fold_date', 'comment', 'folds_avg_open_proposals', 'folds_avg_test_users', 'folds_avg_test_votes', 'folds_avg_test_vpp'],
)

In [None]:
gnn_results['org_name'].unique()

In [None]:
# From here on, only DAOs that have LightGCN results are compared.
ALLOWED_DAOS = gnn_results['org_name'].unique()

# `all_results` was already built by the first comparison cell, so it is reused here
# instead of being recomputed (the redundant re-import of `fields` was dropped too).
_toplot = (
    all_results
    .groupby(['org_name', 'model'])
    .mean(numeric_only=True)
    .reset_index()
    # Remove DAOs with no GNN results
    .loc[lambda means: means['org_name'].isin(ALLOWED_DAOS)]
    .merge(DAOS_DF, on='org_name')
)
print(DAOS_DF.columns)
px.bar(
    _toplot,
    x='org_name',
    y='map@5',
    color='model',
    barmode='group',
    category_orders={'model': MODELS_ORDER},
    hover_data=['last_fold_date', 'comment', 'folds_avg_open_proposals', 'folds_avg_test_users', 'folds_avg_test_votes', 'folds_avg_test_vpp'],
)

In [None]:
# Sanity check: inside each DAO, every fold must hold the same number of rows
# (i.e. no model is missing from, or duplicated in, only some of the folds).
# Computed with plain groupbys instead of `groupby.apply` on the grouping column,
# which recent pandas versions deprecate.
models_ok = (
    all_results
    .groupby(['org_name', 'fold'])
    .size()
    .groupby(level='org_name')
    .nunique()
    == 1
)
if not models_ok.all():
    print(models_ok[~models_ok])
    # Raised explicitly: `assert False` would be stripped when running with -O
    raise AssertionError('Some orgs have different models in different folds')

In [None]:
all_results[all_results['org_name'] == 'Decentraland'][['fold', 'org_name', 'ndcg@10', 'model']]

In [None]:
all_results.groupby('org_name').apply(lambda x: x.groupby('fold').size())

In [None]:
all_results.groupby(['org_name', 'fold']).size()

# Comparing all

In [None]:
all_results

In [None]:
# Mean of every numeric column per (DAO, model), restricted to ALLOWED_DAOS
# and enriched with the DAO information.
mean_all_results = (
    all_results
    .groupby(['org_name', 'model'])
    .mean(numeric_only=True)
    .reset_index()
    .loc[lambda means: means['org_name'].isin(ALLOWED_DAOS)]
    .merge(DAOS_DF, on='org_name')
)
mean_all_results

### Is there any correlation between variables?

In [None]:
# Spearman correlation between the numeric columns, keeping every non-metric column
# (no '@' in the name) and, among the metrics, only the '@3' ones.
_corr = (
    mean_all_results[[c for c in mean_all_results.columns if '@' not in c or '@3' in c]]
    .groupby(['org_name', 'model'])
    .mean(numeric_only=True)
    .corr(method='spearman')
)
sns.heatmap(_corr, annot=False, cmap='coolwarm')

In [None]:
# Long format of the correlation matrix, keeping only (metric, non-metric) pairs
# and ordering them by absolute correlation, strongest first.
_pairs = (
    _corr.stack()
    .reset_index()
    .loc[lambda p: p['level_0'] != p['level_1']]
    .loc[lambda p: p['level_0'].str.contains('@') & ~p['level_1'].str.contains('@')]
    .sort_values(0, key=abs, ascending=False)
)
_pairs.head(20)

In [None]:
# Spearman correlation of every numeric column against 'open_proposals', sorted ascending.
# NOTE(review): other cells use 'folds_avg_open_proposals' — confirm 'open_proposals' is the intended column.
mean_all_results.groupby(['org_name', 'model']).mean(numeric_only=True).corr(method='spearman')['open_proposals'].dropna().sort_values()

### Cute lollipop plot

In [None]:
def lollipop_baseline_vs(metric: str, *, plot_others: bool = False, plot_worst: bool = True, sort_by=None):
    """Draw a lollipop chart of `metric` on the current matplotlib axes, one row per DAO.

    Each row shows a grey segment from the lowest model value to the best
    non-baseline value, a dot for the OpenPop baseline and a marker for the best
    non-baseline model.

    Args:
        metric: Column of `mean_all_results` to plot.
        plot_others: If true, also draw (with smaller markers) the value of each
            model on the rows where it is not the best one.
        plot_worst: If true, also draw the worst non-baseline model of each row.
        sort_by: Column (or columns) of `DAOS_DF` used to order the rows. When
            None, rows are ordered by DAO name (case-insensitive, descending).
    """
    pivot = mean_all_results.pivot(index='org_name', columns='model', values=metric)
    # Remember which columns are model values: metadata columns may be joined in below
    model_cols = list(pivot.columns)

    # Best and worst are chosen among the non-baseline models only
    challengers = pivot.drop(columns='openpop')
    pivot['best_name'] = challengers.idxmax(axis=1)
    pivot['best'] = challengers.max(axis=1)
    pivot['worst_name'] = challengers.idxmin(axis=1)
    pivot['worst'] = challengers.min(axis=1)

    if sort_by is None:
        pivot = pivot.sort_index(ascending=False, key=lambda x: x.str.lower())
    else:
        pivot = pivot.join(DAOS_DF.set_index('org_name')).sort_values(sort_by)

    # Shorten the longest DAO names so the tick labels fit
    pivot = pivot.rename(index={
        'DEAD FoundationsDAO': 'DEAD Foundations',
        'MetaCartel - MetaCartel Ventures': 'MC - MC Ventures',
    }).reset_index()

    # The segment starts at the lowest *model* value. Taking the minimum over every
    # numeric column would also include the DAO metadata joined in when `sort_by` is set.
    plt.hlines(y=pivot.index, xmin=pivot[model_cols].min(axis=1), xmax=pivot['best'], color='grey', zorder=1, alpha=0.6)
    plt.scatter(pivot['openpop'], pivot.index, color='skyblue', label='OpenPop')

    markers_dict = {
        "itemknn": "P",  # plus filled
        "userknn": "X",  # cross filled
        "plnsim": "s",  # square
        "lightgcn": "^",  # triangle up
    }
    colors_dict = {
        "itemknn": "C0",
        "userknn": "C1",
        "plnsim": "C2",
        "lightgcn": "C3",
    }

    for m in MODELS_ORDER:
        if m == 'openpop':
            # The baseline was already drawn above
            continue

        style = {'marker': markers_dict[m], 'color': colors_dict[m]}

        if plot_others:
            others = pivot[pivot['best_name'] != m]
            plt.scatter(others[m], others.index, s=15, **style)

        # Only this scatter carries a label, so each model shows up once in the legend
        winners = pivot[pivot['best_name'] == m]
        plt.scatter(winners['best'], winners.index, label=MODELS_NICE_NAME[m], s=35, **style)

        if plot_worst:
            losers = pivot[pivot['worst_name'] == m]
            plt.scatter(losers['worst'], losers.index, s=35, **style)

    # Legend laid out in a single row above the axes
    plt.legend(loc='lower left', mode='expand', bbox_to_anchor=(0, 1.02, 1, 0.2), ncol=5, borderaxespad=0, handletextpad=-.2)
    plt.yticks(pivot.index, pivot['org_name'])
    plt.xlabel(metric)

# precision@3 per DAO, rows ordered by the average number of open proposals; x axis fixed to [0, 1].
lollipop_baseline_vs(metric='precision@3', sort_by='folds_avg_open_proposals')
plt.xlim(0, 1)

In [None]:
#TODO: DELETEME
mean_all_results.pivot(index='org_name', columns='model', values='precision@3').loc['DEAD FoundationsDAO']

In [None]:
# Metric columns: names that contain '@' and no underscore.
_metric_cols = [col for col in mean_all_results.columns if '@' in col and '_' not in col]

In [None]:
# Export one lollipop figure per metric, rendered with the 'paper' seaborn context.
_figures_dir = Path('../figures/all_results')
# savefig does not create missing directories
_figures_dir.mkdir(parents=True, exist_ok=True)

sns.set_context('paper')
try:
    for metric in tqdm(_metric_cols):
        fig = plt.figure()
        lollipop_baseline_vs(metric, sort_by='folds_avg_open_proposals')
        plt.xlabel('')
        plt.savefig(_figures_dir / f'lollipop_results_{metric}.png', dpi=600, bbox_inches='tight')
        # Close explicitly so figures do not pile up in memory nor render inline
        plt.close(fig)
finally:
    # Restore the notebook context even if one of the figures fails
    sns.set_context('notebook')

In [None]:
# Median of 'folds_avg_test_vpv' over the DAOs that appear in mean_all_results.
# NOTE(review): the plots' hover_data uses 'folds_avg_test_vpp' — confirm 'folds_avg_test_vpv' is the intended column.
print(DAOS_DF.columns)
DAOS_DF[DAOS_DF['org_name'].isin(mean_all_results['org_name'].unique())].set_index('org_name')['folds_avg_test_vpv'].median()

## Choosing "the best"

In this section we will try to answer the question: what is the best overall model? And is that choice consistent across metrics?

In [None]:
_table = mean_all_results.groupby('model')[_metric_cols].median().T[MODELS_ORDER]

def _highlight_where(mask):
    """Return the yellow-background CSS where `mask` is true and '' elsewhere."""
    return np.where(mask, 'background-color: yellow', '')

def highlight_max(s):
    """Styler helper: highlight every entry of `s` equal to its maximum."""
    return _highlight_where(s == s.max())

def highlight_min(s):
    """Styler helper: highlight every entry of `s` equal to its minimum."""
    return _highlight_where(s == s.min())

_table.style.apply(highlight_max, axis=1)

This is very unreliable because we are comparing raw results between models. It will prioritize models that perhaps have a higher baseline. Let's now check how each model compares against the baseline.

### Relative comparison (model / baseline)

In [None]:
def _rel_to_openpop(df: pd.DataFrame):
    """For the rows of one DAO, return each model's metrics divided by the OpenPop ones.

    The result is indexed by model and only has the `_metric_cols` columns.
    """
    metrics = df.set_index('model').loc[:, _metric_cols]
    baseline = metrics.loc['openpop']
    return metrics / baseline
    
rel_all_results = mean_all_results.groupby('org_name').apply(_rel_to_openpop, include_groups=False)
rel_all_results

In [None]:
rel_all_results.groupby('model').median().T[MODELS_ORDER].style.apply(highlight_max, axis=1)

### Absolute comparison (model - baseline)

In [None]:
def _abs_to_openpop(df: pd.DataFrame):
    """For the rows of one DAO, return each model's metrics minus the OpenPop ones.

    The result is indexed by model and only has the `_metric_cols` columns.
    """
    metrics = df.set_index('model').loc[:, _metric_cols]
    baseline = metrics.loc['openpop']
    return metrics - baseline
    
abs_all_results = mean_all_results.groupby('org_name').apply(_abs_to_openpop, include_groups=False)
abs_all_results

In [None]:
abs_all_results.groupby('model').median().T[MODELS_ORDER].style.apply(highlight_max, axis=1)

### Ranking the models

In [None]:
def _ranking(df: pd.DataFrame):
    """For the rows of one DAO, rank the models on every metric (rank 1 = highest value).

    The result is indexed by model and only has the `_metric_cols` columns.
    """
    metrics = df.set_index('model').loc[:, _metric_cols]
    return metrics.rank(ascending=False)

rank_all_results = mean_all_results.groupby('org_name').apply(_ranking, include_groups=False)
rank_all_results

In [None]:
rank_all_results.groupby('model').median().T[MODELS_ORDER].style.apply(highlight_min, axis=1)

#### Big vs small daos

In [None]:
DAOS_INFO.columns

In [None]:
# Split the DAOs in "big" and "small" at a quantile of a DAOS_INFO column.
# The quantile and the column are taken from the constants below, so changing
# them changes the split (previously both were hard-coded a second time).
_CUT_PCT = .75
_CUT_COL = 'folds_avg_open_proposals'
_CUT = DAOS_INFO[_CUT_COL].quantile(_CUT_PCT)
print(f"{_CUT_PCT*100}% of {_CUT_COL}: {_CUT}")

# Rankings indexed by DAO, with the cut column attached
_aux = rank_all_results.reset_index('model').join(DAOS_INFO[[_CUT_COL]])

_big = _aux[_aux[_CUT_COL] >= _CUT]
_small = _aux[_aux[_CUT_COL] < _CUT]
print('Big:', _big.index.nunique(), '; small:', _small.index.nunique())

In [None]:
_big.groupby('model').median().T[MODELS_ORDER].style.apply(highlight_min, axis=1)

In [None]:
_small.groupby('model').median().T[MODELS_ORDER].style.apply(highlight_min, axis=1)

## How many times does each model beat the baseline

In [None]:
def _better_than_openpop(df: pd.DataFrame):
    """For the rows of one DAO, flag the metrics on which each model strictly beats OpenPop.

    The result is a boolean frame indexed by model with the `_metric_cols` columns.
    """
    metrics = df.set_index('model').loc[:, _metric_cols]
    baseline = metrics.loc['openpop']
    return metrics.gt(baseline)

bool_all_results = mean_all_results.groupby('org_name').apply(_better_than_openpop, include_groups=False)
bool_all_results

In [None]:
bool_all_results.groupby('model').sum().T[MODELS_ORDER].style.apply(highlight_max, axis=1)

## How many times is each model the best

In [None]:
def _is_best(df: pd.DataFrame):
    """For the rows of one DAO, flag the model(s) that reach the maximum of each metric.

    The result is a boolean frame indexed by model with the `_metric_cols` columns;
    models tied at the maximum are all flagged.
    """
    metrics = df.set_index('model').loc[:, _metric_cols]
    return metrics.eq(metrics.max(axis=0))

is_best_all_results = mean_all_results.groupby('org_name').apply(_is_best, include_groups=False)
is_best_all_results

In [None]:
is_best_all_results.groupby('model').sum().T[MODELS_ORDER].style.apply(highlight_max, axis=1)

### Big vs small

In [None]:
# Split the DAOs in "big" and "small" at a quantile of a DAOS_INFO column.
# The quantile and the column are taken from the constants below, so changing
# them changes the split (previously both were hard-coded a second time).
_CUT_PCT = .75
_CUT_COL = 'folds_avg_open_proposals'
_CUT = DAOS_INFO[_CUT_COL].quantile(_CUT_PCT)
print(f"{_CUT_PCT*100}% of {_CUT_COL}: {_CUT}")

# "Is the best model" flags indexed by DAO, with the cut column attached
_aux = is_best_all_results.reset_index('model').join(DAOS_INFO[[_CUT_COL]])

_big = _aux[_aux[_CUT_COL] >= _CUT]
_small = _aux[_aux[_CUT_COL] < _CUT]
print('Big:', _big.index.nunique(), '; small:', _small.index.nunique())

In [None]:
_big.groupby('model').sum().T[MODELS_ORDER].astype(int).style.apply(highlight_max, axis=1)

In [None]:
_small.groupby('model').sum().T[MODELS_ORDER].astype(int).style.apply(highlight_max, axis=1)

### Creating a waffle graph

In [None]:
from pywaffle import Waffle

In [None]:
# Metric shown in the waffle chart.
_WC_METRIC = 'precision@5'

# Per model, number of DAOs where it is (one of) the best; rows follow MODELS_ORDER.
_aux_best = is_best_all_results.groupby('model').sum().loc[MODELS_ORDER]
print(_aux_best[_WC_METRIC])

fig = plt.figure(
    FigureClass=Waffle,
    rows=3,
    values=_aux_best[_WC_METRIC].to_dict(),
    # Alternative legend placement, below the chart:
    # legend={'loc': 'lower left', 'bbox_to_anchor': (0, -0.2), 'ncol': 5},
    legend={'loc': 'upper left', 'bbox_to_anchor': (1.05, 1)},
)
plt.title(f'Waffle chart {_WC_METRIC}')