In [1]:
from typing import Dict, List, Tuple, Union, Any, Optional

import pandas as pd
import numpy as np
import os

from pathlib import Path

import torch
import ray
from ray import train, tune

from src.datasets import daocensus

%load_ext autoreload
%autoreload 2

### Parameters

In [2]:
TRANSFORMER_MODEL = "all-mpnet-base-v2"

In [3]:
dfptext = pd.read_csv('./snapshot_proposals.csv')[['proposal_id', 'title', 'description']]
dfv, dfp = daocensus.get("./data/daos-census", 'Decentraland', 'snapshot')
dfv['voter'] = dfv['voter'].astype('str')
dfp = dfp.merge(dfptext, how='left', left_on='platform_proposal', right_on='proposal_id')
print(dfv.info())
print(dfp.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116560 entries, 0 to 116559
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   platform       116560 non-null  object        
 1   name           116560 non-null  object        
 2   id             116560 non-null  object        
 3   proposal       116560 non-null  category      
 4   deployment     116560 non-null  object        
 5   platform_vote  116560 non-null  object        
 6   voter          116560 non-null  object        
 7   date           116560 non-null  datetime64[ns]
 8   choice         116560 non-null  object        
 9   weight         116560 non-null  float64       
dtypes: category(1), datetime64[ns](1), float64(1), object(7)
memory usage: 8.3+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1942 entries, 0 to 1941
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype         
---  ------ 

## Getting the embeddings of each proposal

In [4]:
from sentence_transformers import SentenceTransformer
import pickle

In [5]:
model = SentenceTransformer(TRANSFORMER_MODEL)

In [6]:
embeddings_folder = Path('./data/daos-census-text')
embeddings_file = embeddings_folder / 'embeddings.pkl'

dfp['title-description'] = dfp['title'] + '\n' + dfp['description']

if not embeddings_folder.exists():
    embeddings_folder.mkdir()

embeddings = None
if embeddings_file.exists():
    print('Embeddings file saved, reading it instead...')
    with open(embeddings_file, 'rb') as f:
        obj = pickle.load(f)
    
    embeddings = obj['embeddings']
    print("Embeddings shape:", embeddings.shape)
    ids = obj['ids']

if embeddings is None:
    embeddings = model.encode(dfp['title-description'], show_progress_bar=True, normalize_embeddings=True)
    with open(embeddings_file, 'wb+') as f:
        pickle.dump({'sentences': dfp['title-description'], 'embeddings': embeddings, 'ids': dfp['id']}, f, protocol=pickle.HIGHEST_PROTOCOL)

Embeddings file saved, reading it instead...
Embeddings shape: (1942, 768)


In [7]:
embeddings_sum = embeddings.sum(axis=0)
print(embeddings_sum.shape)
embeddings = embeddings / embeddings_sum
embeddings

(768,)


array([[ 1.1403942e-03,  8.6159637e-04,  6.9543940e-04, ...,
        -2.7190479e-03, -1.0145843e-03,  3.9847717e-03],
       [-4.0623912e-04,  4.4145231e-04,  9.4736530e-04, ...,
         1.6015382e-03,  1.8749243e-03, -2.8606903e-04],
       [ 2.3584017e-03,  5.8685150e-04,  7.1522954e-04, ...,
        -2.2754688e-03, -1.4056214e-04, -3.7364583e-03],
       ...,
       [ 7.4752036e-04,  6.6007714e-04,  6.3091837e-04, ...,
         2.4024290e-03, -2.0156223e-03,  3.5178147e-03],
       [ 1.0567969e-03,  4.8019552e-05,  3.8508396e-04, ...,
        -2.2719074e-04,  3.0133144e-03, -4.8025427e-03],
       [-7.7102159e-04,  5.3977530e-04,  6.2477676e-04, ...,
         4.1932138e-03, -3.2049862e-05,  5.8418047e-03]], dtype=float32)

In [8]:
dfp['embeddings'] = list(embeddings)

## Getting the embeddings of each user

The embeddings of each user are the embeddings of each proposal the user has voted in

In [9]:
dfp['nbidx'] = dfp.index
_dfv = dfv.merge(dfp[['id', 'nbidx']], how='left', left_on='proposal', right_on='id')
print('dfp:', dfp.shape, 'dfv:', dfv.shape, '_dfv:', _dfv.shape)
votes_embeddings = embeddings[_dfv['nbidx']]
votes_embeddings.shape

dfp: (1942, 15) dfv: (116560, 10) _dfv: (116560, 12)


(116560, 768)

In [10]:
x=np.array([3,4])
x = x / np.linalg.norm(x)
print(x)
print(np.linalg.norm(x))
print(x.sum())

[0.6 0.8]
1.0
1.4


In [11]:
voter_embeddings = dfv.groupby('voter').apply(lambda x: votes_embeddings[x.index].sum(axis=0))
voter_embeddings = voter_embeddings.apply(lambda x: x/np.linalg.norm(x))
# msk = (np.linalg.norm(np.stack(voter_embeddings.to_numpy()), axis=1) > 0.9)
voter_embeddings

voter
0x0019220ddabe5b8438a6be6ebca31ef01ca69965    [0.00057990896, 0.00088156166, 0.0009439609, 0...
0x0023a0bf1cb20362f847dfc977d2c815f6ac8021    [0.005858055, 0.0020818706, 0.002051057, 0.003...
0x0026dd985da16f70e9d9cf739b08c6cdf6a7f407    [0.0004940444, 0.0003696056, 0.00017160185, 0....
0x0029ab135b5be72168bf80f140d60a9264dbd0c5    [0.0015585453, 0.00061846466, 0.0007543223, 0....
0x003a3eb1a1d2ad3bea19ae06324727beeeec2e34    [0.0029788157, 0.0022016685, 0.00209749, 0.001...
                                                                    ...                        
0xffd92144cafd599a5a93e4805ca4d8f0e666d623    [0.0018578778, 0.0015816463, 0.001392865, 0.00...
0xffd9cd75445459184a4b89a0fbd1720ab2957738    [0.0019817022, 0.0022677148, 0.0011903326, 0.0...
0xffe6d9d29480d0673c4fc46da22d21e245c8e731    [0.0024969198, 0.0013630107, 0.0004593444, 0.0...
0xffed2f64c9f796d4258618a7a375ed816205821a    [-0.0001579282, 0.0002713099, 0.0002999211, 8....
0xfff28923092a8294456206a6d0734514

In [12]:
dfv.groupby('voter').size().sort_values()

voter
0x4c86663bbe4344b674c770464bd044bed7a7d78b      1
0x0026dd985da16f70e9d9cf739b08c6cdf6a7f407      1
0x005cee5cfb7ad2c740ab4e0cdfbcebad219bdb5f      1
0x00668bd79ede077b99bbe1c4db59418bc333d4cf      1
0x0072bb98a5444bbaf5b1dff112dbfa0f5e3f7a9f      1
                                             ... 
0x613e052555ac74ff6af0fc64e40e8035c1e9dcf8    652
0xd6eff8f07caf3443a1178407d3de4129149d6ef6    727
0xb44ac514588be99870220d12120815bc3d349507    729
0x521b0fef9cdcf250abaf8e7bc798cbe13fa98692    760
0xc54a6c3778016b06cbd126ccc3b5bc06c5f666fb    885
Length: 7268, dtype: int64

## Making recommendations

Because they are normalized, our ranking score can be just the dot product of a user and a proposal

In [13]:
user_emb = voter_embeddings['0x613e052555ac74ff6af0fc64e40e8035c1e9dcf8']
scores = embeddings @ user_emb
scores.argsort()

array([ 324,  940,  320, ...,  413, 1134,  457])

In [14]:
list(range(10))[-3:]

[7, 8, 9]

In [15]:
def recommend_emb(emb, k):
    # 1D n_proposals
    scores = embeddings @ emb
    best = (-scores).argsort()

    dfbest = dfp.iloc[best[:k]].copy()
    dfbest['score'] = scores[best[:k]]
    return dfbest

In [16]:
def recommend_user(voter: str, k: int = 5):
    # 1D emb_size
    user_emb = voter_embeddings[voter]
    return recommend_emb(user_emb, k)

recommend_user('0x613e052555ac74ff6af0fc64e40e8035c1e9dcf8')[['title', 'description', 'score']]

Unnamed: 0,title,description,score
457,Logarithmic VP conversion after a certain amou...,> by 0x5985eb4a8e0e1f7bca9cc0d7ae81c2943fb205b...,6.907498
1134,Should Voter Power Delegation be activated in ...,> by 0x9982b469910c2ee2ea566dcfcc250cdd3405639...,5.945354
413,Can we implement 'R' and 'G' keys to allow for...,> by 0xb2223f4038def8a62a86e3c4b108cdfe00a74c4...,5.478396
873,Should VP keep assigned to the landowner in a ...,> by 0x87956abc4078a0cc3b89b419928b857b8af826e...,5.168402
1736,Create Group Chats between Mutual Friends,> by 0x988262eb3225ea4690f6b4846e36c700345bf6f...,4.962543


In [17]:
def recommend_proposal(proposal: str, k: int = 5):
    # 1D emb_size
    prop_emb = dfp.set_index('id')['embeddings'][proposal]
    return recommend_emb(prop_emb, k)

display(dfp.set_index('id').loc['433b7e43-77b0-5ea1-bac3-0c3071363a56'])
recommend_proposal('433b7e43-77b0-5ea1-bac3-0c3071363a56')

platform                                                        snapshot
name                                                        Decentraland
platform_deployment                                     snapshot.dcl.eth
deployment                          41fd8de5-f8e2-5023-86a3-825c49e9ad7f
platform_proposal      0xdbe9b2df1d00c0d264bffb3f10f1332a82d8a8a70c12...
author                        0xbb7b59afa3a0e5be143b8fe9c641f00c1ecb9d69
date                                                 2023-02-14 16:19:06
votes_count                                                          385
proposal_id            0xdbe9b2df1d00c0d264bffb3f10f1332a82d8a8a70c12...
title                  Should DAO proposals have a pre-defined 'Absta...
description            > by 0x511a22cdd2c4ee8357bb02df2578037ffe8a4d8...
title-description      Should DAO proposals have a pre-defined 'Absta...
embeddings             [-0.00036667797, 0.00013968443, -0.00012323353...
nbidx                                              

Unnamed: 0,platform,name,platform_deployment,id,deployment,platform_proposal,author,date,votes_count,proposal_id,title,description,title-description,embeddings,nbidx,score
940,snapshot,Decentraland,snapshot.dcl.eth,5f8989a6-464f-57ef-bbee-74f3ce8b001f,41fd8de5-f8e2-5023-86a3-825c49e9ad7f,QmcY88YFJLgiMatBPD1j9KUxBwPhymW5DdVP4JQDj53jYr,0x5e23d08324f017d5425e59a2782c9ae27ace0958,2021-12-23 22:13:33,14,QmcY88YFJLgiMatBPD1j9KUxBwPhymW5DdVP4JQDj53jYr,Petition to add event reviews so we can improv...,> by 0xbef69e080a0b127f7cf6f3b658f07c90588cea6...,Petition to add event reviews so we can improv...,"[-0.0015171153, 0.00076421956, 0.00042039537, ...",940,2.341274
1537,snapshot,Decentraland,snapshot.dcl.eth,613dbb55-2dd7-51fc-a289-c1a39a6461ff,41fd8de5-f8e2-5023-86a3-825c49e9ad7f,0xfad0ce3f336f9104fd8d5cc2cb3c012c21a2116d7f7b...,0xbb7b59afa3a0e5be143b8fe9c641f00c1ecb9d69,2023-03-06 20:56:57,96,0xfad0ce3f336f9104fd8d5cc2cb3c012c21a2116d7f7b...,Community support for a paid pre-check on link...,> by 0x895be97bdb9f8a244c472b18ea96dee39ddf8fe...,Community support for a paid pre-check on link...,"[-0.00037167326, 0.00045459936, 8.63669e-05, -...",1537,2.167494
1644,snapshot,Decentraland,snapshot.dcl.eth,a7eabea1-95b5-5621-8bc5-8f09f313c45e,41fd8de5-f8e2-5023-86a3-825c49e9ad7f,bafkreidmcnjlewiwuvyty5k2atld2qtnesk5dky2gwhwo...,0x5e23d08324f017d5425e59a2782c9ae27ace0958,2022-09-07 15:39:27,92,bafkreidmcnjlewiwuvyty5k2atld2qtnesk5dky2gwhwo...,Should there be limited time to change voting ...,> by 0x247e0896706bb09245549e476257a0a1129db41...,Should there be limited time to change voting ...,"[0.00034874413, 0.00051068194, 0.00022908015, ...",1644,2.03605
1242,snapshot,Decentraland,snapshot.dcl.eth,5be72d6c-2388-5aaa-8ac6-20be847a82cc,41fd8de5-f8e2-5023-86a3-825c49e9ad7f,0xdce2b640a47ba0c44bfa9e9a88ad8bffb9a5a3ffbff5...,0xbb7b59afa3a0e5be143b8fe9c641f00c1ecb9d69,2023-01-06 17:54:38,201,0xdce2b640a47ba0c44bfa9e9a88ad8bffb9a5a3ffbff5...,Should DAO proposals have a pre-defined 'Absta...,> by 0x511a22cdd2c4ee8357bb02df2578037ffe8a4d8...,Should DAO proposals have a pre-defined 'Absta...,"[-0.0007028319, 0.00016267561, -7.843084e-06, ...",1242,2.001708
1149,snapshot,Decentraland,snapshot.dcl.eth,62662ecc-3126-5798-bbd9-b1421082be58,41fd8de5-f8e2-5023-86a3-825c49e9ad7f,QmVCfrGX892iaDLkp5dBNJRa93U2DVbg3JNcD14mfGvMS1,0x5e23d08324f017d5425e59a2782c9ae27ace0958,2022-05-19 23:18:01,161,QmVCfrGX892iaDLkp5dBNJRa93U2DVbg3JNcD14mfGvMS1,MANA incentives for USDC-ICE LP on QuickSwap,> by 0x521b0fef9cdcf250abaf8e7bc798cbe13fa9869...,MANA incentives for USDC-ICE LP on QuickSwap\n...,"[0.00090394466, 0.0004753961, 0.000633631, 0.0...",1149,1.981512


## Evaluation

First, we will evaluate if the user recommendations are any good using common off-line metrics.

In [18]:
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k

In [19]:
dfv['rating'] = 1
np_voter_embeddings = np.stack(voter_embeddings.to_numpy())
print(np_voter_embeddings.shape)

(7268, 768)


In [20]:
def recommend_all(k: int = 5):
    scores = np_voter_embeddings @ embeddings.T
    best = (-scores).argsort(axis=1)
    topk = best[:, :k]

    # create df with columns
    # userID, itemID, prediction
    uid = np.repeat(np.arange(np_voter_embeddings.shape[0]), k)
    iid = topk.flatten()

    # transform int to id

    return pd.DataFrame({
        'voter': voter_embeddings.index[uid],
        'proposal': ids[iid],
        'prediction': 1,
        'score': scores[uid, iid],
    })

recommend_all(2)

Unnamed: 0,voter,proposal,prediction,score
324,0x0019220ddabe5b8438a6be6ebca31ef01ca69965,b8df528b-a34e-5e1a-afe2-6d2615054032,1,4.027319
940,0x0019220ddabe5b8438a6be6ebca31ef01ca69965,5f8989a6-464f-57ef-bbee-74f3ce8b001f,1,3.714600
940,0x0023a0bf1cb20362f847dfc977d2c815f6ac8021,5f8989a6-464f-57ef-bbee-74f3ce8b001f,1,3.817560
324,0x0023a0bf1cb20362f847dfc977d2c815f6ac8021,b8df528b-a34e-5e1a-afe2-6d2615054032,1,3.368389
940,0x0026dd985da16f70e9d9cf739b08c6cdf6a7f407,5f8989a6-464f-57ef-bbee-74f3ce8b001f,1,4.873110
...,...,...,...,...
940,0xffe6d9d29480d0673c4fc46da22d21e245c8e731,5f8989a6-464f-57ef-bbee-74f3ce8b001f,1,4.668789
324,0xffed2f64c9f796d4258618a7a375ed816205821a,b8df528b-a34e-5e1a-afe2-6d2615054032,1,4.856295
940,0xffed2f64c9f796d4258618a7a375ed816205821a,5f8989a6-464f-57ef-bbee-74f3ce8b001f,1,4.709621
457,0xfff28923092a8294456206a6d0734514bebcbe67,11946f30-e09f-5638-be0a-82dff99d3112,1,6.668961


In [21]:
print('map:', map_at_k(dfv, recommend_all(5), k=5, col_user='voter', col_item='proposal'))
print('ndcg:', ndcg_at_k(dfv, recommend_all(5), k=5, col_user='voter', col_item='proposal'))
print('precision:', precision_at_k(dfv, recommend_all(5), k=5, col_user='voter', col_item='proposal'))
print('recall:', recall_at_k(dfv, recommend_all(5), k=5, col_user='voter', col_item='proposal'))

map: 0.003617985964794645
ndcg: 0.023747986640875404
precision: 0.017996697853604842
recall: 0.006523095073203304


In [22]:
dfv['prediction'] = 1
print('map:', map_at_k(dfv, dfv, k=5, relevancy_method='top_k', col_user='voter', col_item='proposal'))
print('ndcg:', ndcg_at_k(dfv, dfv, k=5, relevancy_method='top_k', col_user='voter', col_item='proposal'))
print('precision:', precision_at_k(dfv, dfv, relevancy_method='top_k', k=5, col_user='voter', col_item='proposal'))
print('recall:', recall_at_k(dfv, dfv, relevancy_method='top_k', k=5, col_user='voter', col_item='proposal'))

map: 0.741011513447713
ndcg: 1.0
precision: 0.6418547055586131
recall: 0.741011513447713
