In [1]:
import os
from pathlib import Path
import random

import datetime as dt
import itertools as it

from collections import namedtuple

from tqdm.notebook import tqdm # Progress bars

# https://import-as.github.io
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import preprocessing as pp

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

DW = Path("/kaggle/input/dao-analyzer")



Hyperparameters table in [Google Drive](https://docs.google.com/spreadsheets/d/1riafpWt1563w9pbqdt1g2QZVkc7TfRWGzFaCG5rudDI/edit?usp=sharing)

# Reading data

In [2]:
with open(DW/'version.txt', 'r') as f:
    print(f.readline().strip())
    
with open(DW/'update_date.txt', 'r') as f:
    UPDATE_DATE = dt.datetime.fromisoformat(f.readline().strip())
    print(UPDATE_DATE)
    
def get_df(name):
    df = pd.read_csv(DW / "daostack" / f'{name}.csv', index_col='id')
    df = df.drop(columns=['Unnamed: 0'])

    time_cols = [col for col in df.columns if col.endswith('At')]
    df[time_cols] = df[time_cols].apply(pd.to_datetime, errors='coerce', unit='s', origin='unix')
    df['network'] = df['network'].astype('category')
    
    return df

def filter_df(df: pd.DataFrame):
    return df[df['dao'].isin(TESTING_DAO_IDS)]

1.1.12.post2
2023-07-21 01:59:36.103747


In [3]:
dfd = get_df('daos')

TESTING_DAO_NAMES = {'dxDAO', 'xDXdao'}
TESTING_DAO_IDS = set(dfd[dfd['name'].isin(TESTING_DAO_NAMES)].index)
MIN_VOTES = 6

In [4]:
def _print_dfv_summary(df):
    print(f"Contamos con {len(df)} votos en {df['proposal'].nunique()} propuestas, por {df['voter'].nunique()} usuarios únicos")

# dfp stands for dataframe proposals
dfp = filter_df(get_df('proposals'))
dfv = filter_df(get_df('votes'))
dfh = filter_df(get_df('reputationHolders'))

CAT_COLS = ['voter', 'dao', 'proposal']
dfv[CAT_COLS] = dfv[CAT_COLS].astype('category')

_print_dfv_summary(dfv)

vpu = dfv.groupby('voter').size()
def _aux_n_users(x):
    print(f"Hay solo {sum(vpu >= x)} ({(vpu > x).mean()*100:.2f}%) usuarios con más de {x} votos")
_aux_n_users(3)
_aux_n_users(6)
_aux_n_users(12)

# Nos quedamos solo con los usuarios que tienen más de 3 votos
dfv = dfv[dfv['voter'].isin(vpu[vpu >= MIN_VOTES].index)]

dfv

Contamos con 8479 votos en 2226 propuestas, por 193 usuarios únicos
Hay solo 129 (60.62%) usuarios con más de 3 votos
Hay solo 104 (51.30%) usuarios con más de 6 votos
Hay solo 75 (36.79%) usuarios con más de 12 votos


Unnamed: 0_level_0,network,createdAt,voter,outcome,reputation,dao,proposal
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0x000682a038b22925343bd5b9e84acb424a9d148843261372dc83514589bfdc3b,mainnet,2023-03-14 02:47:23,0x91628ddc3a6ff9b48a2f34fc315d243eb07a9501,Pass,90248500000000000000000,0x519b70055af55a007110b4ff99b0ea33071c720a,0xb2a2b6baeed346c4e49a241044bd547504d02a3b593a...
0x001cde4d98e26191fc34308be3b51ce50d3eb9561b1b01b768722bf9abe04457,mainnet,2022-05-22 02:09:49,0xabd238fa6b6042438fbd22e7d398199080b4224c,Pass,68359641109260095429647,0x519b70055af55a007110b4ff99b0ea33071c720a,0x53c9666338692a720a3ee3841b49f58a4d580cf7ed11...
0x003b53eeeb314ab7fa42f88f8630d7fd6ea17184c1f05a68abba789c881b9119,mainnet,2020-08-28 12:12:34,0xb0e83c2d71a991017e0116d58c5765abc57384af,Pass,18016183440825327276060,0x519b70055af55a007110b4ff99b0ea33071c720a,0xdd9f7b1cbd148a266c6a6edbcf271ea0248bfa7f9874...
0x003e74b43ea9d833d524b82c47c4e1b2b02f84506bd233382840a5f63add4e3d,mainnet,2023-03-26 15:16:47,0x91628ddc3a6ff9b48a2f34fc315d243eb07a9501,Pass,90248500000000000000000,0x519b70055af55a007110b4ff99b0ea33071c720a,0x2f701be3fbd3e7e706f1aac4d36839c18a56a52e45e5...
0x0057ad41e781a0acd4e7a06be5cea9dc3fcd5fc259ff409b8ed4c504e01cfd8a,mainnet,2022-12-02 20:13:47,0x8e900cf9bd655e34bb610f0ef365d8d476fd7337,Pass,86679381514995439609053,0x519b70055af55a007110b4ff99b0ea33071c720a,0xb7ff31ec2bcdb8a254b8a1b06c6d2080a4089135b03e...
...,...,...,...,...,...,...,...
0xffd8ca7f39b9cf2c8de1914306c1b69e78a958ba90dbd73b0bb6332d3771650e,xdai,2021-05-31 10:15:35,0x6f7c864cc0fc9fb2fe26f53c031d3deec0b8d7d5,Pass,4503667578035201114112,0xe716ec63c5673b3a4732d22909b38d779fa47c3f,0x84d7402de697479a586620cda052c2b6311238d8c9a0...
0xffe4fb38b16e8963340aa482be165fb4d650f321975e5f5faa3aa0e9abbf9528,xdai,2021-05-09 07:15:35,0x5a3992044a131c2f633394065c13ba1b33cdffd9,Pass,17609542272000000000000,0xe716ec63c5673b3a4732d22909b38d779fa47c3f,0x5f50da5227bbb459f6f5a78be5d2cf0000da5c7ceca1...
0xffe62001e3e32cabf75e660f91057fc7ed13232c3260f4d5e1469259d62975d9,xdai,2021-10-26 18:37:55,0xc4a69fbf4511a1377161834cb7a3b8766953db02,Pass,372907972512994918632,0xe716ec63c5673b3a4732d22909b38d779fa47c3f,0xbf65ac67817ace37cc73d62b507336c8adc41454358c...
0xfff1158d5254e39a4f9f1fb14f10c28e77c730f6d608c766926a6791dd9ffaa2,xdai,2022-05-30 08:49:45,0xd97672177e0673227fa102c91bfa8b8cfa825141,Pass,42056811887981891058110,0xe716ec63c5673b3a4732d22909b38d779fa47c3f,0xe7a1d63f8041725d87fff3443ab5a64f5df60fe8ce54...


In [5]:
dfp_nlp = pd.read_csv('/kaggle/input/dao-proposals-topic-models/dfp_nlp.csv')
# dfp_nlp

## Train and test split

We can't just split the dataset uniformly, as we want every user to appear both in training and testing, as it would happen in real life. We can analyze cold starts later.

Furthermore, to better simulate irl conditions, instead of grabbing a random subset, we will sort the votes chronologically, and recommend the other proposals.

In [6]:
# Based on Microsoft's recommenders repository (MIT License)
DEFAULT_USER_COL = 'voter'
DEFAULT_ITEM_COL = 'proposal'
DEFAULT_TIMESTAMP_COL = 'createdAt'

# https://github.com/microsoft/recommenders/blob/main/recommenders/datasets/split_utils.py
def process_split_ratio(ratio):
    """Generate split ratio lists.

    Args:
        ratio (float or list): a float number that indicates split ratio or a list of float
        numbers that indicate split ratios (if it is a multi-split).

    Returns:
        tuple:
        - bool: A boolean variable multi that indicates if the splitting is multi or single.
        - list: A list of normalized split ratios.
    """
    if isinstance(ratio, float):
        if ratio <= 0 or ratio >= 1:
            raise ValueError("Split ratio has to be between 0 and 1")

        multi = False
    elif isinstance(ratio, list):
        if any([x <= 0 for x in ratio]):
            raise ValueError(
                "All split ratios in the ratio list should be larger than 0."
            )

        # normalize split ratios if they are not summed to 1
        if math.fsum(ratio) != 1.0:
            ratio = [x / math.fsum(ratio) for x in ratio]

        multi = True
    else:
        raise TypeError("Split ratio should be either float or a list of floats.")

    return multi, ratio

# https://github.com/microsoft/recommenders/blob/main/recommenders/datasets/python_splitters.py
def _do_stratification(
    data,
    ratio=0.75,
    filter_by="user",
    is_random=True,
    seed=42,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    # A few preliminary checks.
    if not (filter_by == "user" or filter_by == "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if col_user not in data.columns:
        raise ValueError("Schema of data not valid. Missing User Col")

    if col_item not in data.columns:
        raise ValueError("Schema of data not valid. Missing Item Col")

    if not is_random:
        if col_timestamp not in data.columns:
            raise ValueError("Schema of data not valid. Missing Timestamp Col")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    ratio = ratio if multi_split else [ratio, 1 - ratio]

    if is_random:
        np.random.seed(seed)
        data["random"] = np.random.rand(data.shape[0])
        order_by = "random"
    else:
        order_by = col_timestamp

    data = data.sort_values([split_by_column, order_by])

    groups = data.groupby(split_by_column)

    data["count"] = groups[split_by_column].transform("count")
    data["rank"] = groups.cumcount() + 1

    if is_random:
        data = data.drop("random", axis=1)

    splits = []
    prev_threshold = None
    for threshold in np.cumsum(ratio):
        condition = data["rank"] <= round(threshold * data["count"])
        if prev_threshold is not None:
            condition &= data["rank"] > round(prev_threshold * data["count"])
        splits.append(data[condition].drop(["rank", "count"], axis=1))
        prev_threshold = threshold

    return splits

def pandas_chrono_split(df, **kwargs):
    return _do_stratification(df, is_random=False, **kwargs)

def pandas_stratified_split(df, **kwargs):
    return _do_stratification(df, is_random=True, **kwargs)

In [7]:
_print_dfv_summary(dfv)

Contamos con 8303 votos en 2216 propuestas, por 104 usuarios únicos


In [8]:
train_df, test_df = _do_stratification(dfv, is_random=True, ratio=0.75)
print(f"train: {len(train_df)}, test: {len(test_df)}, real ratio: {len(train_df)/(len(train_df)+len(test_df))}")
test_df = test_df[
    (test_df['voter'].isin(train_df['voter'].unique())) & \
    (test_df['proposal'].isin(train_df['proposal'].unique()))
]
print(f"after pruning: train: {len(train_df)}, test: {len(test_df)}, real ratio: {len(train_df)/(len(train_df)+len(test_df))}")
_print_dfv_summary(train_df)
_print_dfv_summary(test_df)

train: 6218, test: 2085, real ratio: 0.7488859448392148
after pruning: train: 6218, test: 1846, real ratio: 0.7710813492063492
Contamos con 6218 votos en 2034 propuestas, por 104 usuarios únicos
Contamos con 1846 votos en 1050 propuestas, por 104 usuarios únicos


In [9]:
encoder_user = pp.LabelEncoder()
encoder_prop = pp.LabelEncoder()

train_df['uid'] = encoder_user.fit_transform(train_df['voter'])
test_df['uid'] = encoder_user.transform(test_df['voter'])

N_USERS = len(encoder_user.classes_)

train_df['pid'] = encoder_prop.fit_transform(train_df['proposal'])
test_df['pid'] = encoder_prop.transform(test_df['proposal'])

N_ITEMS = len(encoder_prop.classes_)
N_ITEMS, N_USERS

(2034, 104)

# Transform data to tensors

In [10]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.3.1.tar.gz (661 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.6/661.6 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: torch_geometric
  Building wheel for torch_geometric (pyproject.toml) ... [?25ldone
[?25h  Created wheel for torch_geometric: filename=torch_geometric-2.3.1-py3-none-any.whl size=910459 sha256=8491abb4e8e760c84509f2a91b29a6a59627dd94b50b696043b2bb4b81e3828c
  Stored in directory: /root/.cache/pip/wheels/ac/dc/30/e2874821ff308ee67dcd7a66dbde912411e19e35a1addda028
Successfully built torch_geometric
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.3.1


In [11]:
from tqdm.autonotebook import tqdm, trange

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric as PyG
from torch_geometric.nn.conv import MessagePassing

  from tqdm.autonotebook import tqdm, trange


In [12]:
# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [13]:
u_t = torch.LongTensor(train_df['uid'])
# We shift every pid index by uid to ensure the node id is not repeated
# with the user node id
p_t = torch.LongTensor(train_df['pid']) + N_USERS

# Because it is a bidirectional graph, we add both u->p and p->u
edge_index = torch.stack([
    torch.cat([u_t, p_t]),
    torch.cat([p_t, u_t]),
]).to(device)

edge_index.shape

torch.Size([2, 12436])

In [14]:
BatchSample = namedtuple('BatchSample', 'users pos neg')
def batch_sample(df, batch_size: int, n_users: int = N_USERS, n_items: int = N_ITEMS) -> BatchSample:
    """ Returns a sample of positive and negative indices """
    if batch_size > n_users:
        raise ValueError("Sample size should be smaller than total number of users")
        
    def _sample_neg(x):
        """ Returns an uid that is not in x """
        return random.choice(pd.RangeIndex(n_users).difference(x))
        
    voted = df.groupby('uid')['pid'].apply(list).reset_index()
        
    users = voted.sample(batch_size).index.sort_values()
    users_df = pd.DataFrame(users, columns=['uid'])
    
    voted = pd.merge(voted, users_df, how='right', on=['uid'])
    # in the target class (voted)
    pos = voted['pid'].apply(lambda x: random.choice(x)).values
    # not in the target class (not voted)
    neg = voted['pid'].apply(_sample_neg).values
    
    return BatchSample(
        users=torch.LongTensor(users).to(device),
        # Adding n_users as it is an item index
        pos=torch.LongTensor(pos).to(device) + n_users,
        neg=torch.LongTensor(neg).to(device) + n_users,
    )

batch_sample(train_df, 16)

BatchSample(users=tensor([ 14,  27,  35,  40,  55,  67,  76,  78,  83,  84,  88,  92,  97,  98,
        100, 101]), pos=tensor([ 638, 2021, 1537, 1750, 2114, 1347,  188,  436, 1181, 1908, 1635, 1804,
        1687, 2012, 1797, 1580]), neg=tensor([167, 111, 171, 127, 126, 165, 193, 134, 203, 148, 129, 137, 164, 137,
        136, 116]))

# Creating the model

We will implement LightGCN in PyG

In [15]:
EPOCHS = 50
BATCH_SIZE = 16
LEARNING_RATE = 0.0001
DECAY = 0.0001
# Latent space dimensions
EMBEDDING_DIM = 32

In [16]:
# From https://medium.com/stanford-cs224w/recommender-systems-with-gnns-in-pyg-d8301178e377
# and https://github.com/microsoft/recommenders/blob/main/recommenders/models/deeprec/models/graphrec/lightgcn.py

class LightGCNConv(MessagePassing):
    def __init__(self):
        super().__init__(aggr='add')
    
    # def aggregate(self, x, messages, index):
    #     return torch_scatter.scatter(messages, index, self.node_dim, reduce='sum')
    
    def forward(self, x, edge_index):
        # Compute normalization
        from_, to_ = edge_index
        deg = PyG.utils.degree(to_, x.size(0), dtype=x.dtype)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        norm = deg_inv_sqrt[from_] * deg_inv_sqrt[to_]
        # Start propagating messages
        return self.propagate(edge_index, x=x, norm=norm)

    def message(self, x_j, norm):
        return norm.view(-1, 1) * x_j
    
# Define model
class LightGCN(nn.Module):
    def __init__(self, n_users: int, n_items: int, num_layers: int, embedding_dim: int = EMBEDDING_DIM):
        super().__init__()
        self.n_users = n_users
        self.n_items = n_items
        
        self.embedding = nn.Embedding(n_users + n_items, embedding_dim)
        
        self.layers = nn.ModuleList(it.repeat(LightGCNConv(), num_layers))
        nn.init.normal_(self.embedding.weight, std=0.1)
    
    def forward(self, edge_index):
        emb = self.embedding.weight
        embs = [self.embedding.weight]
        for conv in self.layers:
            emb = conv(x=emb, edge_index=edge_index)
            embs.append(emb)
        
        # perform weighted sum on output of all layers to yield final embedding
        out = torch.mean(torch.stack(embs, dim=0), dim=0)
        return emb, out
    
model = LightGCN(n_users=N_USERS, n_items=N_ITEMS, num_layers=3, embedding_dim=EMBEDDING_DIM)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
model

LightGCN(
  (embedding): Embedding(2138, 64)
  (layers): ModuleList(
    (0-2): 3 x LightGCNConv()
  )
)

In [17]:
def _assert_eq(x, y):
    import sys
    if x != y:
         print(f"Expected {x}, got {y}", file=sys.stderr)

class Metrics:
    def __init__(self, model, Ks=[6,12]):
        super().__init__()
        self.model = model
        
        if isinstance(Ks, int):
            self.Ks = [Ks]
        else:
            self.Ks = Ks
        
        self.epochs = 0
        self.batches = 0
        self.precision = {k:[] for k in self.Ks}
        self.recall = {k:[] for k in self.Ks}
        self.train_precision = {k:[] for k in self.Ks}
        self.train_recall = {k:[] for k in self.Ks}
        self.bpr_loss = []
        self.reg_loss = []
        self._batch_bpr_loss = []
        self._batch_reg_loss = []
    
    def update(self):
        _, out = model(edge_index)
        user_embed, item_embed = torch.split(out, (self.model.n_users, self.model.n_items))
        
        ### Metrics
        relevance_score = torch.matmul(user_embed, torch.transpose(item_embed, 0, 1))
        # get all user-item interactions (ground truth)
        i = torch.stack([
            torch.LongTensor(train_df['uid'].values),
            torch.LongTensor(train_df['pid'].values),
        ])
        v = torch.ones(len(train_df), dtype=torch.float64)
        t_interactions = torch.sparse.FloatTensor(i, v, (self.model.n_users, self.model.n_items)).to_dense().to(device)
        # mask out training user-item interactions from metric computation
        # We are only interested in novel items, as a user won't be interested
        # in "voting again"
        msk_relevance_score = torch.mul(relevance_score, (1 - t_interactions))
        
        # Nevertheless, to test training accuracy, we will check if the suggested interactions
        # are correct, even if they already appeared
        
        train_interacted_items = train_df.groupby('uid')['pid'].apply(list).reset_index()
        test_interacted_items  =  test_df.groupby('uid')['pid'].apply(list).reset_index()
        
        for K in self.Ks:
            topk_relevance_indices_msk = torch.topk(msk_relevance_score, K).indices
            topk_relevance_indices = torch.topk(relevance_score, K).indices
            
            def _userPrecRec(user, scores):
                # assert len(topk_relevance_indices[user['uid']]) == K
                
                # The .tolist() was more important than I thought T.T
                common = set(user['pid']).intersection(scores[user['uid']].tolist())
                # print("##########")
                # print(user['uid'])
                # print(user['pid'])
                # print(topk_relevance_indices[user['uid']])
                ncommon = len(common)
                
                # user['precision'] = ncommon / K
                user['precision'] = ncommon / min(K, len(user['pid']))
                user['recall'] = ncommon / len(user['pid'])
                
                return user
            
            train_interacted_k = train_interacted_items.apply(_userPrecRec, args=(topk_relevance_indices,), axis=1)
            interacted_k = test_interacted_items.apply(_userPrecRec, args=(topk_relevance_indices_msk,), axis=1)
            
            self.train_precision[K].append(train_interacted_k['precision'].mean())
            self.train_recall[K].append(train_interacted_k['recall'].mean())
            self.precision[K].append(interacted_k['precision'].mean())
            self.recall[K].append(interacted_k['recall'].mean())
            
            """
            ############
            relevance_score = torch.matmul(user_embed, torch.transpose(item_embed,0, 1))

            # create dense tensor of all user-item interactions
            i = torch.stack((
                torch.LongTensor(train_df['uid'].values),
                torch.LongTensor(train_df['pid'].values)
            ))
            v = torch.ones((len(train_df)), dtype=torch.float64)
            interactions_t = torch.sparse.FloatTensor(i, v, (self.model.n_users, self.model.n_items))\
                  .to_dense().to(device)

            # mask out training user-item interactions from metric computation
            relevance_score = torch.mul(relevance_score, (1 - interactions_t))

            # compute top scoring items for each user
            topk_relevance_indices = torch.topk(relevance_score, K).indices
            topk_relevance_indices_df = pd.DataFrame(topk_relevance_indices.cpu().numpy(),columns =['top_indx_'+str(x+1) for x in range(K)])
            topk_relevance_indices_df['user_ID'] = topk_relevance_indices_df.index
            topk_relevance_indices_df['top_rlvnt_itm'] = topk_relevance_indices_df[['top_indx_'+str(x+1) for x in range(K)]].values.tolist()
            topk_relevance_indices_df = topk_relevance_indices_df[['user_ID','top_rlvnt_itm']]

            # measure overlap between recommended (top-scoring) and held-out user-item
            # interactions
            test_interacted_items = test_df.groupby('uid')['pid'].apply(list).reset_index()
            metrics_df = pd.merge(test_interacted_items,topk_relevance_indices_df, how= 'left', left_on = 'uid',right_on = ['user_ID'])
            metrics_df['intrsctn_itm'] = [list(set(a).intersection(b)) for a, b in zip(metrics_df['pid'], metrics_df.top_rlvnt_itm)]

            metrics_df['recall'] = metrics_df.apply(lambda x : len(x['intrsctn_itm'])/len(x['pid']), axis = 1)
            metrics_df['precision'] = metrics_df.apply(lambda x : len(x['intrsctn_itm'])/K, axis = 1)
            
            _assert_eq(metrics_df['precision'].mean(), interacted_k['precision'].mean())
            # assert interacted_k['recall'].mean() == metrics_df['recall'].mean()
            
            ###
            relevance_score = torch.matmul(user_embed, torch.transpose(item_embed,0, 1))

            # create dense tensor of all user-item interactions
            i = torch.stack((
                torch.LongTensor(train_df['uid'].values),
                torch.LongTensor(train_df['pid'].values)
            ))
            v = torch.ones((len(train_df)), dtype=torch.float64)
            interactions_t = torch.sparse.FloatTensor(i, v, (self.model.n_users, self.model.n_items))\
                  .to_dense().to(device)

            # mask out training user-item interactions from metric computation
            relevance_score = torch.mul(relevance_score, (1 - interactions_t))

            # compute top scoring items for each user
            topk_relevance_indices = torch.topk(relevance_score, K).indices
            topk_relevance_indices_df = pd.DataFrame(topk_relevance_indices.cpu().numpy(),columns =['top_indx_'+str(x+1) for x in range(K)])
            topk_relevance_indices_df['user_ID'] = topk_relevance_indices_df.index
            topk_relevance_indices_df['top_rlvnt_itm'] = topk_relevance_indices_df[['top_indx_'+str(x+1) for x in range(K)]].values.tolist()
            topk_relevance_indices_df = topk_relevance_indices_df[['user_ID','top_rlvnt_itm']]

            # measure overlap between recommended (top-scoring) and held-out user-item
            # interactions
            test_interacted_items = train_df.groupby('uid')['pid'].apply(list).reset_index()
            metrics_df = pd.merge(test_interacted_items,topk_relevance_indices_df, how= 'left', left_on = 'uid',right_on = ['user_ID'])
            metrics_df['intrsctn_itm'] = [list(set(a).intersection(b)) for a, b in zip(metrics_df['pid'], metrics_df.top_rlvnt_itm)]

            metrics_df['recall'] = metrics_df.apply(lambda x : len(x['intrsctn_itm'])/len(x['pid']), axis = 1)
            metrics_df['precision'] = metrics_df.apply(lambda x : len(x['intrsctn_itm'])/K, axis = 1)
            
            # print(metrics_df['precision'].describe())
            # print(metrics_df['recall'].describe())
            # print(train_interacted_k)
            # print(metrics_df)
            # assert train_interacted_k['precision'].equals(metrics_df['precision'])
            # assert train_interacted_k['recall'].equals(metrics_df['recall'])
            ######
            """
        
        self.bpr_loss.append(np.mean(self._batch_bpr_loss))
        self.reg_loss.append(np.mean(self._batch_reg_loss))
        self._batch_bpr_loss = []
        self._batch_reg_loss = []
        
        self.epochs += 1
        
    def compute_loss(self, users, pos_items, neg_items):
        emb, out = model(edge_index)
        
        users_emb = out[users]
        pos_emb = out[pos_items]
        neg_emb = out[neg_items]
        userEmb0 = emb[users]
        posEmb0 = emb[pos_items]
        negEmb0 = emb[neg_items]
        
        # compute loss from initial embeddings, used for regulization
        reg_loss = (1 / 2) * (
            userEmb0.norm().pow(2) + 
            posEmb0.norm().pow(2)  +
            negEmb0.norm().pow(2)
        ) / float(len(users))

        # compute BPR loss from user, positive item, and negative item embeddings
        pos_scores = torch.mul(users_emb, pos_emb).sum(dim=1)
        neg_scores = torch.mul(users_emb, neg_emb).sum(dim=1)

        bpr_loss = torch.mean(F.softplus(neg_scores - pos_scores))
        
        reg_loss = DECAY * reg_loss
        final_loss = bpr_loss + reg_loss
        
        final_loss.backward()
        
        self._batch_bpr_loss.append(float(bpr_loss))
        self._batch_reg_loss.append(float(reg_loss))
        
        self.batches += 1
        
    def plot_loss(self):
        fig = px.line(metrics.bpr_loss, title=f'BPR loss of model', log_y=True)
        fig.update_layout(
            xaxis_title='epoch',
            yaxis_title='loss',
        )
        return fig
    
    def plot_precision(self, recall=False, train=False):
        df = pd.DataFrame(self.precision).rename(columns='precision@{}'.format)
        
        fig = px.line(df)
        fig.update_layout(
            xaxis_title='epochs',
            yaxis_title='value',
        )
        
        colors = [d.line.color for d in fig.data]
        
        if recall:
            for c, (k,v) in zip(colors, self.recall.items()):
                fig.add_trace(go.Scatter(x=np.arange(len(v)), y=v, name=f'recall@{k}', mode='lines', line={'dash':'dash', 'color': c}))
        
        if train:
            for c, (k,v) in zip(colors, self.train_precision.items()):
                fig.add_trace(go.Scatter(x=np.arange(len(v)), y=v, name=f'ptrain@{k}', mode='lines', line={'dash':'dot'}))
        
        return fig
    
    def plot_recall(self):
        df = pd.DataFrame(self.recall).rename(columns='recall@{}'.format)
        fig = px.line(df)
        fig.update_layout(
            xaxis_title='epochs',
            yaxis_title='value',
        )
        return fig
    
    def __repr__(self):
        return str(self.__dict__)

In [18]:
def batcher(df, batch_size):
    return (df[pos:pos+batch_size] for pos in range(0, len(df), batch_size))

def train_eval(model, optimizer, train_df):
    metrics = Metrics(model)
    
    try:
        for epoch in trange(EPOCHS):
            model.train()
            for batch in batcher(train_df, BATCH_SIZE):
                optimizer.zero_grad()
                users, pos_items, neg_items = batch_sample(train_df, BATCH_SIZE, model.n_users, model.n_items)

                # Inside this, there is the loss.backwards() method
                # Perhaps we should uncouple metrics and loss calculation
                # even though we store the loss in an array to print it later
                metrics.compute_loss(users, pos_items, neg_items)
                optimizer.step()

            model.eval()
            with torch.no_grad():
                metrics.update()
                print(metrics.bpr_loss[-1], metrics.train_precision[6][-1], metrics.precision[6][-1])
    except KeyboardInterrupt:
        pass
        
    return metrics
        
metrics = train_eval(model, optimizer, train_df)
display(metrics.plot_loss())
display(metrics.plot_precision(train=True))

  0%|          | 0/50 [00:00<?, ?it/s]

0.6818640776036025 0.7334935897435897 0.1048076923076923
0.6720538756841252 0.7157051282051282 0.21009615384615385
0.6516615742276444 0.5897435897435896 0.22211538461538463
0.6168976024674266 0.5334935897435897 0.23653846153846156
0.5687947711478775 0.4772435897435897 0.24134615384615385
0.5103954447572213 0.4463141025641026 0.23461538461538461
0.45459153581707523 0.4115384615384616 0.241025641025641
0.4008618189467256 0.4016025641025641 0.24423076923076922
0.3486962034187464 0.3913461538461539 0.24423076923076922
0.3076630240847335 0.3806089743589744 0.24775641025641024
0.27051998339689176 0.38221153846153855 0.2557692307692308
0.23956482598898038 0.38461538461538464 0.2557692307692308
0.21150586717692624 0.3814102564102564 0.25416666666666665
0.18949719855518757 0.3798076923076923 0.2557692307692308
0.16702251212915603 0.37820512820512825 0.2525641025641026
0.15308056124378291 0.37259615384615385 0.2525641025641026
0.13656924905346415 0.37580128205128205 0.25096153846153846
0.1264196

# Using all of this

Crearé una función que reciba una dirección de un usuario y retorne k propuestas que puedan interesarle

In [19]:
def recommend(user: str, K: int = 12, ignore_train: bool=False):
    uid = encoder_user.transform([user])[0]
    print(f"Recommending {K} proposals for user {user} (uid:{uid}) with {vpu.at[user]} votes")
    
    # Getting embedding
    _, out = model(edge_index)
    user_embed, item_embed = torch.split(out, (model.n_users, model.n_items))
    relevance_score = torch.matmul(user_embed, torch.transpose(item_embed, 0, 1))
    if ignore_train:
        i = torch.stack([
            torch.LongTensor(train_df['uid'].values),
            torch.LongTensor(train_df['pid'].values),
        ])
        v = torch.ones(len(train_df), dtype=torch.float64)
        t_interactions = torch.sparse.FloatTensor(i, v, (model.n_users, model.n_items)).to_dense().to(device)
        # mask out training user-item interactions from metric computation
        # We are only interested in novel items, as a user won't be interested
        # in "voting again"
        relevance_score = torch.mul(relevance_score, (1 - t_interactions))
    
    topk_relevance_indices = torch.topk(relevance_score, K).indices
    
    pids = topk_relevance_indices[uid].tolist()
    proposals = dfp.loc[encoder_prop.inverse_transform(pids)]
    
    proposals['userVoted'] = dfv.groupby('proposal')['voter'].apply(lambda x: user in set(x))
    
    print(f"precision@{K}={sum(proposals['userVoted'])/len(proposals)*100:.2f}%")
    
    return proposals

user = "0x334f12afb7d8740868be04719639616533075234" # vpu[(12 < vpu) & (vpu < 38)].sample().index[0]
recommend(user, ignore_train=True)[['network', 'createdAt', 'title', 'description', 'userVoted']]

Recommending 12 proposals for user 0x334f12afb7d8740868be04719639616533075234 (uid:18) with 35 votes
precision@12=16.67%


Unnamed: 0_level_0,network,createdAt,title,description,userVoted
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0x52e92a9757df27398403eeced20ee15ae0d5f41f7ea0f956d728d62a7b3f07c5,mainnet,2019-09-16 17:40:16,BC-DAPP Milestones 1 & 2 (dOrg Payout),BC-DAPP Proposal\nThis proposal is to signal a...,False
0xb92d2df99a47244c07a9d7ef73530c273f1d65230dbff9e95873d82c0314534e,mainnet,2019-07-14 14:48:30,Give the power to me,I will make the dxDAO great again!,False
0x59fc60a67b7e89175815610632b9f660a4bbb1e6f40f30dc64590abb5bd7af5e,mainnet,2019-09-16 17:50:28,BC-DAPP Milestones 1 & 2 (Corkus payout),BC-DAPP Proposal\nThis proposal is to signal a...,False
0xb53bf37ddc953a6b764c7b6fd6b058765b9494a17dffc92068229c746eb1cf72,mainnet,2019-07-25 18:32:21,De-whitelist POLY from MGN generating tokens,I am offering de-whitelist POLY from MGN gener...,False
0x74cbda473a10059324404c16da7ebfb95926da34354607fc89ae2629942e2a24,mainnet,2019-08-13 18:18:06,Add Wrapped BTC - WBTC to whitelist for MGN ge...,Add Wrapped BTC - WBTC to whitelist for MGN ge...,True
0xc343060bc1a1d6669d24875f0998e657ed1ef57cae42edec3971ec657c755324,mainnet,2019-07-31 04:55:17,Support Registering dutchx.eth with ENS,To deploy the DutchX in a fully decentralized ...,False
0xeb9cf2b3d76664dc1e983137f33b2400ad11966b1d79399d7ca55c25ad6283fa,mainnet,2020-04-17 19:08:38,Finalizing Fundraising Configuration,"Hello community,\n\nThis signal proposal ratif...",False
0x2bc422bab43d296c6494dcd78bc28a3b10074de875817339efa569af4c383441,mainnet,2019-07-25 17:18:45,Whitelist Compound's cDAI,cDAI is compound's tokenized interest bearing ...,True
0xb804dbb6ba463b704c5be53c3be122e0f9fd72ebd1d952eb51e74db99a2d7f78,mainnet,2019-07-26 17:47:12,Support Registering dxdao.eth with ENS,dxDAO requires a domain name on a decentralize...,False
0x8a02cf636fc646b82a3e2f5d9c8257a635be60607552f7b5308750afc3b1b1e0,mainnet,2019-07-16 23:51:56,Slashing Loopring Reputation,"There has been discussion around ""what to do 0...",False


In [20]:
dfv[dfv['proposal'] == '0xb92d2df99a47244c07a9d7ef73530c273f1d65230dbff9e95873d82c0314534e']

Unnamed: 0_level_0,network,createdAt,voter,outcome,reputation,dao,proposal,random
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0x474e35da8c23aa146bb34356f22e5ca84a56e625b21d02fcee44279990fe4510,mainnet,2019-08-07 19:32:50,0x93d29542401c00f1431fd1c80b634697e5645c59,Fail,2281650741463878942454,0x519b70055af55a007110b4ff99b0ea33071c720a,0xb92d2df99a47244c07a9d7ef73530c273f1d65230dbf...,0.982379
0x4a745d3cc92c6dacb724dae10b2752123023bd5f52ca417165f69d61c8d878b0,mainnet,2019-08-18 08:35:19,0x3dad32f81f5dc35d961d3da77d32a0a268b8db44,Fail,922534080553417311929,0x519b70055af55a007110b4ff99b0ea33071c720a,0xb92d2df99a47244c07a9d7ef73530c273f1d65230dbf...,0.809874
0x66d5e3bc30b94b8f8a97b33d6ee20571d0b22b5577692b3e0b8c6c2ae4cfa11c,mainnet,2019-07-15 13:37:44,0x730fd267ef60b27615324b94bf0bc7ed15d52718,Fail,74703181293782997401422,0x519b70055af55a007110b4ff99b0ea33071c720a,0xb92d2df99a47244c07a9d7ef73530c273f1d65230dbf...,0.819102
0xb6d9cd3f7da07e1bb01f4cf1b0caa3bf714fa86f67bbcf72778c15d9fcf9c652,mainnet,2019-07-17 10:04:32,0x3efd3391a0601eaa093647f911c653d77c11e3fd,Fail,183008951067523277033,0x519b70055af55a007110b4ff99b0ea33071c720a,0xb92d2df99a47244c07a9d7ef73530c273f1d65230dbf...,0.23918
0xbf412f940e987ea402786e576e194a16818f40deee6c11c79516b3970cb560ba,mainnet,2019-08-09 00:15:49,0x6dd5f1bf1ffa6a3173e333c1588f4cdde8c6799e,Fail,664270618815092643957,0x519b70055af55a007110b4ff99b0ea33071c720a,0xb92d2df99a47244c07a9d7ef73530c273f1d65230dbf...,0.680228
0xc178fee0b0a49fa090b05b908c326e6ae52f2585c09b2ba353138935dbce8a01,mainnet,2019-07-16 22:45:32,0x6651a0a95e7e19c13dd94cab16c91c201337b56a,Fail,3272713877515929381,0x519b70055af55a007110b4ff99b0ea33071c720a,0xb92d2df99a47244c07a9d7ef73530c273f1d65230dbf...,0.769223
0xd16a46283e4a2d696f9c5f33f979f1f6b742dee8f96f8d95a298860099f71375,mainnet,2019-07-25 10:17:39,0xd7fe300587d41ed0e8b6a2bed5a1b2bb4fcdad9e,Fail,1830223736408604446481,0x519b70055af55a007110b4ff99b0ea33071c720a,0xb92d2df99a47244c07a9d7ef73530c273f1d65230dbf...,0.436851
0xe0502149219a4071d10c81ed3ac75602602333d5279f8b5b9c1d9b95013a75b9,mainnet,2019-07-18 21:14:38,0xd65478656497b3388c2c930de3bc48ac0688039d,Fail,117751629644545480084,0x519b70055af55a007110b4ff99b0ea33071c720a,0xb92d2df99a47244c07a9d7ef73530c273f1d65230dbf...,0.440323
0xe638fe0187bf3ab3e94b64d4b94de090533839436ebe767db4372bb51f00c366,mainnet,2019-07-14 22:29:44,0xe858a4bf603995a9156edbd25ff06269d997839e,Fail,26212607727907855277820,0x519b70055af55a007110b4ff99b0ea33071c720a,0xb92d2df99a47244c07a9d7ef73530c273f1d65230dbf...,0.694151
0xeb7bf80684ebc666ac5d1279e5ac4dce6e09b32056f170f4141a8ab6c27120b1,mainnet,2019-07-15 14:16:33,0xc4d9d1a93068d311ab18e988244123430eb4f1cd,Fail,2722407747051322830,0x519b70055af55a007110b4ff99b0ea33071c720a,0xb92d2df99a47244c07a9d7ef73530c273f1d65230dbf...,0.743193
