In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline

In [2]:
# this is for fetching token name later on
import json
from urllib.request import urlopen

def get_token_names(token_ids, timeout=10):
    """Look up the display name of each token id through the Hedera mirror-node API.

    Parameters
    ----------
    token_ids : iterable of str
        Token ids such as ``"0.0.6173820"``.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    list of str
        One entry per input token id, in the same order. An entry is the
        token's ``name`` field, ``"No name found"`` when the response has no
        such field, ``"Unable to fetch data"`` for a non-200 response, or
        ``"Error: <reason>"`` when the request itself failed.
    """
    res = []
    for token_id in token_ids:
        url = f"https://mainnet-public.mirrornode.hedera.com/api/v1/tokens/{token_id}"
        try:
            with urlopen(url, timeout=timeout) as response:
                if response.status == 200:
                    data = json.loads(response.read().decode())
                    res.append(data.get("name", "No name found"))
                else:
                    # Record a placeholder so the result stays aligned with
                    # token_ids instead of aborting the whole batch.
                    res.append("Unable to fetch data")
        except Exception as e:
            # Best-effort lookup: one failing token must not discard the
            # names that were already fetched.
            res.append(f"Error: {e}")
    return res

In [3]:
# Load the account/token table from data.csv. As displayed below, each row is
# an account ("account id" column) and every other column is a token id.
df = pd.read_csv("data.csv")

In [4]:
# Display the loaded frame (pandas truncates the middle rows and columns).
df

Unnamed: 0,account id,0.0.475254,0.0.475255,0.0.475256,0.0.488087,0.0.495216,0.0.497204,0.0.505777,0.0.506899,0.0.508627,...,0.0.6399048,0.0.6413488,0.0.6446451,0.0.6453983,0.0.6562546,0.0.6605459,0.0.6688389,0.0.6689352,0.0.6712571,0.0.6739638
0,0.0.471869,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0.471901,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.0.471903,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.0.471926,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.0.471938,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15755,0.0.3700795,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15756,0.0.5194749,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15757,0.0.5167544,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15758,0.0.4552618,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Count missing values per column to confirm the table needs no imputation.
print(df.isnull().sum())

account id     0
0.0.475254     0
0.0.475255     0
0.0.475256     0
0.0.488087     0
              ..
0.0.6605459    0
0.0.6688389    0
0.0.6689352    0
0.0.6712571    0
0.0.6739638    0
Length: 1405, dtype: int64


In [6]:
def associationRate(col, tokenCol):
    """Return the share of ``tokenCol`` holders that also hold ``col``.

    Parameters
    ----------
    col : pandas.Series
        Per-account values for the candidate token.
    tokenCol : pandas.Series
        Per-account values for the reference token.

    Returns
    -------
    float or int
        Number of accounts where both columns are > 0 divided by the number of
        non-zero entries in ``tokenCol``; ``0`` when ``tokenCol`` has no
        non-zero entries.
    """
    token_holders = np.count_nonzero(tokenCol)
    if token_holders == 0:
        # Nobody holds the reference token, so avoid dividing by zero.
        return 0

    shared_holders = np.count_nonzero((col > 0) & (tokenCol > 0))
    return shared_holders / token_holders


def associationRateDF(df, tokenCol):
    """Find the association rate of a given token with every token column.

    Parameters
    ----------
    df : pandas.DataFrame
        Account-by-token table. An ``"account id"`` column, if present, is
        ignored; every remaining column is treated as a token.
    tokenCol : pandas.Series
        Per-account values for the reference token.

    Returns
    -------
    pandas.Series
        Association rate per token column, indexed by column label.
    """
    # Select token columns by name rather than by position: a positional
    # ``iloc[:, 2:]`` slice silently dropped the first token column and broke
    # when "account id" had been moved into the index.
    token_cols = df.drop(columns=["account id"], errors="ignore")
    association_rate_df = token_cols.apply(lambda col: associationRate(col, tokenCol))

    return association_rate_df

In [7]:
# Reference token whose holders the association rates are measured against.
tokenCol = df["0.0.911740"]

In [8]:
# Association rate of the token columns of df against the reference token.
token1_association = associationRateDF(df, tokenCol)

In [9]:
# Five highest association rates; the reference token itself scores 1.0.
token1_association.nlargest(5)

0.0.911740     1.000000
0.0.2173899    0.578313
0.0.1234197    0.506024
0.0.1099364    0.493976
0.0.1235089    0.469880
dtype: float64

In [10]:
# Build the token-by-account matrix (rows = token ids, columns = account ids).
# Moving "account id" into the index *before* transposing keeps the counts
# numeric; transposing the mixed string/int frame and then stripping the
# header row would leave every value as object dtype.
tt_header = df["account id"]  # account ids, kept under the original name
tt_df = df.set_index("account id").T
tt_df

account id,0.0.471869,0.0.471901,0.0.471903,0.0.471926,0.0.471938,0.0.471946,0.0.471952,0.0.471956,0.0.471957,0.0.471973,...,0.0.5025092,0.0.6282812,0.0.360309,0.0.4968728,0.0.6108682,0.0.3700795,0.0.5194749,0.0.5167544,0.0.4552618,0.0.5723171
0.0.475254,1,1,1,1,1,1,1,1,1,8,...,0,0,0,0,0,0,0,0,0,0
0.0.475255,1,1,1,0,0,1,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
0.0.475256,1,1,1,0,0,1,1,0,1,10,...,0,0,0,0,0,0,0,0,0,0
0.0.488087,1,1,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
0.0.495216,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0.0.6605459,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0.0.6688389,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0.0.6689352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0.0.6712571,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Pairwise cosine similarity between the rows of tt_df (one row per token),
# giving a square token-by-token matrix.
cos_sim_tokens = cosine_similarity(tt_df)
# Label both axes with the token ids so similarities can be looked up by id.
df_sim_tokens = pd.DataFrame(cos_sim_tokens, index=tt_df.index, columns=tt_df.index)
df_sim_tokens

Unnamed: 0,0.0.475254,0.0.475255,0.0.475256,0.0.488087,0.0.495216,0.0.497204,0.0.505777,0.0.506899,0.0.508627,0.0.509606,...,0.0.6399048,0.0.6413488,0.0.6446451,0.0.6453983,0.0.6562546,0.0.6605459,0.0.6688389,0.0.6689352,0.0.6712571,0.0.6739638
0.0.475254,1.000000,0.643308,0.467065,0.657447,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
0.0.475255,0.643308,1.000000,0.438236,0.902272,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
0.0.475256,0.467065,0.438236,1.000000,0.406811,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
0.0.488087,0.657447,0.902272,0.406811,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
0.0.495216,0.000000,0.000000,0.000000,0.000000,1.000000,0.045277,0.082729,0.055317,0.071815,0.063554,...,0.000000,0.007448,0.037443,0.014568,0.043549,0.059009,0.000574,0.000521,0.008950,0.008921
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0.0.6605459,0.000000,0.000000,0.000000,0.000000,0.059009,0.000000,0.000000,0.000000,0.005384,0.091326,...,0.004059,0.007205,0.016662,0.144441,0.196630,1.000000,0.003464,0.003146,0.001715,0.484561
0.0.6688389,0.000000,0.000000,0.000000,0.000000,0.000574,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.004441,0.016177,0.007292,0.000000,0.010378,0.003464,1.000000,0.999305,0.002251,0.000000
0.0.6689352,0.000000,0.000000,0.000000,0.000000,0.000521,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.004034,0.014696,0.006624,0.000000,0.009428,0.003146,0.999305,1.000000,0.002045,0.000000
0.0.6712571,0.000000,0.000000,0.000000,0.000000,0.008950,0.000000,0.000000,0.061208,0.004083,0.000000,...,0.003078,0.025894,0.006318,0.057150,0.092916,0.001715,0.002251,0.002045,1.000000,0.009721


In [16]:
# Account-indexed copy of the table: rows are labelled by account id and every
# column is a token. set_index returns a new frame, so `df` keeps its
# "account id" column and this cell can be re-run safely.
clean_df = df.set_index('account id')
clean_df.index.name = None  # drop the "account id" label from the index
clean_df

Unnamed: 0,0.0.475254,0.0.475255,0.0.475256,0.0.488087,0.0.495216,0.0.497204,0.0.505777,0.0.506899,0.0.508627,0.0.509606,...,0.0.6399048,0.0.6413488,0.0.6446451,0.0.6453983,0.0.6562546,0.0.6605459,0.0.6688389,0.0.6689352,0.0.6712571,0.0.6739638
0.0.471869,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0.0.471901,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0.0.471903,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0.0.471926,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0.0.471938,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0.0.3700795,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0.0.5194749,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0.0.5167544,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0.0.4552618,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
def recommend_similar_tokens(account_id, data, cos_sim_matrix, N=5):
    """Recommend up to N tokens the account does not hold yet.

    Each token the account does not hold is scored by summing its similarity
    to every token the account does hold; the highest-scoring tokens win.

    Parameters
    ----------
    account_id : label
        Row label of the account in ``data``.
    data : pandas.DataFrame
        Account-by-token table; a value > 0 means the account holds the token.
    cos_sim_matrix : pandas.DataFrame
        Token-by-token similarity matrix labelled by token on both axes.
    N : int, optional
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Token labels ordered from highest to lowest accumulated score.
    """
    # The account's row is loop-invariant, so read it once.
    holdings = data.loc[account_id]
    held_tokens = data.columns[holdings > 0]

    scores = {}
    for held in held_tokens:
        # Walk the neighbours of this held token from most to least similar.
        ranked = cos_sim_matrix[held].sort_values(ascending=False)
        for candidate, sim in ranked.items():
            # Only tokens the account does not already hold can be recommended.
            if not (holdings.loc[candidate] > 0):
                scores[candidate] = scores.get(candidate, 0) + sim

    # Stable descending sort: equal scores keep first-seen order.
    ranking = sorted(scores, key=scores.get, reverse=True)
    return ranking[:N]

In [21]:
# Example run: recommend 8 tokens for one account and print them.
account_id3 = '0.0.3700795'
recommended_tokens3 = recommend_similar_tokens(account_id3, clean_df, df_sim_tokens, N=8)
# get_token_names issues one HTTP request per recommended token to resolve its name.
print(" Hello", account_id3, "- based on your current inventory you should check out these projects:\n", 'Token IDs:', recommended_tokens3, '\n', 'NFT Name', get_token_names(recommended_tokens3))

 Hello 0.0.3700795 - based on your current inventory you should check out these projects:
 Token IDs: ['0.0.6173820', '0.0.4970613', '0.0.5023918', '0.0.1783975', '0.0.6446451', '0.0.5368710', '0.0.4154809', '0.0.5335221'] 
 NFT Name ['Bonzo Singularity Collectibles', 'Concierge Collectibles', 'VIPeep Card', 'Degenlands', 'NFT Trading Bot AI', 'Peepspired Art Collection', 'Sphera Amateur', 'PeepScript Gold']
