# Hedera NFT Recommendation System data gathering
DSC 478 Hanlon & Ustura (Data last pulled from the Hedera Mirror Node API on 11/17/2023)

In [1]:
import pandas as pd
import numpy as np
import json

Read in data from gather.js output files and save a file of unique accounts

In [2]:
accounts = {}

chunk_size = 10000

# Read the file in chunks and accumulate, per account, the total balance held
# for each token. The three columns are zipped directly rather than iterated
# with iterrows(), which builds a Series for every row and is very slow on a
# file of this size; the resulting dict is the same.
# NOTE(review): token ids are stored exactly as parsed. The printed sample for
# account 0.0.509323 suggests they keep a leading space, which is why later
# lookups call .strip() on them.
balance_chunks = pd.read_csv('balances.txt', chunksize=chunk_size, header=None, names=['account_id', 'token_id', 'balance'])
for chunk in balance_chunks:
    for account_id, token_id, balance in zip(chunk['account_id'], chunk['token_id'], chunk['balance']):
        holdings = accounts.setdefault(account_id, {})
        holdings[token_id] = holdings.get(token_id, 0) + balance

print('Unique Accounts:', len(accounts))

# Save the unique account ids to a file, one id per line
with open('unique_accounts.txt', 'w') as file:
    for account_id in accounts:
        file.write(account_id + '\n')

Unique Accounts: 632203


Load all qualifying tokens from json file

In [3]:
# Read the qualifying token ids from the JSON file into `tokens`.
with open('all_tokens.json', 'r') as tokens_fh:
    tokens = json.loads(tokens_fh.read())

token_count = len(tokens)
print('Unique Tokens:', token_count)

Unique Tokens: 1404


Testing an account against the network to make sure data is accurate

In [4]:
# Spot-check one known account: print how many distinct tokens it holds and
# the aggregated balance of each, so the values can be compared by hand
# against the network.
my_account = accounts['0.0.509323']
print(len(my_account))
for held_token in my_account:
    print(f"Token ID: {held_token}, Balance: {my_account[held_token]}")

8
Token ID:  0.0.511406, Balance: 39
Token ID:  0.0.511414, Balance: 32
Token ID:  0.0.562847, Balance: 13
Token ID:  0.0.563000, Balance: 13
Token ID:  0.0.600311, Balance: 32
Token ID:  0.0.600321, Balance: 30
Token ID:  0.0.911740, Balance: 654
Token ID:  0.0.1099364, Balance: 37


Drop accounts that own fewer than 5 unique tokens, or whose balance for any single token is 500 or more

In [5]:
# Filtering thresholds, named so they are easy to find and tune.
MIN_UNIQUE_TOKENS = 5   # accounts holding fewer distinct tokens are dropped
BALANCE_CUTOFF = 500    # accounts with any single balance >= this are dropped

# Pass 1: drop accounts that hold too few distinct tokens.
# The ids are collected first because a dict cannot be resized while it is
# being iterated.
to_delete = [
    account
    for account, holdings in accounts.items()
    if len(holdings) < MIN_UNIQUE_TOKENS
]
for acct_to_del in to_delete:
    del accounts[acct_to_del]

# Pass 2: among the survivors, drop accounts with at least one balance at or
# above the cutoff. any() stops at the first offending balance, so each
# account is recorded once without a linear "already in the list?" scan.
to_delete_2 = [
    account
    for account, holdings in accounts.items()
    if any(bal >= BALANCE_CUTOFF for bal in holdings.values())
]
for acct_to_del2 in to_delete_2:
    del accounts[acct_to_del2]

print('Unique Accounts:', len(accounts))

Unique Accounts: 15760


Prepend the 'account id' column name to the token list and check the result

In [6]:
# Prepend the name of the account-id column so `tokens` doubles as the csv
# header. The guard keeps the cell idempotent: without it, re-running the cell
# would insert the name a second time and shift every token column by one.
if not tokens or tokens[0] != 'account id':
    tokens.insert(0, 'account id')
print(len(tokens))
print(tokens[:3])


1405
['account id', '0.0.475254', '0.0.475255']


Write header to csv

In [7]:
# Build an empty frame whose columns are the header names and write it out,
# so data.csv starts with just the header row.
tokens_df = pd.DataFrame(columns=tokens)
tokens_df.to_csv('data.csv', index=False, header=True)
tokens_df

Unnamed: 0,account id,0.0.475254,0.0.475255,0.0.475256,0.0.488087,0.0.495216,0.0.497204,0.0.505777,0.0.506899,0.0.508627,...,0.0.6399048,0.0.6413488,0.0.6446451,0.0.6453983,0.0.6562546,0.0.6605459,0.0.6688389,0.0.6689352,0.0.6712571,0.0.6739638


Append one row per account to the csv file: the account id followed by that account's balance for each token (0 where the token is not held)

In [8]:
# Map every header entry to its column position once, instead of running a
# linear tokens.index() scan for each balance. setdefault keeps the first
# position if a name repeats, matching what tokens.index() returned.
token_columns = {}
for col_idx, col_name in enumerate(tokens):
    token_columns.setdefault(col_name, col_idx)

# Build one row per account: column 0 is the account id, every other column
# is that account's balance for the token in that position (0 if not held).
rows = []
for account, holdings in accounts.items():
    row = [0] * len(tokens)
    row[0] = account
    for token_id, balance in holdings.items():
        # Token ids are stripped before lookup, as in the original cell.
        token_idx = token_columns.get(token_id.strip())
        if token_idx is None:
            # Fail loudly, as tokens.index() did, rather than drop a balance.
            raise ValueError(f"token {token_id!r} held by account {account} is not in tokens")
        row[token_idx] = balance
    rows.append(row)

# Append all rows below the existing header in a single write rather than
# reopening data.csv once per account. dtype=object keeps each value exactly
# as stored, so it is written the same way as before.
if rows:
    pd.DataFrame(rows, dtype=object).to_csv('data.csv', mode='a', header=False, index=False)

data.csv now contains our final structured, filtered, and cleaned data