In [2]:
%matplotlib qt
import pandas as pd
import json
import numpy as np
import os
import re
import time
from matplotlib import pyplot as plt
import ast
from PIL import Image
from datetime import datetime, timedelta
from collections import defaultdict
import requests
from collections import defaultdict
from datetime import datetime, timedelta

In [None]:
# Base location of the input/output files. The value is prepended verbatim to file
# names (e.g. f'{DIR}final_cleaned_df.csv'), so a non-empty value must end with a
# path separator; the empty string means "current working directory".
DIR = ''

In [3]:
# Read the cleaned dataset CSV located under DIR into a DataFrame
dataset_path = f'{DIR}final_cleaned_df.csv'
df = pd.read_csv(dataset_path)

In [7]:
# Load transactions: every JSON file in the transactions folder becomes one entry,
# keyed by the file name without its '.json' extension.
directory = f'{DIR}transactions/'
address_to_events = {}
for filename in sorted(os.listdir(directory)):  # sorted -> deterministic processing order
    if filename == 'non_existent_accounts.json':
        continue
    # Ignore anything that is not a JSON file (hidden files, checkpoint folders, ...);
    # slicing off the last five characters would otherwise build a wrong key or crash.
    file_stem, extension = os.path.splitext(filename)
    if extension.lower() != '.json':
        continue
    file_path = os.path.join(directory, filename)
    with open(file_path, "r") as json_file:
        address_to_events[file_stem] = json.load(json_file)

In [8]:
# Keep only the addresses that have at least one recorded event
new_dict = {
    addr: addr_events
    for addr, addr_events in address_to_events.items()
    if len(addr_events) != 0
}

In [37]:
def _nft_fields(event, tz=None):
    """Return the NFT and timestamp fields shared by cleaned transfer and sale events."""
    nft = event['nft']
    return {
        'nft_identifier': nft.get('identifier'),
        'nft_collection': nft.get('collection'),
        'nft_contract': nft.get('contract'),
        'nft_name': nft.get('name'),
        'nft_description': nft.get('description'),
        'quantity': nft.get('quantity', 1),  # Include quantity with default value 1
        # tz=None makes fromtimestamp use the machine's local timezone
        'datetime': datetime.fromtimestamp(event['event_timestamp'], tz).strftime('%Y-%m-%d %H:%M:%S'),
    }


def _clean_event(event, tz=None):
    """Return the cleaned form of one raw event, or None when the event must be skipped."""
    event_type = event['event_type']

    if event_type == 'transfer':
        if 'from_address' not in event or 'to_address' not in event or 'nft' not in event:
            return None  # Skip invalid transfer events
        head = {
            'event_type': event.get('event_type'),
            'from': event.get('from_address'),
            'to': event.get('to_address'),
        }
    elif event_type == 'sale':
        if 'seller' not in event or 'buyer' not in event or 'nft' not in event or 'payment' not in event:
            return None  # Skip invalid sale events
        payment = event['payment']
        head = {
            'event_type': event.get('event_type'),
            'seller': event.get('seller'),
            'buyer': event.get('buyer'),
            # Raw quantity is expressed in the smallest unit; scale it by the token decimals
            'price': float(payment.get('quantity', 0)) / (10 ** payment.get('decimals', 18)),
            'currency': payment.get('symbol'),
            'transaction': event.get('transaction'),
        }
    else:
        return None  # Only transfer and sale events are kept

    # Type-specific fields first, shared NFT fields last (keeps the original key order)
    return {**head, **_nft_fields(event, tz)}


def clean_events(address_to_events, active_users, tz=None):
    """
    Cleans and formats events for each address in a dictionary.

    Only 'transfer' and 'sale' events are kept. Events missing their required keys
    are dropped silently; events that raise while being cleaned are reported on
    stdout and dropped. A progress line is printed for every address.

    Args:
        address_to_events (dict): A dictionary where keys are addresses and values are lists of events.
        active_users (iterable): Addresses to keep; any address not contained in it is
            left out of the result.
        tz (datetime.tzinfo, optional): Timezone used to render 'event_timestamp'.
            Defaults to None, i.e. the local timezone of the machine running the code;
            pass datetime.timezone.utc for machine-independent output.

    Returns:
        dict: A dictionary mapping each kept address to its list of cleaned events
        (the list may be empty if no event survived the cleaning).
    """
    # A list/tuple membership test is a linear scan per address; use a set when possible
    active_lookup = active_users
    if isinstance(active_users, (list, tuple)):
        try:
            active_lookup = set(active_users)
        except TypeError:
            pass  # unhashable entries: fall back to the original linear lookup

    new_dict = {}
    total_addresses = len(address_to_events)
    for i, (address, events) in enumerate(address_to_events.items()):
        print(f'{i} / {total_addresses} - address: {address}')
        if address not in active_lookup:
            continue

        cleaned_events = []
        for event in events:
            try:
                cleaned_event = _clean_event(event, tz)
            except Exception as e:
                # Best effort: report the malformed event and move on
                print(f"Error cleaning event for address {address}: {e}")
                continue  # Skip problematic events
            if cleaned_event is not None:
                cleaned_events.append(cleaned_event)

        # Update the dictionary with cleaned events
        new_dict[address] = cleaned_events

    return new_dict


In [38]:
# Clean dictionary
# Every address of new_dict is considered active here, so nothing is filtered out.
# The keys view is passed directly: no list copy, and membership tests are O(1).
address_to_events = clean_events(new_dict, new_dict.keys())

0 / 2569 - address: 0x00278990b4096bf24aa3f9591905f8070e938aeb
1 / 2569 - address: 0x0030b9f1925408d79be83c7cecfffdbacb638e9b
2 / 2569 - address: 0x003b07d8187a8df94fcae3870f8ad2817956dd3d
3 / 2569 - address: 0x008127812e0059b725f331238064721bc7938f0a
4 / 2569 - address: 0x009255de16f337807228955e9811dd20e3e69e55
5 / 2569 - address: 0x009d3f82285912ef112089a581ea1f99ed355998
6 / 2569 - address: 0x00b33204bf1328e0f967b9424f16e548913a3d7b
7 / 2569 - address: 0x00b93227d5592ef89de63c2d6cbfc8332c28072d
8 / 2569 - address: 0x00c1cbb1dcb83676b03863cb5c769cb0fb436b32
9 / 2569 - address: 0x00c8a86ed455d285d2ba23551cf341fd3ea11631
10 / 2569 - address: 0x00e210044be0aa1fa80e88186e03c38fa8cc29ac
11 / 2569 - address: 0x00fc212ccf53f99723df9d90097538cf75b44917
12 / 2569 - address: 0x010b85f64cae8d0b5672e1f1097fb5beb60606f8
13 / 2569 - address: 0x01472ab78c89dea1caa7e286651cab6f36ba8233
14 / 2569 - address: 0x015529db9b026c7b577fb13a96bfee031b832a78
15 / 2569 - address: 0x016e172309d17e7855bad9c6255

In [39]:
# Report every address that ended up with no events once the cleaning is done
empty_after_cleaning = (
    (addr, len(addr_events))
    for addr, addr_events in address_to_events.items()
    if len(addr_events) == 0
)
for addr, n_events in empty_after_cleaning:
    print(f'{addr}: {n_events}')

In [43]:
# Persist the cleaned address -> events mapping as a single JSON file under DIR
file_path = f'{DIR}address_to_events_final.json'
with open(file_path, mode="w") as output_handle:
    json.dump(address_to_events, fp=output_handle)