# Task:
- Lower-cased all wallet addresses to avoid duplicate IDs.
- Dropped 7_307 exact duplicate rows (about 1.672 % of data).
- Removed rows with NaN values in columns ("from_address", "to_address", "token", "tx_hash", "block_number")
- Saved cleaned dataset as Parquet in path = ../data/processed
- Assumed that all data are wintermute related. Some transactions are not involved with wintermute addresses, but are assumed to be intermediate transactions from/to wintermute

# Initial Setup

In [105]:
import pandas as pd
pd.set_option("display.max_columns",100)

In [106]:
RAW_PATH = "../data/raw/wintermute_transfers_search_default_2025-04-08.csv"
df_raw = pd.read_csv(RAW_PATH)
df_raw.info(); df_raw.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437000 entries, 0 to 436999
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   timestamp     437000 non-null  object 
 1   from_address  436982 non-null  object 
 2   from_entity   403700 non-null  object 
 3   to_address    436974 non-null  object 
 4   to_entity     407141 non-null  object 
 5   token         436955 non-null  object 
 6   value         437000 non-null  float64
 7   usd           437000 non-null  float64
 8   tx_hash       436956 non-null  object 
 9   chain         437000 non-null  object 
 10  block_number  436956 non-null  float64
dtypes: float64(3), object(8)
memory usage: 36.7+ MB


Unnamed: 0,timestamp,from_address,from_entity,to_address,to_entity,token,value,usd,tx_hash,chain,block_number
0,2025-04-08 12:49:57+00:00,0xB1026b8e7276e7AC75410F1fcbbe21796e8f7526,Camelot,0x51C72848c68a965f66FA7a88855F9f7784502a7F,Wintermute,USDC,438.108563,438.108563,0x98058c529466064c355bdfea3cfebe399344678f20bb...,arbitrum_one,324218955.0
1,2025-04-08 12:49:54+00:00,0xcDa53B1F66614552F834cEeF361A8D12a0B8DaD8,Uniswap,0x51C72848c68a965f66FA7a88855F9f7784502a7F,Wintermute,ARB,3837.138536,1062.8797,0x405b146ac6a52b93e5a77760a4b49a87015b5c5e0659...,arbitrum_one,324218945.0
2,2025-04-08 12:49:51+00:00,0xb2cc224c1c9feE385f8ad6a55b4d94E92359DC59,Aerodrome Finance,0x51C72848c68a965f66FA7a88855F9f7784502a7F,Wintermute,WETH,7.345922,11533.612401,0x7f370fc61bed7241d846c4907a4e616e69ce60ff0192...,base,28663622.0
3,2025-04-08 12:49:51+00:00,0x51C72848c68a965f66FA7a88855F9f7784502a7F,Wintermute,0x72AB388E2E2F6FaceF59E3C3FA2C4E29011c2D38,PancakeSwap,USDC,2143.712587,2143.712587,0x88b4fb948e95cc2b9a22c48efe3d6a6788209302e72f...,base,28663622.0
4,2025-04-08 12:49:50+00:00,0x641C00A822e8b671738d32a431a4Fb6074E5c79d,Uniswap,0x51C72848c68a965f66FA7a88855F9f7784502a7F,Wintermute,WETH,0.470011,737.550466,0x39fc603c882c2f5af51b7f8cfc8eb58753f5e42eff94...,arbitrum_one,324218918.0


# Make addresses and tx_hash in lowercase 

In [107]:
df = df_raw.copy()
df["from_address"] = df["from_address"].str.strip().str.lower()
df["to_address"]   = df["to_address"].str.strip().str.lower()
df["tx_hash"]   = df["tx_hash"].str.strip().str.lower()
df.head()

Unnamed: 0,timestamp,from_address,from_entity,to_address,to_entity,token,value,usd,tx_hash,chain,block_number
0,2025-04-08 12:49:57+00:00,0xb1026b8e7276e7ac75410f1fcbbe21796e8f7526,Camelot,0x51c72848c68a965f66fa7a88855f9f7784502a7f,Wintermute,USDC,438.108563,438.108563,0x98058c529466064c355bdfea3cfebe399344678f20bb...,arbitrum_one,324218955.0
1,2025-04-08 12:49:54+00:00,0xcda53b1f66614552f834ceef361a8d12a0b8dad8,Uniswap,0x51c72848c68a965f66fa7a88855f9f7784502a7f,Wintermute,ARB,3837.138536,1062.8797,0x405b146ac6a52b93e5a77760a4b49a87015b5c5e0659...,arbitrum_one,324218945.0
2,2025-04-08 12:49:51+00:00,0xb2cc224c1c9fee385f8ad6a55b4d94e92359dc59,Aerodrome Finance,0x51c72848c68a965f66fa7a88855f9f7784502a7f,Wintermute,WETH,7.345922,11533.612401,0x7f370fc61bed7241d846c4907a4e616e69ce60ff0192...,base,28663622.0
3,2025-04-08 12:49:51+00:00,0x51c72848c68a965f66fa7a88855f9f7784502a7f,Wintermute,0x72ab388e2e2f6facef59e3c3fa2c4e29011c2d38,PancakeSwap,USDC,2143.712587,2143.712587,0x88b4fb948e95cc2b9a22c48efe3d6a6788209302e72f...,base,28663622.0
4,2025-04-08 12:49:50+00:00,0x641c00a822e8b671738d32a431a4fb6074e5c79d,Uniswap,0x51c72848c68a965f66fa7a88855f9f7784502a7f,Wintermute,WETH,0.470011,737.550466,0x39fc603c882c2f5af51b7f8cfc8eb58753f5e42eff94...,arbitrum_one,324218918.0


# Remove duplicates

In [108]:
dupes = df[df.duplicated()]
dupe_ratio = len(dupes) / len(df)
print(f"{len(dupes)} potential duplicates ({dupe_ratio:.3%})")
df = df.drop_duplicates(keep="first")
display(dupes.head())

7307 potential duplicates (1.672%)


Unnamed: 0,timestamp,from_address,from_entity,to_address,to_entity,token,value,usd,tx_hash,chain,block_number
1000,2025-04-08 12:48:01+00:00,0x76801132a22801640284cd67f7dd41fed2926b6a,Wintermute,0x16969fa79651bae11736f2f6576a86fe2726b42b,PancakeSwap,TST,27245.624283,2112.898163,0x634232a8a9e80ed40b3787e4dd829e0cb2d38b988609...,bsc,48178220.0
1002,2025-04-08 12:48:01+00:00,8vywdu14v78rcdepwmnt54bb1aam5qvumupetw8ocn1e,Wintermute,aq36qrk3hae6phqbctktqnykpt2kaagq9yoetqupmghx,Orca,USDC,54.383751,54.383751,5kymc5hchj8ckwhagcdte8xdlhlmwhjg5lzgy1kqpj5sxj...,solana,332109136.0
1003,2025-04-08 12:48:01+00:00,edqfvkrfrroisgqbmdkl6mcsnwyejjgvmylqjxktrlx,Orca,444wtf54p5mstky6qy7nmvc166ns7e5g9ccrvo9hcz2q,,ai16z,2407.280064,269.33853,2ttgszshg6k4wvarfmurvgottgsehvwmuxqjfwpec6ut6q...,solana,332109136.0
1004,2025-04-08 12:48:01+00:00,6mq8xeahdtikymvvmxuctych6dujnkgfoeib2msymmi1,Orca,ctyfgug69kwyrzk24p3uubvy1rr5atu9kf2s6xewau8x,Wintermute,WSOL,0.524908,57.592938,58gdxievnrta1yn1kwne3infptb7mdgdsxraudnrtshhqt...,solana,332109136.0
1005,2025-04-08 12:48:01+00:00,6mq8xeahdtikymvvmxuctych6dujnkgfoeib2msymmi1,Orca,ctyfgug69kwyrzk24p3uubvy1rr5atu9kf2s6xewau8x,Wintermute,WSOL,0.494767,54.285876,5kymc5hchj8ckwhagcdte8xdlhlmwhjg5lzgy1kqpj5sxj...,solana,332109136.0


# Remove rows with NaN

In [109]:
print(df.isna().sum())          

timestamp           0
from_address       18
from_entity     32743
to_address         26
to_entity       29443
token              45
value               0
usd                 0
tx_hash            44
chain               0
block_number       44
dtype: int64


In [110]:
nan_columns = ["from_address", "to_address", "token", "tx_hash", "block_number"]
mask_bad_core = df[nan_columns].isna().any(axis=1)
nan_rows = df[mask_bad_core]
df = df.dropna(subset=nan_columns)
nan_rows

Unnamed: 0,timestamp,from_address,from_entity,to_address,to_entity,token,value,usd,tx_hash,chain,block_number
2539,2025-04-08 12:44:44+00:00,,,bc1qddvmdc9t8tad2eqd7vfwv2mexjzs49f42wvg27,Wintermute,,94.299971,7528344.0,,bitcoin,
2541,2025-04-08 12:44:44+00:00,1ghn3eh6bkldfftvec91yfxmverqzy2pqy,Wintermute,,,,117.0,9340578.0,,bitcoin,
13844,2025-04-08 12:30:51+00:00,,,1ghn3eh6bkldfftvec91yfxmverqzy2pqy,Wintermute,,117.0,9340578.0,,bitcoin,
19703,2025-04-08 12:22:52+00:00,bc1qddvmdc9t8tad2eqd7vfwv2mexjzs49f42wvg27,Wintermute,,,,2.109852,168767.1,,bitcoin,
35917,2025-04-08 12:05:05+00:00,bc1qddvmdc9t8tad2eqd7vfwv2mexjzs49f42wvg27,Wintermute,,,,21.700019,1733072.0,,bitcoin,
48074,2025-04-08 11:54:09+00:00,bc1qddvmdc9t8tad2eqd7vfwv2mexjzs49f42wvg27,Wintermute,,,,21.900019,1742694.0,,bitcoin,
48075,2025-04-08 11:54:09+00:00,bc1qddvmdc9t8tad2eqd7vfwv2mexjzs49f42wvg27,Wintermute,,,,22.10002,1758609.0,,bitcoin,
66070,2025-04-08 11:34:40+00:00,0x76801132a22801640284cd67f7dd41fed2926b6a,Wintermute,0x001b3389c5efb25272e95c27c3d99a2bd9ca9e4c,PancakeSwap,,0.0,0.0,0x0a69eb905923724d37ce6df2a13bb84c283e60ad290c...,bsc,48176753.0
83365,2025-04-08 11:11:36+00:00,bc1qddvmdc9t8tad2eqd7vfwv2mexjzs49f42wvg27,Wintermute,,,,5.475203,432349.4,,bitcoin,
83366,2025-04-08 11:11:36+00:00,bc1qddvmdc9t8tad2eqd7vfwv2mexjzs49f42wvg27,Wintermute,,,,5.780017,456419.0,,bitcoin,


In [111]:
print(df["chain"].value_counts())
print(df["token"].value_counts().head())

chain
solana          294457
arbitrum_one     78600
ethereum         24327
base             21413
bsc               7444
optimism          3391
avalanche            8
polygon              6
flare                2
Name: count, dtype: int64
token
USDC    115678
WSOL    100472
SOL      57838
WETH     54865
USDT     18788
Name: count, dtype: int64


# Save cleaned data

In [112]:
df.to_parquet("../data/processed/wintermute_transfers_clean.parquet", index=False)