# Xblock

# Extract from pkl to csv

https://xblock.pro/#/dataset/13


In [None]:
import pickle
import networkx as nx

path = "./phishing/src/XBlock_13_MulDiGraph.pkl"
with open(path, "rb") as f:
    G = pickle.load(f)          # или nx.read_gpickle(path), если это gpickle

type(G), G.is_directed(), G.is_multigraph()
# (<class 'networkx.classes.multidigraph.MultiDiGraph'>, True, True)

# Примеры атрибутов узлов
n0 = next(iter(G.nodes))
G.nodes[n0]                      # ожидаем {'isp': 0/1 или True/False}

# Примеры атрибутов рёбер
u, v, k, data = next(iter(G.edges(keys=True, data=True)))
(u, v, k, data)                  # ожидаем {'amount': ..., 'timestamp': ...}

# Подсчёт мошеннических адресов
ph_cnt = sum(1 for _, d in G.nodes(data=True) if d.get('isp'))
tot = G.number_of_nodes()
print(ph_cnt, tot)

# Проверить типы полей
types_snapshot = {
    "isp_values": {type(d.get("isp")) for _, d in G.nodes(data=True)},
    "amount_types": {type(d.get("amount")) for _, _, _, d in G.edges(keys=True, data=True)},
    "ts_types": {type(d.get("timestamp")) for _, _, _, d in G.edges(keys=True, data=True)},
}
print(types_snapshot)

1165 2973489
{'isp_values': {<class 'int'>}, 'amount_types': {<class 'float'>}, 'ts_types': {<class 'float'>}}


In [None]:
import pickle
import csv

# Путь к pkl-файлу
pkl_path = "./phishing/src/XBlock_13_MulDiGraph.pkl"
csv_path = "./phishing/processed/XBlock_13.csv"

# Загружаем граф
with open(pkl_path, "rb") as f:
    G = pickle.load(f)

# Записываем уникальные адреса с метками
with open(csv_path, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["address", "is_scam", "description"])
    for node, data in G.nodes(data=True):
        is_scam = int(data.get("isp", 0))  # 1 — мошенник, 0 — нет
        description = "phishing" if is_scam == 1 else ""
        writer.writerow([node, is_scam, description])

print(f"Готово! Файл сохранён в {csv_path}")

Готово! Файл сохранён в XBlock_13.csv


https://xblock.pro/#/dataset/25

In [15]:
import pandas as pd
import re
from pathlib import Path

INPUT_CSV = "./Ponzi/src/Xblock_PonziDataset_20221114.csv"
OUTPUT_CSV = "./Ponzi/processed/Xblock_Ponzi_labels.csv"

ADDR_RE = re.compile(r"^0x[a-fA-F0-9]{40}$")

df = pd.read_csv(INPUT_CSV, dtype=str).fillna("")

# нормализуем адрес
df["address"] = df["address"].str.strip().str.lower()

# фильтруем валидные адреса
df = df[df["address"].str.match(ADDR_RE, na=False)].copy()

# формируем description и is_scam
df["description"] = df["label"].apply(
    lambda x: "ponzi" if x.strip() == "1" else None)
df["is_scam"] = df["label"].apply(lambda x: 1 if x.strip() == "1" else 0)

# оставляем только нужные поля
out = df[["address", "is_scam", "description"]].drop_duplicates(
    subset=["address"]).reset_index(drop=True)

# сохраняем
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUTPUT_CSV, index=False)

print(f"Сохранено {len(out)} строк в {OUTPUT_CSV}")
out.head(10)

Сохранено 6483 строк в ./Ponzi/processed/Xblock_Ponzi_labels.csv


Unnamed: 0,address,is_scam,description
0,0x6e38a457c722c6011b2dfa06d49240e797844d66,0,
1,0x109c4f2ccc82c4d77bde15f306707320294aea3f,1,ponzi
2,0x793ae8c1b1a160bfc07bfb0d04f85eab1a71f4f2,0,
3,0x5fe5b7546d1628f7348b023a0393de1fc825a4fd,0,
4,0xd79b4c6791784184e2755b2fc1659eaab0f80456,0,
5,0x273930d21e01ee25e4c219b63259d214872220a2,0,
6,0xd07ce4329b27eb8896c51458468d98a0e4c0394c,0,
7,0x8017f24a47c889b1ee80501ff84beb3c017edf0b,0,
8,0xcca8353a18e7ab7b3d094ee1f9ddc91bdf2ca6a4,0,
9,0x07307d0b136a79bac718f43388aed706389c4588,0,


https://xblock.pro/#/dataset/50

In [18]:
import re
import numpy as np
import pandas as pd
from pathlib import Path

# === INPUTS ===
XLS_FLAGS_PATH = r"./rugpull/src/Xblock50_groundTruth.xlsx"  # адрес + Mint/Leak/Limit
MINT_PATH = r"./rugpull/src/Xblock50_mint.xlsx"         # Address, TP?
LEAK_PATH = r"./rugpull/src/Xblock50_leak.xlsx"         # Address, TP?
LIMIT_PATH = r"./rugpull/src/Xblock50_limit.xlsx"        # Address, TP?

# === OUTPUT ===
OUTPUT_CSV = r"./rugpull/processed/Xblock50_groundTruth_merged.csv"

ADDR_RE = re.compile(r"^0x[a-fA-F0-9]{40}$")

# ---------- 1) groundTruth: Mint/Leak/Limit -> rugpull ----------
df1 = pd.read_excel(XLS_FLAGS_PATH, dtype=str).fillna("")
df1.columns = df1.columns.str.strip()

# нормализуем адрес и фильтруем валидные
df1["address"] = df1["address"].str.strip().str.lower()
df1 = df1[df1["address"].str.match(ADDR_RE, na=False)].copy()

# приведём флаги и посчитаем "хотя бы одна 1"
for col in ["Mint", "Leak", "Limit"]:
    if col in df1.columns:
        df1[col] = pd.to_numeric(
            df1[col], errors="coerce").fillna(0).astype(int)
    else:
        df1[col] = 0

rugpull1 = (df1[["Mint", "Leak", "Limit"]].sum(axis=1) > 0)

out1 = pd.DataFrame({
    "address": df1["address"],
    "is_scam": rugpull1.astype(int),
    "description": np.where(rugpull1, "rugpull", None),
})

# ---------- 2) helper для файлов с TP? ----------


def load_tp_yes(path: str) -> pd.DataFrame:
    df = pd.read_excel(path, dtype=str).fillna("")
    df.columns = df.columns.str.strip()

    # находим колонки Address / TP?
    addr_col = next(
        (c for c in df.columns if c.strip().lower() == "address"), None)
    tp_col = next((c for c in df.columns if c.strip().lower()
                  in ("tp?", "tp", "label")), None)
    if addr_col is None or tp_col is None:
        raise ValueError(f"Не найдены нужные колонки в файле: {path}")

    # нормализуем и фильтруем
    df[addr_col] = df[addr_col].str.strip().str.lower()
    df = df[df[addr_col].str.match(ADDR_RE, na=False)].copy()

    # берём только Yes как rugpull
    is_yes = df[tp_col].astype(str).str.strip().str.lower().eq("yes")
    df_yes = df[is_yes].copy()

    return pd.DataFrame({
        "address": df_yes[addr_col],
        "is_scam": 1,
        "description": "rugpull",
    })


out2 = load_tp_yes(MINT_PATH)
out3 = load_tp_yes(LEAK_PATH)
out4 = load_tp_yes(LIMIT_PATH)

# ---------- 3) объединяем с приоритетом rugpull ----------
merged = pd.concat([out1, out2, out3, out4], ignore_index=True)

# если адрес встречается несколько раз: is_scam = max; если где-то rugpull — description = "rugpull"
merged = (
    merged.groupby("address", as_index=False)
          .agg(is_scam=("is_scam", "max"),
               description=("description", lambda s: "rugpull" if (s == "rugpull").any() else None))
)

# ---------- 4) сохраняем ----------
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
merged.to_csv(OUTPUT_CSV, index=False)

print(f"Сохранено {len(merged)} строк в {OUTPUT_CSV}")
merged.head(10)

Сохранено 294 строк в ./rugpull/processed/Xblock50_groundTruth_merged.csv


Unnamed: 0,address,is_scam,description
0,0x0290ea3c728981725689187763f6c63a68e192b8,1,rugpull
1,0x02d3aea48b443a0026ed9cbc91b97d7335aba323,1,rugpull
2,0x03260e1b0f53e1a1f93cf126a7ca42a1c71648d6,1,rugpull
3,0x0414d8c87b271266a5864329fb4932bbe19c0c49,1,rugpull
4,0x054ad3cd4a66f14bf5c0de2548a53be66995a4f6,1,rugpull
5,0x08769a9b479a4b20e796194d960cc407fc66359a,1,rugpull
6,0x0886949c1b8c412860c4264ceb8083d1365e86cf,1,rugpull
7,0x0b54c811cd4fd4d27c2ff0061c3831427d89b73c,1,rugpull
8,0x0c29d192dd5443fb5f00e5fe52f61d793025643b,1,rugpull
9,0x0c39b1bec04421c6cad1df8a50701a56880f0362,1,rugpull


# Github PTXPhish

https://github.com/blocksecteam/PTXPhish/tree/main?tab=readme-ov-file

In [None]:
from pathlib import Path
import pandas as pd
import re
# путь к исходному Excel
INPUT_XLSX = "./phishing/src/blocksecteam_PTXPhish_InitialAddress.xlsx"
OUTPUT_CSV = "./phishing/processed/github_PTXPhish.csv"   # куда сохранить CSV


ADDR_RE = re.compile(r"^0x[a-fA-F0-9]{40}$")


def looks_like_tx_col(name: str) -> bool:
    if name is None:
        return False
    s = str(name).strip().lower()
    return (
        s.startswith("tx") or
        s in {"tx_total", "tx total", "tx_tatal", "txcount", "tx_total_count"}
    )


def looks_like_type_col(name: str) -> bool:
    if name is None:
        return False
    s = str(name).strip().lower()
    return s.startswith("type") or s in {"types", "type_id", "attack_type"}


def harvest_addresses(series: pd.Series):
    out = []
    for val in series.dropna().astype(str):
        v = val.strip()
        if ADDR_RE.match(v):
            out.append(v.lower())
    return out


def parse_sheet(df: pd.DataFrame):
    cols = list(df.columns)
    i = 0
    recs = []
    while i < len(cols):
        cat_col = cols[i]
        tx_col = cols[i+1] if i+1 < len(cols) else None
        typ_col = cols[i+2] if i+2 < len(cols) else None

        # Паттерн колонок-триплетов: [Category | Tx_total | Type]
        if tx_col is not None and typ_col is not None and looks_like_tx_col(tx_col) and looks_like_type_col(typ_col):
            category = str(cat_col).strip()
            for a in harvest_addresses(df[cat_col]):
                recs.append({"address": a, "is_scam": 1,
                            "description": category})
            i += 3
        else:
            # Неузнанная колонка — на всякий случай соберём адреса без категории
            for a in harvest_addresses(df[cat_col]):
                recs.append({"address": a, "is_scam": 1, "description": ""})
            i += 1
    return recs


# читаем все листы
sheets = pd.read_excel(INPUT_XLSX, sheet_name=None)
all_recs = []
for name, df in sheets.items():
    if isinstance(df, pd.DataFrame) and not df.empty:
        all_recs.extend(parse_sheet(df))

# если ничего не нашлось — сохраняем пустую таблицу нужной формы
if not all_recs:
    out_df = pd.DataFrame(columns=["address", "is_scam", "description"])
    out_df.to_csv(OUTPUT_CSV, index=False)
    print(f"Saved 0 rows to {OUTPUT_CSV}")
else:
    out_df = pd.DataFrame(all_recs).drop_duplicates(
        subset=["address", "description"])
    # объединяем несколько категорий в одну строку по адресу
    out_df["description"] = out_df["description"].fillna("").astype(str)
    agg = (
        out_df.groupby("address", as_index=False)
              .agg({"description": lambda s: ";".join(sorted({d for d in s if d.strip()}))})
    )
    agg["is_scam"] = 1
    agg = agg[["address", "is_scam", "description"]]

    Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
    agg.to_csv(OUTPUT_CSV, index=False)
    display(agg.head(20))
    print(f"Saved {len(agg):,} rows to {OUTPUT_CSV}")

Unnamed: 0,address,is_scam,description
0,0x00000000fffb6508e8c3db931fefc3895cf36d15,1,Approve;permit
1,0x00000002a90b740bd0d3b24277ca2c6236b11316,1,Airdrop function
2,0x000006d683610e61ad9ee4b487fae3904b392b9e,1,Approve;Free buy order;permit;setApproveForAll
3,0x00000a8a27646e939716f7ee48e64d532a000000,1,Airdrop function
4,0x00000ff01f359d7cde7d1bb45ced9459a6a80000,1,Airdrop function
5,0x000011387eb24f199e875b1325e4805efd3b0000,1,Airdrop function
6,0x00001519230c3bdf39fe8d1678454dac935c0000,1,Approve;permit;setApproveForAll
7,0x00001f78189be22c3498cff1b8e02272c3220000,1,Approve;Free buy order;Proxy upgrade;permit;se...
8,0x00002b3becfc9499f271aa81201138afefe6bb00,1,Approve;permit
9,0x00002d618fcb99cfe7af0c6505508b33fc620000,1,Airdrop function


Saved 228 rows to github_PTXPhish.csv


# Github phishing_contract_sigmetrics25

https://github.com/blocksecteam/phishing_contract_sigmetrics25/tree/main

In [None]:
from pathlib import Path
import pandas as pd
import re
INPUT_LOG = "./phishing/src/github_phishing_contracts.log"
OUTPUT_CSV = "./phishing/processed/github_phishing_contracts_addresses.csv"


ADDR_RE = re.compile(r"^0x[a-fA-F0-9]{40}$")

addrs = []
with open(INPUT_LOG, "r", encoding="utf-8", errors="ignore") as f:
    for line in f:
        line = line.strip()
        if not line or line.startswith("#") or line.startswith("//"):
            continue
        # Берём первое "слово" в строке, если там могут быть комментарии после адреса
        token = line.split()[0]
        if ADDR_RE.match(token):
            addrs.append(token.lower())

# Уникализируем
addrs = sorted(set(addrs))

df = pd.DataFrame({
    "address": addrs,
    "is_scam": 1,
    "description": "phishing"
})

Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUTPUT_CSV, index=False)

print(f"Saved {len(df):,} rows -> {OUTPUT_CSV}")
display(df.head(10))

Saved 37,654 rows -> ./processed/github_phishing_contracts_addresses.csv


Unnamed: 0,address,is_scam,description
0,0x000000000000232e765d898b5f1b189b5c993f05,1,phishing
1,0x0000000000958ec6667eb97e2f64f6909073e6b7,1,phishing
2,0x0000000000d4f82814c07974ee1c0ea4b7632c7e,1,phishing
3,0x00000002a90b740bd0d3b24277ca2c6236b11316,1,phishing
4,0x00000004c4ac9089e5a34c5c4dd1adc3b2304248,1,phishing
5,0x000000086f86a665381d1b22b6d93695752daad2,1,phishing
6,0x0000000c670a674d9dbb1ee38713eb215215a9fe,1,phishing
7,0x0000003e829c180ee307664a2bd69292b5ed0000,1,phishing
8,0x000000732876cec0b7159203d71445311f0a0000,1,phishing
9,0x000000733fedfa69689b34ec2c1883d9a0d90000,1,phishing


# Verified smart contracts EtherScan

https://etherscan.io/exportData?type=open-source-contract-codes

In [4]:
import pandas as pd

INPUT_CSV = "./verified/src/Etherscan_export-verified-contractaddress-opensource-license.csv"
OUTPUT_CSV = "./verified/processed/Etherscan_verified_contracts_addresses.csv"

# Читаем, пропуская первую строку с Note:
df = pd.read_csv(INPUT_CSV, skiprows=1)

# Убираем кавычки из названий столбцов
df.columns = [c.strip('"') for c in df.columns]

# Берем только нужные колонки
df_out = pd.DataFrame({
    "address": df["ContractAddress"].str.lower(),
    "is_scam": 0,
    "description": df["ContractName"].astype(str) + ";verified"
})

# Сохраняем
df_out.to_csv(OUTPUT_CSV, index=False)
print(f"Сохранено {len(df_out)} строк в {OUTPUT_CSV}")
df_out.head()

Сохранено 5000 строк в ./verified/processed/Etherscan_verified_contracts_addresses.csv


Unnamed: 0,address,is_scam,description
0,0xf56f4d867a8f447ddb1d4d7559dc110fa75b3051,0,PUFT;verified
1,0x5a6263a793192872013efc62c85da78efc53f143,0,BitMineEth;verified
2,0x9f4fb9b16d5af482973562a33ecd781d841fe289,0,CryptoHaiku7;verified
3,0x9cfd8d110e7c736800628dc535803132f472bcab,0,GAMAKO;verified
4,0x85b6571edcc7842b54d3ee8d92ad289ae82ff128,0,TOKEN;verified


# Different Types of Fraudulent Accounts

https://github.com/MyEtherWallet/ethereum-lists


In [5]:
import pandas as pd
import json

# Файлы
INPUT_JSON = "./other/src/MyEtherWallet_addresses-darklist.json"
OUTPUT_CSV = "./other/processed/MyEtherWallet_scam_addresses.csv"

# Загружаем JSON
with open(INPUT_JSON, "r", encoding="utf-8") as f:
    data = json.load(f)

# Преобразуем в DataFrame
df = pd.DataFrame(data)

# Приводим к нужному формату
df_out = pd.DataFrame({
    "address": df["address"].str.lower(),
    "is_scam": 1,
    "description": df["comment"]
})

# Сохраняем
df_out.to_csv(OUTPUT_CSV, index=False)
print(f"Сохранено {len(df_out)} строк в {OUTPUT_CSV}")
df_out.head()

Сохранено 715 строк в ./other/processed/MyEtherWallet_scam_addresses.csv


Unnamed: 0,address,is_scam,description
0,0x09750ad360fdb7a2ee23669c4503c974d86d8694,1,XRP phishing website (ripple.com.pt) this wall...
1,0xc915ec7f4cfd1c0a8aba090f03bfaab588aef9b4,1,XRP phishing website (ripple.com.pt) that got ...
2,0xecb6ffac05d8b4660b99b475b359fe454c77d153,1,XRP phishing website (ripple.com.pt) that got ...
3,0x7f85a82a2da50540412f6e526f1d00a0690a77b8,1,XRP phishing website (ripple.com.pt) that got ...
4,0xbc8b85b1515e45fb2d74333310a1d37b879732c0,1,XRP phishing website (ripple.com.pt) that got ...


In [6]:
import pandas as pd
import json

# Файлы
INPUT_JSON = "./verified/src/MyEtherWallet_addresses-lightlist.json"
OUTPUT_CSV = "./verified/processed/MyEtherWallet_verified_addresses.csv"

# Загружаем JSON
with open(INPUT_JSON, "r", encoding="utf-8") as f:
    data = json.load(f)

# Преобразуем в DataFrame
df = pd.DataFrame(data)

# Приводим к нужному формату
df_out = pd.DataFrame({
    "address": df["address"].str.lower(),
    "is_scam": 0,
    "description": df["comment"] + "; verified"
})

# Сохраняем
df_out.to_csv(OUTPUT_CSV, index=False)
print(f"Сохранено {len(df_out)} строк в {OUTPUT_CSV}")
df_out.head()

Сохранено 2 строк в ./verified/processed/MyEtherWallet_verified_addresses.csv


Unnamed: 0,address,is_scam,description
0,0xf8094e15c897518b5ac5287d7070ca5850efc6ff,0,district0x Address; verified
1,0x439b54caf661c21e6b231d972d7eaa98f199590f,0,DataBroker DAO Address; verified


# EtherScamDB

https://github.com/MrLuit/EtherScamDB/tree/master

In [9]:
# === CONFIG ===
from pathlib import Path
import pandas as pd
import yaml
import re
INPUT_YAML = "./other/src/EtherScamDB_scams.yaml"
OUTPUT_CSV = "./other/processed/EtherScamDB_scam_addresses.csv"

# === CODE ===

ADDR_RE = re.compile(r"^0x[a-fA-F0-9]{40}$")

# загружаем yaml (список словарей)
with open(INPUT_YAML, "r", encoding="utf-8") as f:
    data = yaml.safe_load(f)

records = []

for item in data:
    addrs = item.get("addresses")
    if not addrs:
        continue  # пропускаем записи без addresses

    category = (item.get("category") or "").strip()

    if category:
        desc = f"{category}"
    else:
        desc = None   # запасной вариант

    # перебираем адреса, валидируем и нормализуем
    for a in addrs:
        a = str(a).strip()
        if ADDR_RE.match(a):
            records.append({
                "address": a.lower(),
                "is_scam": 1,
                "description": desc
            })

# в датафрейм, убираем дубликаты по (address, description)
df = pd.DataFrame(records).drop_duplicates(
    subset=["address", "description"]).reset_index(drop=True)

# сохраняем
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUTPUT_CSV, index=False)

print(f"Saved {len(df)} rows -> {OUTPUT_CSV}")
display(df.head(20))

Saved 2060 rows -> ./other/processed/EtherScamDB_scam_addresses.csv


Unnamed: 0,address,is_scam,description
0,0xd0cc2b24980cbcca47ef755da88b220a82291407,1,Phishing
1,0x4cdc1cba0aeb5539f2e0ba158281e67e0e54a9b1,1,Phishing
2,0x00e01a648ff41346cdeb873182383333d2184dd1,1,Phishing
3,0x858457daa7e087ad74cdeeceab8419079bc2ca03,1,Phishing
4,0x240e125c20a4cc84bd6e7f8d1fd07aff4c06d43d,1,Fake ICO
5,0x2268751eafc860781074d25f4bd10ded480310b9,1,Fake ICO
6,0xadf5b0c2103598fb66a61714152f1d1717d49fe0,1,Fake ICO
7,0x379ce20c018fb6301c1872c429ec7270ffa4dc5b,1,Phishing
8,0x2b065809f6ec6df32878bcd26711a0e2bcf59c26,1,Phishing
9,0x42c5459911ae51d1d005cbe39749bd8d8e533c22,1,Phishing


# CryptoScamDB

https://github.com/CryptoScamDB/blacklist

In [12]:
import yaml
import pandas as pd

INPUT_YAML = "./other/src/CryptoScamDB_urls.yaml"
OUTPUT_CSV = "./other/processed/CryptoScamDB_fraud_addresses.csv"

with open(INPUT_YAML, "r", encoding="utf-8") as f:
    data = yaml.safe_load(f)

rows = []
for item in data:
    if "addresses" in item and isinstance(item["addresses"], dict):
        eth_addrs = item["addresses"].get("ETH", [])
        for addr in eth_addrs:
            rows.append({
                "address": addr,
                "is_scam": 1,
                "description": item.get("category") or None  # null если пусто
            })

df = pd.DataFrame(rows)
df.to_csv(OUTPUT_CSV, index=False)

print(f"Saved {len(df)} rows -> {OUTPUT_CSV}")
display(df.head(20))

Saved 4004 rows -> ./other/processed/CryptoScamDB_fraud_addresses.csv


Unnamed: 0,address,is_scam,description
0,0xD0cC2B24980CBCCA47EF755Da88B220a82291407,1,Phishing
1,0x4cdc1cba0aeb5539f2e0ba158281e67e0e54a9b1,1,Phishing
2,0x00e01A648Ff41346CDeB873182383333D2184dd1,1,Phishing
3,0x858457daa7e087ad74cdeeceab8419079bc2ca03,1,Phishing
4,0x240e125c20a4cC84Bd6E7F8D1FD07Aff4c06D43d,1,Phishing
5,0x2268751eAFC860781074D25f4bD10DED480310B9,1,Phishing
6,0xadF5b0c2103598fb66a61714152F1d1717D49FE0,1,Phishing
7,0x379ce20c018fb6301c1872c429ec7270ffa4dc5b,1,Phishing
8,0x2b065809f6ec6df32878bcd26711a0e2bcf59c26,1,Phishing
9,0x42c5459911ae51d1d005cbe39749bd8d8e533c22,1,Phishing


In [14]:
import yaml
import pandas as pd

INPUT_YAML = "./verified/src/CryptoScamDB_urls_white.yaml"
OUTPUT_CSV = "./verified/processed/CryptoScamDB_white_addresses.csv"

with open(INPUT_YAML, "r", encoding="utf-8") as f:
    data = yaml.safe_load(f)

rows = []
for item in data:
    if "addresses" in item and isinstance(item["addresses"], dict):
        eth_addrs = item["addresses"].get("ETH", [])
        for addr in eth_addrs:
            rows.append({
                "address": addr,
                "is_scam": 0,
                "description": f"{item.get('name', '')};verified"
            })

df = pd.DataFrame(rows)
df.to_csv(OUTPUT_CSV, index=False)

print(f"Saved {len(df)} rows -> {OUTPUT_CSV}")
df.head()

Saved 7 rows -> ./verified/processed/CryptoScamDB_white_addresses.csv


Unnamed: 0,address,is_scam,description
0,0x4bbeEB066eD09B7AEd07bF39EEe0460DFa261520,0,MyCrypto;verified
1,0x71c7656ec7ab88b098defb751b7401b5f6d8976f,0,Etherscan;verified
2,0xdecaf9cd2367cdbb726e904cd6397edfcae6068d,0,MyEtherWallet;verified
3,0x111111111117dc0aa78b770fa6a738034120c302,0,1Inch;verified
4,0x11111254369792b2ca5d084ab5eea397ca8fa48b,0,1Inch;verified


# ScamSniffer

https://github.com/scamsniffer/scam-database

In [19]:
import csv
import json
import re

INPUT_JSON = "./phishing/src/ScamSniffer_all.json"
OUTPUT_CSV = "./phishing/processed/ScamSniffer_scams.csv"

# Загружаем JSON
with open(INPUT_JSON, "r", encoding="utf-8") as f:
    data = json.load(f)

addresses = data.get("address", [])

# Только валидные Ethereum-адреса
eth_pattern = re.compile(r"^0x[a-fA-F0-9]{40}$")
eth_addresses = [addr for addr in addresses if isinstance(
    addr, str) and eth_pattern.match(addr)]

total = len(addresses)
valid = len(eth_addresses)

print(f"Всего записей в JSON: {total}")
print(f"Валидных Ethereum-адресов: {valid}")
print("\n=== HEAD (первые 10 строк) ===")
for addr in eth_addresses[:10]:
    print(addr, 1, "phishing")

# Запись в CSV
with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["address", "is_scam", "description"])
    for addr in eth_addresses:
        writer.writerow([addr, 1, "phishing"])

print(f"\nСохранено {valid} адресов в {OUTPUT_CSV}")

Всего записей в JSON: 4607
Валидных Ethereum-адресов: 4598

=== HEAD (первые 10 строк) ===
0x7538fd1e30d8e7771105d470fe8d65b6ab0da93f 1 phishing
0xe6b39db1dc73f0e591eaeddda1f8798dabf836d0 1 phishing
0x1a633538b169b41052bfc40b0c973ac1bff31a4e 1 phishing
0x2d12df4474f9ec9e0aee2b4748c9a6e57f5cf8d0 1 phishing
0xaca0ae45b4643847c19776e1e02612e494cb13f5 1 phishing
0x043ad0e52a41322c61fc4d880abf1f4511388aa2 1 phishing
0x5faf3e692e3a4cd3a329ca5f628f6d509e1412dc 1 phishing
0x98cbe29970b5b2d2b6924df9a243578875c1ee86 1 phishing
0x3077703aa475df9914717fe7dbabfaa9e222c716 1 phishing
0x3226f6b6220d14171cf1b664f4581ceec5708131 1 phishing

Сохранено 4598 адресов в ./phishing/processed/ScamSniffer_scams.csv


# Forta Network

https://github.com/forta-network/labelled-datasets

In [25]:
import csv

INPUT_CSV = "./phishing/src/Forta_phishing_scams.csv"
OUTPUT_CSV = "./phishing/processed/Forta_phishing.csv"

eth_addresses = []

with open(INPUT_CSV, "r", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        addr = row["address"].strip()
        tag = row["etherscan_tag"].strip() if row["etherscan_tag"] else ""
        label = row["etherscan_labels"].strip(
        ) if row["etherscan_labels"] else ""

        parts = [p for p in [tag, label] if p]
        desc = "; ".join(parts) if parts else "phishing"

        eth_addresses.append((addr, 1, desc))

with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["address", "is_scam", "description"])
    writer.writerows(eth_addresses)

print(f"Сохранено {len(eth_addresses)} адресов в {OUTPUT_CSV}")

# Читаем и выводим head
df = pd.read_csv(OUTPUT_CSV)
display(df.head(10))

Сохранено 6726 адресов в ./phishing/processed/Forta_phishing.csv


Unnamed: 0,address,is_scam,description
0,0x000000000532b45f47779fce440748893b257865,1,Fake_Phishing3901; phish-hack
1,0x00000000072d54638c2c2a3da3f715360269eea1,1,Fake_Phishing4939; phish-hack
2,0x0000000009324b6434d7766af41908e4c49ee1d7,1,Fake_Phishing3857; phish-hack
3,0x0000000086c5d614bec59dfd2c9b88f7cb57f23c,1,Fake_Phishing4604; phish-hack
4,0x00000000bf02300fd6251627aa3db8933a0eee83,1,Fake_Phishing3856; phish-hack
5,0x00000000d029a3ed50d891b9afd74b034179082e,1,Fake_Phishing6044; phish-hack
6,0x00000006e55a9364b657e3b91cd0411b4fd11ac2,1,Adidas Originals Metaverse (ADIDAS); phish-hack
7,0x00000a9F233D0b3EBBF136276165429d071D1aBf,1,Wrapped Ether (Gas Optimized) (WETH); phish-hack
8,0x00000e32e51011e28958d4696627c82c3dacd5a6,1,Fake_Phishing3541; phish-hack
9,0x000014688fd28b29b761cc1ec77b532bc923d400,1,Metaverse DAO Token (META); phish-hack


In [None]:
import re
import pandas as pd
from pathlib import Path

INPUT_CSV = "./other/src/Forta_malicious_smart_contracts.csv"
OUTPUT_CSV = "./other/processed/Forta_smart_contracts.csv"

ADDR_RE = re.compile(r"^0x[a-fA-F0-9]{40}$")

# читаем
df = pd.read_csv(INPUT_CSV, dtype=str).fillna("")

# нормализуем адреса и отфильтровываем только валидные ETH-адреса
df["contract_address"] = df["contract_address"].str.strip().str.lower()
df = df[df["contract_address"].str.match(ADDR_RE, na=False)].copy()

# соберём описание из нескольких полей


def make_desc(row):
    parts = [
        row.get("contract_tag", "").strip(),
        row.get("contract_creator_tag", "").strip(),
        row.get("contract_creator_etherscan_label", "").strip(),
        row.get("notes", "").strip(),
    ]
    # убираем пустые и дубликаты, склеиваем через '; '
    parts = [p for p in parts if p]
    # если ничего нет — поставим общий ярлык
    return "; ".join(dict.fromkeys(parts)) if parts else "phishing"


desc = df.apply(make_desc, axis=1)

# формируем итог
out = pd.DataFrame({
    "address": df["contract_address"],
    "is_scam": 1,
    "description": desc
})

# убираем дубликаты адресов (если нужно — по адресу, оставляя первое описание)
out = out.drop_duplicates(subset=["address"]).reset_index(drop=True)

# сохраняем
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUTPUT_CSV, index=False)

print(f"Сохранено {len(out)} строк в {OUTPUT_CSV}")
display(out.head(10))

Сохранено 719 строк в ./other/processed/Forta_smart_contracts.csv


Unnamed: 0,address,is_scam,description
0,0x04ae3226c80e8c04d35e6e56089345bdd06da6de,1,Multichain Exploiter 10; exploit
1,0xc5ac25cfc2b8284e84ca47dad21cf1319f732c11,1,heist
2,0x79dbe9bbde91a35fa8148a14084979a531fe57ea,1,heist
3,0x3b1ea5b11d12452693f9bd290ac2100394e6850f,1,heist
4,0x682dcf2f4a6e46c222927a54529b4965fb313bf2,1,heist
5,0xbc48cd3265fd6d6cd413cd3e7082c27993baf8b2,1,heist
6,0x62494b3ed9663334e57f23532155ea0575c487c5,1,heist
7,0x164c2b90f83b67d897ff00899695430841e38536,1,MultiSig Exploit 4
8,0x1bcf9edb72f7650dfcdc59ae3b8a73d35a2f2902,1,ChainPort Exploiter 2; heist
9,0x259a2795624b8a17bc7eb312a94504ad0f615d1e,1,exploit


In [31]:
import re
import pandas as pd
from pathlib import Path

INPUT_CSV = "./other/src/Forta_etherscan_malicious_labels.csv"
OUTPUT_CSV = "./other/processed/Forta_scam_labels.csv"

ADDR_RE = re.compile(r"^0x[a-fA-F0-9]{40}$")

# читаем исходник
df = pd.read_csv(INPUT_CSV, dtype=str).fillna("")

# нормализуем адреса
df["banned_address"] = df["banned_address"].str.strip().str.lower()

# фильтруем валидные eth-адреса
df = df[df["banned_address"].str.match(ADDR_RE, na=False)].copy()

# description = только wallet_tag (если пусто — phishing)
df["description"] = df["wallet_tag"].str.strip().replace("", "phishing")

# формируем итоговую таблицу
out = (
    df.rename(columns={"banned_address": "address"})[
        ["address", "description"]]
    .drop_duplicates(subset=["address"])
    .assign(is_scam=1)[["address", "is_scam", "description"]]
    .reset_index(drop=True)
)

# сохраняем
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUTPUT_CSV, index=False)

print(f"Сохранено {len(out)} строк в {OUTPUT_CSV}")
out.head(10)

Сохранено 7259 строк в ./other/processed/Forta_scam_labels.csv


Unnamed: 0,address,is_scam,description
0,0x80d7bb18521acbef32d7906502ebe94928690e93,1,Fake_Phishing1772
1,0x80e02ae8c5d5e482558caf32004e7d6281445f28,1,Fake_Phishing1863
2,0x80e23a71343df6320824788b0466d05fcd679da4,1,Fake_Phishing3640
3,0x80e313e41301312cbcda2fd38aa6142af637f8cb,1,Fake_Phishing4450
4,0x80e8374a89c514340a7886defabfc338dff34d86,1,Fake_Phishing4567
5,0x80f43e67169c917c68391f823fcff41b9b786d69,1,Fake_Phishing3032
6,0x80ff1325226351c65c15b6eaa40c46122203a28f,1,Fake_Phishing970
7,0x81053c561fd1e5305d05c651fcc2ae7c4ae2cde6,1,Fake_Phishing1297
8,0x8105ed8a3be363e32c91a0904cd59e0f843d7123,1,Fake_Phishing3993
9,0x8106d6eafdf51f5148026fea6c26be371bde5e52,1,Fake_Phishing5225


# Kaggle Hamish Hall
https://www.kaggle.com/datasets/hamishhall/labelled-ethereum-addresses?select=eth_addresses.csv

In [33]:
import re
import pandas as pd
from pathlib import Path

INPUT_CSV = "./other/src/Hamish_Hall_eth_addresses.csv"
OUTPUT_CSV = "./other/processed/Hamish_Hall_legit_and_dodgy.csv"

ADDR_RE = re.compile(r"^0x[a-fA-F0-9]{40}$")

# читаем исходник
df = pd.read_csv(INPUT_CSV, dtype=str).fillna("")

# нормализуем адреса
df["Address"] = df["Address"].str.strip().str.lower()

# фильтруем валидные eth-адреса
df = df[df["Address"].str.match(ADDR_RE, na=False)].copy()

# формируем описание: Name + Tags (если нужны только имена — убрать tags)


def make_desc(row):
    parts = [row.get("Name", "").strip()]
    # добавим Tags, если они есть
    tags = [str(t).strip() for t in row[6:] if str(t).strip()]
    if tags:
        parts.extend(tags)
    return "; ".join([p for p in parts if p])


df["description"] = df.apply(make_desc, axis=1)

# определяем is_scam
df["is_scam"] = df["Label"].str.strip().str.lower().map({
    "legit": 0,
    "dodgy": 1
}).fillna(1).astype(int)

# итоговая таблица
out = (
    df.rename(columns={"Address": "address"})[
        ["address", "is_scam", "description"]]
    .drop_duplicates(subset=["address"])
    .reset_index(drop=True)
)

# сохраняем
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUTPUT_CSV, index=False)

print(f"Сохранено {len(out)} строк в {OUTPUT_CSV}")
out.head(10)

Сохранено 19111 строк в ./other/processed/Hamish_Hall_legit_and_dodgy.csv


Unnamed: 0,address,is_scam,description
0,0x8ab7404063ec4dbcfd4598215992dc3f8ec853d7,0,Akropolis (AKRO); DeFi; Token Contract
1,0x1c74cff0376fb4031cd7492cd6db2d66c3f2c6b9,0,bZx Protocol Token (BZRX); Token Contract; bZx...
2,0x06af07097c9eeb7fd685c692751d5c66db49c215,0,Chai (CHAI); DeFi; Token Contract; Verified Co...
3,0xc00e94cb662c3520282e6f5717214004a7f26888,0,Compound (COMP); DAO; Compound; Token Contract
4,0xb3319f5d18bc0d84dd1b4825dcde5d5f7266d407,0,Compound 0x (cZRX); Token Contract; DeFi; Comp...
5,0x158079ee67fce2f58472a96584a73c7ab9ac95c1,0,Compound Augur (cREP); DeFi; Compound; Token C...
6,0x6c8c6b02e7b2be14d4fa6022dfd6d75921d90e4e,0,Compound Basic Attention Token (cBAT); DeFi; C...
7,0x5d3a536e4d6dbd6114cc1ead35777bab948e3643,0,Compound Dai (cDAI); DeFi; Compound; Token Con...
8,0x4ddc2d193948926d02f9b1fe9e1daa0718270ed5,0,Compound Ether (cETH); DeFi; Compound; Token C...
9,0xf5dce57282a584d2746faf1593d3121fcac444dc,0,Compound Sai (cSAI); DeFi; Compound; Token Con...


# Kaggle Vagifa
https://www.kaggle.com/datasets/vagifa/ethereum-frauddetection-dataset/data

In [1]:
import re
import pandas as pd
from pathlib import Path

INPUT_CSV = "./other/src/vagifa_transaction_dataset.csv"
OUTPUT_CSV = "./other/processed/vagifa_transactions.csv"

ADDR_RE = re.compile(r"^0x[a-fA-F0-9]{40}$")

# читаем как строки (в файле много лишних колонок)
df = pd.read_csv(INPUT_CSV, dtype=str).fillna("")

# нормализуем адреса
df["Address"] = df["Address"].str.strip().str.lower()

# фильтруем валидные ETH-адреса
df = df[df["Address"].str.match(ADDR_RE, na=False)].copy()

# is_scam из FLAG (1 -> 1, всё остальное -> 0)
df["is_scam"] = (df["FLAG"].astype(str).str.strip() == "1").astype(int)

# description = NULL (пустая ячейка)
df["description"] = None

# итоговая таблица и удаление дублей по адресу (оставляем первый встретившийся is_scam)
out = (
    df[["Address", "is_scam", "description"]]
    .drop_duplicates(subset=["Address"])
    .rename(columns={"Address": "address"})
    .reset_index(drop=True)
)

# сохраняем
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUTPUT_CSV, index=False)

print(f"Сохранено {len(out)} строк в {OUTPUT_CSV}")
out.head(10)

Сохранено 9811 строк в ./other/processed/vagifa_transactions.csv


Unnamed: 0,address,is_scam,description
0,0x00009277775ac7d0d59eaad8fee3d10ac6c805e8,0,
1,0x0002b44ddb1476db43c868bd494422ee4c136fed,0,
2,0x0002bda54cb772d040f779e88eb453cac0daa244,0,
3,0x00038e6ba2fd5c09aedb96697c8d7b8fa6632e5e,0,
4,0x00062d1dd1afb6fb02540ddad9cdebfe568e0d89,0,
5,0x000895ad78f4403ecd9468900e68d6ee506136fd,0,
6,0x000d63fc5df52b0204374c2f5a3249779805d5d1,0,
7,0x000e001ab444fa8d6dc4a402f8d7cfc88fe8c64d,0,
8,0x0012cb699c836049a4bbeaac2d8c4d47c688e0e4,0,
9,0x0012f247c9f980eea0a9ad06893bfd95c3145794,0,


# surajsjain github

https://github.com/surajsjain/ethereum-fraud-datasets/tree/main

In [2]:
import re
import pandas as pd
from pathlib import Path

# === CONFIG ===
INPUT_CSV = "./other/src/surajsjain_All_contract_reports.csv"
OUTPUT_CSV = "./other/processed/surajsjain_All_contract_reports.csv"

ADDR_RE = re.compile(r"^0x[a-fA-F0-9]{40}$")

# читаем CSV как строки
df = pd.read_csv(INPUT_CSV, dtype=str).fillna("")

# нормализуем адрес
df["address"] = df["address"].str.strip().str.lower()

# фильтруем валидные ETH-адреса
df = df[df["address"].str.match(ADDR_RE, na=False)].copy()

# description = только name_tag (если пусто — None)
df["description"] = df["name_tag"].str.strip()
df["description"] = df["description"].replace({"": None})

# все такие адреса считаем scam
df["is_scam"] = 1

# финальный набор
out = (
    df[["address", "is_scam", "description"]]
    .drop_duplicates(subset=["address"])
    .reset_index(drop=True)
)

# сохраняем
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUTPUT_CSV, index=False)

print(f"Сохранено {len(out)} строк в {OUTPUT_CSV}")
out.head(10)

Сохранено 631 строк в ./other/processed/surajsjain_All_contract_reports.csv


Unnamed: 0,address,is_scam,description
0,0x4639cd8cd52ec1cf2e496a606ce28d8afb1c792f,1,CBDAO: BREE Token
1,0x64f0d720ce8b97ba44cd002005d2dfa3186c0580,1,Exit Scam YYFI Token
2,0x57b818a1070373e21fcedf48d4368e1703c75852,1,Fake: Hero Token Sale 2
3,0x6eff949d9a4e7725d3ee3b894ceaa2813b7437e5,1,Fake: MARL Token
4,0x3018a3eab048a0a1c15003b383a181f089a039cc,1,FAKE: Status.im 3
5,0x2dc5616ec2b9d24906f0e11dc8d8a736392dfedd,1,Fake_Phishing1047
6,0xcc7ecc050aab5843c79f14933e207f032bc16b78,1,Fake_Phishing1059
7,0x34bb6cf517abb83bfd8bc4e41ed899810ac49b58,1,Fake_Phishing1064
8,0xe87d279d090df4ba19c0e19958d9ac61be5c6909,1,Fake_Phishing1081
9,0x8a0e85901849b40a7f80399a00f7115193c0bdb2,1,Fake_Phishing1112


In [3]:
import re
import pandas as pd
from pathlib import Path

INPUT_CSV = "./other/src/surajsjain_Eth_labeled_addresses.csv"
OUTPUT_CSV = "./other/processed/surajsjain_labeled_addresses.csv"

ADDR_RE = re.compile(r"^0x[a-fA-F0-9]{40}$")

# читаем CSV как строки
df = pd.read_csv(INPUT_CSV, dtype=str).fillna("")

# нормализуем адрес
df["Address"] = df["Address"].str.strip().str.lower()

# фильтруем валидные ETH-адреса
df = df[df["Address"].str.match(ADDR_RE, na=False)].copy()

# Определяем is_scam
df["is_scam"] = df["Label"].str.strip().str.lower().eq("dodgy").astype(int)

# Собираем description = Name + теги (из всех столбцов после "Tags")
tag_cols = [c for c in df.columns if c.startswith(
    "Tags") or c == "Tags" or c == ""]  # на случай пустых заголовков


def make_desc(row):
    parts = [row.get("Name", "").strip()]
    for col in tag_cols:
        val = str(row.get(col, "")).strip()
        if val:
            parts.append(val)
    parts = [p for p in parts if p]
    return "; ".join(parts) if parts else None


df["description"] = df.apply(make_desc, axis=1)

# финальный набор
out = (
    df.rename(columns={"Address": "address"})[
        ["address", "is_scam", "description"]]
    .drop_duplicates(subset=["address"])
    .reset_index(drop=True)
)

# сохраняем
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUTPUT_CSV, index=False)

print(f"Сохранено {len(out)} строк в {OUTPUT_CSV}")
out.head(10)

Сохранено 19111 строк в ./other/processed/surajsjain_labeled_addresses.csv


Unnamed: 0,address,is_scam,description
0,0x8ab7404063ec4dbcfd4598215992dc3f8ec853d7,0,Akropolis (AKRO); DeFi
1,0x1c74cff0376fb4031cd7492cd6db2d66c3f2c6b9,0,bZx Protocol Token (BZRX); Token Contract
2,0x06af07097c9eeb7fd685c692751d5c66db49c215,0,Chai (CHAI); DeFi
3,0xc00e94cb662c3520282e6f5717214004a7f26888,0,Compound (COMP); DAO
4,0xb3319f5d18bc0d84dd1b4825dcde5d5f7266d407,0,Compound 0x (cZRX); Token Contract
5,0x158079ee67fce2f58472a96584a73c7ab9ac95c1,0,Compound Augur (cREP); DeFi
6,0x6c8c6b02e7b2be14d4fa6022dfd6d75921d90e4e,0,Compound Basic Attention Token (cBAT); DeFi
7,0x5d3a536e4d6dbd6114cc1ead35777bab948e3643,0,Compound Dai (cDAI); DeFi
8,0x4ddc2d193948926d02f9b1fe9e1daa0718270ed5,0,Compound Ether (cETH); DeFi
9,0xf5dce57282a584d2746faf1593d3121fcac444dc,0,Compound Sai (cSAI); DeFi


exchanges

In [7]:
import pandas as pd
import re
from pathlib import Path

INPUT_CSV = "./verified/src/surajsjain_Ethereum_exchanges.csv"
OUTPUT_CSV = "./verified/processed/surajsjain_exchanges.csv"

ADDR_RE = re.compile(r"^0x[a-fA-F0-9]{40}$")

# читаем
df = pd.read_csv(INPUT_CSV, dtype=str).fillna("")

# нормализуем адрес
df["Address"] = df["Address"].str.strip().str.lower()

# фильтруем валидные ETH-адреса
df = df[df["Address"].str.match(ADDR_RE, na=False)].copy()

# для всех бирж is_scam = 0
df["is_scam"] = 0

# description = Name Tag + "; exchanges"
df["description"] = df["Name Tag"].str.strip()
df["description"] = df["description"].replace("", None)
df["description"] = df["description"].apply(
    lambda x: f"{x}; exchanges" if pd.notna(x) else "exchanges")

# итоговый набор
out = df.rename(columns={"Address": "address"})[
    ["address", "is_scam", "description"]]

# сохраняем
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUTPUT_CSV, index=False)

print(f"Сохранено {len(out)} адресов в {OUTPUT_CSV}")
out.head()

Сохранено 175 адресов в ./verified/processed/surajsjain_exchanges.csv


Unnamed: 0,address,is_scam,description
0,0x05f51aab068caa6ab7eeb672f88c180f67f17ec7,0,ABCC; exchanges
1,0x4df5f3610e2471095a130d7d934d551f3dde01ed,0,ATAIX; exchanges
2,0xadb72986ead16bdbc99208086bd431c1aa38938e,0,Beaxy; exchanges
3,0x7a10ec7d68a048bdae36a70e93532d31423170fa,0,Bgogo 1; exchanges
4,0xce1bf8e51f8b39e51c6184e059786d1c0eaf360f,0,Bgogo 2; exchanges


In [None]:
import pandas as pd
from pathlib import Path

INPUT_CSV = "./other/src/surajsjain_Etherscan_account_reports.csv"
OUTPUT_CSV = "./other/processed/surajsjain_account_reports.csv"

df = pd.read_csv(INPUT_CSV, dtype=str).fillna("")

# нормализуем адрес
df["address"] = df["address"].str.strip().str.lower()

# ставим флаг скама
df["is_scam"] = 1

# description: если name_tag есть — добавляем "; etherscan reported"
# если пусто — null


def make_desc(name_tag):
    if name_tag.strip():
        return f"{name_tag.strip()}; etherscan reported"
    else:
        None


df["description"] = df["name_tag"].apply(make_desc)

# финальный датафрейм
out = df[["address", "is_scam", "description"]]

# сохраняем
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUTPUT_CSV, index=False)

print(f"Сохранено {len(out)} адресов в {OUTPUT_CSV}")
display(out.head())

Сохранено 5905 адресов в ./other/processed/surajsjain_account_reports.csv


Unnamed: 0,address,is_scam,description
0,0x26a40e8dbdb0dee17d7036fcc0a2ae3fecf4800d,1,
1,0x9f26ae5cd245bfeeb5926d61497550f79d9c6c1c,1,Akropolis Hacker 1; etherscan reported
2,0x2073d860b9b9e19f4d20eb0fe741fb5fe7fe6bb0,1,Artist Impersonator 14; etherscan reported
3,0xbceaa0040764009fdcff407e82ad1f06465fd2c4,1,Bancor Hacker; etherscan reported
4,0x03b70dc31abf9cf6c1cf80bfeeb322e8d3dbb4ca,1,Browser Extension Hack; etherscan reported


In [12]:
import pandas as pd
import re
from pathlib import Path

INPUT_CSV = "./other/src/surajsjain_Etherscan_scam_token_reports.csv"
OUTPUT_CSV = "./other/processed/surajsjain_scam_tokens.csv"

ADDR_RE = re.compile(r"^0x[a-fA-F0-9]{40}$")

df = pd.read_csv(INPUT_CSV, dtype=str).fillna("")

# нормализуем адрес
df["address"] = df["address"].str.strip().str.lower()

# фильтруем только валидные eth-адреса
df = df[df["address"].str.match(ADDR_RE, na=False)].copy()

# формируем description (только token_name или None)
df["description"] = df["token_name"].apply(
    lambda x: x.strip() if x.strip() else None)

# итоговая таблица
out = (
    df[["address", "description"]]
    .drop_duplicates(subset=["address"])
    .assign(is_scam=1)[["address", "is_scam", "description"]]
    .reset_index(drop=True)
)

# сохраняем
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUTPUT_CSV, index=False)

print(f"Сохранено {len(out)} строк в {OUTPUT_CSV}")
print(out.head(10))

Сохранено 384 строк в ./other/processed/surajsjain_scam_tokens.csv
                                      address  is_scam  \
0  0xdb3e6295067be156219fd790d4b9c326f95368f1        1   
1  0x01012022d43a3f85196a6bea96dfdb7350fdaa3c        1   
2  0x2c9743a7d4b8d39884e8e919372157711d54cce1        1   
3  0x967582debedf069111973313e29bc7c5f713b80a        1   
4  0x7fb83740362bdf16efb6abe05348ad419270f7d3        1   
5  0x120aa018634f555484c088c8da80f75aa07e004f        1   
6  0x7cd0341d749d9edc352fa9899d0a8e07a955038f        1   
7  0x8202be32b8e045ce97418b0dbe77a7a1398dbe51        1   
8  0xd88e87cd53d5b3c88c07bdb4715a39b75d6e7870        1   
9  0x5c63396c9023bfe34c1cb97c55a07b73ce7a10cb        1   

                     description  
0               FDIC Coin (FDIC)  
1          Happy New Year (2022)  
2                  APPLE (APPLE)  
3            Uniswap V2 (UNI-V2)  
4           Wrapped Ether (WETH)  
5            aWETH.io (aWETH.io)  
6                     Spam Token  
7             

# Kaggle polarwolf

https://www.kaggle.com/datasets/polarwolf/ponzi-scheme-contracts-on-ethereum

In [13]:
import pandas as pd
import re
from pathlib import Path

INPUT_CSV = "./Ponzi/src/polarwolf_Ponzi_contracts.csv"
OUTPUT_CSV = "./Ponzi/processed/polarwolf_Ponzi_labels.csv"

ADDR_RE = re.compile(r"^0x[a-fA-F0-9]{40}$")

df = pd.read_csv(INPUT_CSV, dtype=str).fillna("")

# нормализуем адрес
df["address"] = df["address"].str.strip().str.lower()

# фильтруем только валидные eth-адреса
df = df[df["address"].str.match(ADDR_RE, na=False)].copy()

# формируем description и is_scam
df["description"] = df["label"].apply(
    lambda x: "ponzi" if x.strip() == "1" else None)
df["is_scam"] = df["label"].apply(lambda x: 1 if x.strip() == "1" else 0)

# итоговая таблица
out = df[["address", "is_scam", "description"]].drop_duplicates(
    subset=["address"]).reset_index(drop=True)

# сохраняем
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUTPUT_CSV, index=False)

print(f"Сохранено {len(out)} строк в {OUTPUT_CSV}")
print(out.head(10))

Сохранено 3786 строк в ./Ponzi/processed/polarwolf_Ponzi_labels.csv
                                      address  is_scam description
0  0x582b2489710a4189ad558b6958641789587fcc27        1       ponzi
1  0xeb4245c88c660ae4ee23c76954e5490ccd7bbd82        1       ponzi
2  0xd92d62ce8504e5c61aa17d9a9b13c65dbd77c268        1       ponzi
3  0xc352add7ad8cac8baa839d8c88e7e9d7df9a219b        1       ponzi
4  0x3f4dd010fbbc9a9b6d95f1f53837d7e9f3befac8        1       ponzi
5  0xf9533353c20495527e0499ac71e1507b418b9314        1       ponzi
6  0xe8b55deaced913c5c6890331d2926ea0fcbe59ac        1       ponzi
7  0x879716da78a75a44bdfa8f038ce875f99586940a        1       ponzi
8  0x6203188c0dd1a4607614dbc8af409e91ed46def0        1       ponzi
9  0x4e1833a4a67ed1c8cb0ffc541ab7291c02d2fd06        1       ponzi


# Github eltontay
https://github.com/eltontay/Ethereum-Fraud-Detection

In [19]:
import re
import pandas as pd
from pathlib import Path

INPUT_CSV = "./other/src/eltontay_address_data_combined.csv"
OUTPUT_CSV = "./other/processed/eltontay_address.csv"

ADDR_RE = re.compile(r"^0x[a-fA-F0-9]{40}$")

# Читаем только нужные колонки (остальные там с запятыми в заголовках)
df = pd.read_csv(INPUT_CSV, dtype=str, usecols=["Address", "FLAG"]).fillna("")

# Нормализуем адреса
df["Address"] = df["Address"].str.strip().str.lower()

# Оставляем только валидные eth-адреса
df = df[df["Address"].str.match(ADDR_RE, na=False)].copy()

# Флаг скама: 1 -> 1, иначе 0
df["is_scam"] = (df["FLAG"].astype(str).str.strip() == "1").astype(int)

# description всегда null (пустая ячейка)
df["description"] = None

# Финальная таблица
out = (
    df.rename(columns={"Address": "address"})[
        ["address", "is_scam", "description"]]
    .drop_duplicates(subset=["address"])
    .reset_index(drop=True)
)

# Сохраняем
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUTPUT_CSV, index=False)

print(f"Сохранено {len(out)} строк в {OUTPUT_CSV}")
out.head(10)

Сохранено 13915 строк в ./other/processed/eltontay_address.csv


Unnamed: 0,address,is_scam,description
0,0x87d884aaa6ff9e9b6014631b0abae80b53953fb8,1,
1,0xd42393df90d582bd8a5493171f0173e3a017d391,1,
2,0x3025c36d8a9620d3df89e9e9b1acbdfd639a6f37,1,
3,0x6309f709faad518fc158af4c14edfa7b06424770,1,
4,0x3d020954e30c3d40b7f0c533cf198bc10dd45a49,1,
5,0xb300f2a0fa449b97a1069ea7ad654aca486f64b5,1,
6,0x83915e4ff807ba28b53931f923247c9fa0147eb9,1,
7,0x0f38daecb3fb7b87a8d3ed168822c1bc53e8202c,1,
8,0xcc320aa1fe572798a1f900cfa0df6524b04c9624,1,
9,0xe7860fd151cbbad28141dfd4cc7cd6d090e4ead8,1,


etherscan_parser
---------------------

This module implements a simple parser for Etherscan label data.  The goal of
the parser is to classify Ethereum addresses as either ``Scam`` or ``Legit`` and
export the results into a CSV file.  Since Etherscan restricts programmatic
access to some of its web pages behind Cloudflare, this parser operates on
publicly available label dumps hosted on GitHub¹.  Those dumps contain the
address, an optional human‑readable name and a list of label tags assigned by
Etherscan.  Unfortunately, there is no timestamp in the dumps, so we cannot
filter addresses by their creation or last‑update time.  If you need to limit
results to a certain date range, you must obtain that information from
Etherscan’s Metadata API (an enterprise offering) or scrape the live site
directly.

The classification logic below is heuristic.  It checks the name and all
associated label tags for keywords commonly associated with malicious activity
(``phish``, ``scam``, ``rugpull``, etc.).  Any match marks the address as a
``Scam``; otherwise the address is labelled ``Legit``.  You can adjust
``SCAM_KEYWORDS`` to refine this behaviour.

Usage:
    python etherscan_parser.py \
        --account-json-url <URL to combinedAccountLabels.json> \
        --token-json-url <URL to combinedTokenLabels.json> \
        --output <path to CSV file>

If you omit the URLs, the script falls back to the default dumps maintained
in the brianleect/etherscan-labels repository.  You can also point the
arguments at local files using ``file://`` URLs.  Note that downloading
remote files via HTTPS requires an environment that allows outbound web
connections.

The resulting CSV will contain the following columns:

    Address, Name, Account Type, Contract Type, Entity, Label, Tag1,
    Tag2, Tag3, Tag4, Tag5

``Tag1`` through ``Tag5`` contain up to five individual tags extracted from
the Etherscan label list.  If fewer tags are present, the remaining fields
will be empty.  ``Account Type`` is set to ``Smart Contract`` for token
addresses and ``Wallet`` for everything else.  ``Contract Type`` is ``Token``
for token addresses and empty for accounts.

¹ See: https://github.com/brianleect/etherscan-labels/tree/main/data/etherscan/combined

In [24]:
# Etherscan labels → CSV (dodgy/legit) — ноутбук-версия одной ячейкой

from __future__ import annotations
import urllib.request

import csv
import json
import logging
from pathlib import Path
from typing import Dict, Iterable, List, Tuple

import pandas as pd

# === ПАРАМЕТРЫ ===
ACCOUNT_JSON_URL = "https://raw.githubusercontent.com/brianleect/etherscan-labels/main/data/etherscan/combined/combinedAccountLabels.json"
TOKEN_JSON_URL = "https://raw.githubusercontent.com/brianleect/etherscan-labels/main/data/etherscan/combined/combinedTokenLabels.json"
OUTPUT_CSV_PATH = Path("./etherscan/src/etherscan_labels.csv")  # основной CSV
VERBOSE = True

# === ИМПОРТ HTTP-КЛИЕНТА (requests опционально) ===
try:
    import requests  # type: ignore
except Exception:
    requests = None  # type: ignore

# === КЛЮЧЕВЫЕ СЛОВА ДЛЯ ОБНАРУЖЕНИЯ DODGY ===
DODGY_KEYWORDS: Tuple[str, ...] = (
    "scam", "phish", "hack", "exploit", "rugpull", "rug pull", "fake", "fraud", "ponzi",
    "dusting", "malware", "hacker", "take-action", "hack alert"
)


def _download_json(source: str) -> Dict[str, Dict[str, List[str]]]:
    if source.startswith("file://"):
        path = Path(source[len("file://"):])
        with path.open("r", encoding="utf-8") as f:
            return json.load(f)
    if requests is not None:
        resp = requests.get(source, timeout=60)
        resp.raise_for_status()
        return resp.json()
    with urllib.request.urlopen(source, timeout=60) as r:
        return json.loads(r.read())


def _classify_address(name: str, labels: Iterable[str]) -> str:
    """Вернёт 'Dodgy' если совпадает с ключевыми словами, иначе 'Legit'."""
    name_lower = (name or "").lower()
    for kw in DODGY_KEYWORDS:
        if kw in name_lower:
            return "Dodgy"
    for lbl in labels:
        if any(kw in (lbl or "").lower() for kw in DODGY_KEYWORDS):
            return "Dodgy"
    return "Legit"


def _row_from_entry(address: str, entry: Dict[str, object], is_token: bool) -> List[str]:
    name: str = entry.get("name", "") or ""
    labels: List[str] = list(entry.get("labels", []))
    classification = _classify_address(name, labels)
    account_type = "Smart Contract" if is_token else "Wallet"
    contract_type = "Token" if is_token else ""
    entity = labels[0] if labels else ""
    tag_fields = [(labels[i] if i < len(labels) else "") for i in range(5)]
    return [address, name, account_type, contract_type, entity, classification] + tag_fields


def parse_etherscan_labels(account_json_url: str, token_json_url: str, output_csv_path: Path) -> None:
    if VERBOSE:
        logging.basicConfig(level=logging.INFO,
                            format="%(levelname)s: %(message)s")
        logging.info("Downloading account JSON: %s", account_json_url)
    account_data = _download_json(account_json_url)
    if VERBOSE:
        logging.info("Downloading token JSON:   %s", token_json_url)
    token_data = _download_json(token_json_url)

    combined: Dict[str, Tuple[Dict[str, object], bool]] = {}
    for addr, meta in account_data.items():
        combined[addr.lower()] = (meta, False)
    for addr, meta in token_data.items():
        combined[addr.lower()] = (meta, True)

    header = ["Address", "Name", "Account Type", "Contract Type",
              "Entity", "Label", "Tag1", "Tag2", "Tag3", "Tag4", "Tag5"]

    base = output_csv_path
    # legit_path = base.with_name(base.stem + "_legit" + base.suffix)
    # dodgy_path = base.with_name(base.stem + "_dodgy" + base.suffix)

    if VERBOSE:
        logging.info("Writing CSV: %s", base)

    with base.open("w", encoding="utf-8", newline="") as f_all:
        w_all = csv.writer(f_all)
        w_all.writerow(header)

        legit_cnt = dodgy_cnt = 0
        for address, (meta, is_token) in combined.items():
            row = _row_from_entry(address, meta, is_token)
            w_all.writerow(row)
            if row[5] == "Legit":
                legit_cnt += 1
            else:
                dodgy_cnt += 1

    if VERBOSE:
        logging.info("Done. Total: %d", len(combined))
        logging.info("Legit: %d | Dodgy: %d", legit_cnt, dodgy_cnt)


# === ЗАПУСК ===
parse_etherscan_labels(ACCOUNT_JSON_URL, TOKEN_JSON_URL, OUTPUT_CSV_PATH)

# === ПРЕВЬЮ ПЕРВЫХ 5 СТРОК ===
df_preview = pd.read_csv(OUTPUT_CSV_PATH)
df_preview.head()

INFO: Downloading account JSON: https://raw.githubusercontent.com/brianleect/etherscan-labels/main/data/etherscan/combined/combinedAccountLabels.json
INFO: Downloading token JSON:   https://raw.githubusercontent.com/brianleect/etherscan-labels/main/data/etherscan/combined/combinedTokenLabels.json
INFO: Writing CSV: etherscan\src\etherscan_labels.csv
INFO: Done. Total: 29772
INFO: Legit: 26637 | Dodgy: 3135


Unnamed: 0,Address,Name,Account Type,Contract Type,Entity,Label,Tag1,Tag2,Tag3,Tag4,Tag5
0,0x0e8ba001a821f3ce0734763d008c9d7c957f5852,AmadeusRelay,Wallet,,0x-protocol-ecosystem,Legit,0x-protocol-ecosystem,dex,,,
1,0xc898fbee1cc94c0ff077faa5449915a506eff384,Bamboo Relay,Wallet,,0x-protocol-ecosystem,Legit,0x-protocol-ecosystem,dex,,,
2,0x58a5959a6c528c5d5e03f7b9e5102350e24005f1,ERC dEX,Wallet,,0x-protocol-ecosystem,Legit,0x-protocol-ecosystem,dex,,,
3,0x2cc42d1cd65af27cc999e41ef93d1a763dc821f8,IDT Exchange,Wallet,,0x-protocol-ecosystem,Legit,0x-protocol-ecosystem,dex,,,
4,0x4524baa98f9a3b9dec57caae7633936ef96bd708,LedgerDex,Wallet,,0x-protocol-ecosystem,Legit,0x-protocol-ecosystem,dex,,,


In [25]:
import re
import pandas as pd
from pathlib import Path

INPUT_CSV = "./etherscan/src/etherscan_labels.csv"
OUTPUT_CSV = "./etherscan/processed/etherscan_labels_processed.csv"

ADDR_RE = re.compile(r"^0x[a-fA-F0-9]{40}$")

# читаем как строки
df = pd.read_csv(INPUT_CSV, dtype=str).fillna("")
df.columns = df.columns.str.strip()

# нормализуем адрес
df["Address"] = df["Address"].str.strip().str.lower()

# оставляем только валидные адреса
df = df[df["Address"].str.match(ADDR_RE, na=False)].copy()

# is_scam по Label
df["is_scam"] = df["Label"].str.strip().str.lower().eq("dodgy").astype(int)

# соберём description = Name + Tag1..Tag5 (через "; "), пустые — игнор
tag_cols = [c for c in df.columns if c.lower().startswith("tag")]


def make_desc(row):
    parts = [row.get("Name", "").strip()]
    parts += [str(row.get(c, "")).strip() for c in tag_cols]
    parts = [p for p in parts if p]
    return "; ".join(parts) if parts else None


df["description"] = df.apply(make_desc, axis=1)

# финальная таблица
out = (
    df.rename(columns={"Address": "address"})[
        ["address", "is_scam", "description"]]
    .drop_duplicates(subset=["address"])
    .reset_index(drop=True)
)

# сохраняем и показываем head
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUTPUT_CSV, index=False)

print(f"Сохранено {len(out)} строк в {OUTPUT_CSV}")
out.head(10)

Сохранено 29772 строк в ./etherscan/processed/etherscan_labels_processed.csv


Unnamed: 0,address,is_scam,description
0,0x0e8ba001a821f3ce0734763d008c9d7c957f5852,0,AmadeusRelay; 0x-protocol-ecosystem; dex
1,0xc898fbee1cc94c0ff077faa5449915a506eff384,0,Bamboo Relay; 0x-protocol-ecosystem; dex
2,0x58a5959a6c528c5d5e03f7b9e5102350e24005f1,0,ERC dEX; 0x-protocol-ecosystem; dex
3,0x2cc42d1cd65af27cc999e41ef93d1a763dc821f8,0,IDT Exchange; 0x-protocol-ecosystem; dex
4,0x4524baa98f9a3b9dec57caae7633936ef96bd708,0,LedgerDex; 0x-protocol-ecosystem; dex
5,0xc22d5b2951db72b44cfb8089bb8cd374a3c354ea,0,OpenRelay; 0x-protocol-ecosystem; dex
6,0xd2045edc40199019e221d71c0913343f7908d0d5,0,Paradex; 0x-protocol-ecosystem; dex
7,0xa258b39954cef5cb142fd567a46cddb31a670124,0,Radar Relay; 0x-protocol-ecosystem; dex
8,0x55890b06f0877a01bb5349d93b202961f8e27a9b,0,Shark Relay; 0x-protocol-ecosystem; dex
9,0x0681e844593a051e2882ec897ecd5444efe19ff2,0,Star Bit Ex; 0x-protocol-ecosystem; dex


# Merged datasets

In [26]:
import re
import os
import glob
import pandas as pd
from pathlib import Path

# === CONFIG ===
ROOT_DIR = "./"   # корень, где лежат твои папки с подкаталогами processed/
OUTPUT_CSV = "./aggregated/addresses_merged.csv"

ADDR_RE = re.compile(r"^0x[a-fA-F0-9]{40}$")


def normalize_desc(x: str):
    if x is None:
        return None
    s = str(x).strip()
    if not s:
        return None
    if s.lower() in {"null", "none", "nan"}:
        return None
    return s


# найдём все processed/*.csv
pattern = os.path.join(ROOT_DIR, "**", "processed", "*.csv")
files = sorted(glob.glob(pattern, recursive=True))

if not files:
    raise FileNotFoundError(f"Не нашёл ни одного CSV по маске: {pattern}")

rows = []
order = 0
for path in files:
    try:
        df = pd.read_csv(path, dtype=str).fillna("")
    except Exception as e:
        print(f"⚠️ Пропускаю {path}: ошибка чтения -> {e}")
        continue

    # минимальная валидация колонок
    required = {"address", "is_scam", "description"}
    if not required.issubset(set(df.columns)):
        print(f"⚠️ Пропускаю {path}: нет нужных колонок {required}")
        continue

    df = df[["address", "is_scam", "description"]].copy()

    # нормализуем
    df["address"] = df["address"].str.strip().str.lower()
    df = df[df["address"].str.match(ADDR_RE, na=False)]

    # приведение типов
    df["is_scam"] = pd.to_numeric(
        df["is_scam"], errors="coerce").fillna(0).astype(int)
    df["description"] = df["description"].apply(normalize_desc)

    # порядок источника для устойчивого выбора "первой" записи
    df["__src_order"] = order
    order += 1

    rows.append(df)

# склеиваем
big = pd.concat(rows, ignore_index=True)
print(f"Считано {len(files)} файлов, всего строк: {len(big)}")

# признак "есть описание"
big["__has_desc"] = big["description"].notna() & (
    big["description"].astype(str).str.len() > 0)

# сортируем по приоритетам:
#  - address ASC
#  - is_scam DESC  (1 важнее 0)
#  - __has_desc DESC (с описанием важнее)
#  - __src_order ASC (что встретилось раньше — важнее)
big_sorted = big.sort_values(
    by=["address", "is_scam", "__has_desc", "__src_order"],
    ascending=[True, False, False, True],
    kind="mergesort",  # стабильная сортировка
)

# берём первую запись на каждый адрес по приоритетам выше
merged = big_sorted.drop_duplicates(subset=["address"], keep="first").drop(
    columns=["__src_order", "__has_desc"])

# сохраняем
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
merged.to_csv(OUTPUT_CSV, index=False)

print(f"Итог: {len(merged)} уникальных адресов → {OUTPUT_CSV}")
merged.head(10)

Считано 25 файлов, всего строк: 3151814
Итог: 3074622 уникальных адресов → ./aggregated/addresses_merged.csv


Unnamed: 0,address,is_scam,description
16817,0x0000000000000000000000000000000000000000,0,Null Address: 0x000...000; blocked; burn; genesis
16818,0x0000000000000000000000000000000000000001,0,Null Address: 0x000...001; blocked; burn
17290,0x0000000000000000000000000000000000000002,0,Null: 0x000...002; burn; genesis
17291,0x0000000000000000000000000000000000000003,0,Null: 0x000...003; burn; genesis
17292,0x0000000000000000000000000000000000000004,0,Null: 0x000...004; burn; genesis
17293,0x0000000000000000000000000000000000000005,0,Null: 0x000...005; burn; genesis
17294,0x0000000000000000000000000000000000000006,0,Null: 0x000...006; burn; genesis
17295,0x0000000000000000000000000000000000000007,0,Null: 0x000...007; burn; genesis
17296,0x0000000000000000000000000000000000000008,0,Null: 0x000...008; burn; genesis
17297,0x0000000000000000000000000000000000000009,0,Null: 0x000...009; burn; genesis


In [27]:
# Подсчёт количества скама и чистых адресов
counts = merged["is_scam"].value_counts().rename(index={0: "clean", 1: "scam"})

print("=== Статистика по адресам ===")
for label, cnt in counts.items():
    print(f"{label}: {cnt}")

# или в виде DataFrame для наглядности
counts_df = counts.reset_index()
counts_df.columns = ["type", "count"]
counts_df

=== Статистика по адресам ===
clean: 3019969
scam: 54653


Unnamed: 0,type,count
0,clean,3019969
1,scam,54653
