# Purpose of this notebook
We have `yaml` files retrieved from different sources, and these files contain different names for the same entity. Categories need to be checked as well.

This notebook creates a mapping from labels to unique labels (`label_uniqueLabel.json`) and a mapping from unique labels to the right categories (`uniqueLabel_category.json`).

Every time a yaml file is created or added to the list, its labels must be mapped to unique labels and its categories to the right categories using these two `json` files.

`config.yaml` is also updated with the new list of categories.

# Step \#1. Create a file mapping entity label to its category (skip blockchain.info)

In [5]:
import yaml
import json
from os import listdir

In [63]:
# Build the initial (dirty) label -> category mapping from every tag pack.
# A tag may define its own label/category or rely on the values of the pack
# header, so tag-level values are used when present and header values otherwise.
label_category = dict() # initial (dirty) label-category mapping
packs_folder = '../packs/'
for file in sorted(listdir(packs_folder)): # listdir order is arbitrary; sort so the result is reproducible
    # blockchaininfo.yaml has no categories; entries that are not YAML files are not packs
    if file == 'blockchaininfo.yaml' or not file.endswith(('.yaml', '.yml')):
        continue
    print(file)
    with open(packs_folder + file) as fp:
        data = yaml.safe_load(fp)
    for el in data['tags']:
        tag_label = el['label'] if 'label' in el else data.get('label')
        tag_category = el['category'] if 'category' in el else data.get('category')
        if tag_label is None or tag_category is None:
            # neither the tag nor the header provides the value: nothing to record
            continue
        label_key = tag_label.lower()
        if label_key in label_category and label_category[label_key] != tag_category:
            # the later occurrence still wins (as before), but no longer silently
            print('  conflicting categories for %r: %r -> %r'
                  % (label_key, label_category[label_key], tag_category))
        label_category[label_key] = tag_category


walletexplorer.yaml
richest_addresses.yaml
ransomware.yaml
binance.yaml
binance_hack.yaml
sextortion_talos.yaml
miners.yaml
demo.yaml
ponzi_scheme.yaml


## Step #1a. Check uniqueness of labels


In [64]:
# every dirty label, lower-cased, in alphabetical order
sorted_labels = sorted(key.lower() for key in label_category)

In [67]:
# Roots used to merge labels that belong to the same entity: a label starting
# with one of these roots is mapped to that root. The roots were selected by hand
# from the sorted dirty labels. Order matters: the first matching root wins.
labels_roots = [
    'cryptosplit',
    'coindouble',
    '50btc',
    'alphabaymarket',
    'antpool',
    'banx.io',
    'betcoin.ag',
    'bitaces',
    'bitcoin-24.com',
    'bitcoin.de',
    'bitcoinica.com',
    'bitcoinvideocasino.com',
    'bitfinex',
    'bitfury',
    'bitminter',
    'bitpay.com',
    'bitstamp',
    'bittrex',
    'btc-e.com',
    'btcc.com',
    'btcjam.com',
    'bter.com',
    'btradeaustralia.com',
    'c-cex.com',
    'campbx.com',
    'chainroll.com',
    'cloudhashing',
    'coinapult.com',
    'coinroyale.com',
    'cryptonator',
    'cryptonit.net',
    'cryptopay.me',
    'cryptsy.com',
    'deepbit',
    'dgex.com',
    'dmalocker',
    'eclipsemc',
    'eligius',
    'foxbit.com.br',
    'gatecoin.com',
    'helixmixer',
    'hitbtc.com',
    'huobi.com',
    'just-dice.com',
    'kano',
    'kncminer',
    'kraken',
    'localbitcoins.com',
    'luckyb.it',
    'okcoin.com',
    'polmine',
    'poloniex',
    'primedice.com',
    'satoshi-karoshi.com',
    'satoshidice.com',
    'securevpn.to',
    'simplecoin.cz',
    'slushpool',
    'simplecoin',
    'therocktrading.com',
    'xapo',
    'zyado.com',
]

In [69]:
# Disabled cell: prints every label in sorted_labels that does not start with any
# entry of labels_roots, i.e. the labels for which no root is known.
# for l in sorted_labels:
#     root_found = False
#     for root in labels_roots:
#         if l.startswith(root):
#             root_found = True
#             break
#     if not root_found:
#         print(l)

## Step #1b. Create unique labels

In [70]:
# Map every (dirty) label to the unique label of its entity.
# The hand-picked exceptions below were found manually; they are set first and
# are never overridden by the prefix rule.
label_uniqueLabel = {
    'btccpool': 'btcc pool',
    'btc guild': 'btcguild.com',
    'globev2': 'globe',
    'globev3': 'globe',
    'telco214': 'telco 214',
}
for label in sorted_labels: # all labels available, similar ones included
    if label in label_uniqueLabel:
        continue # already mapped by hand
    # the first root that prefixes the label wins; a label without a root maps to itself
    label_uniqueLabel[label] = next(
        (root for root in labels_roots if label.startswith(root)),
        label,
    )

In [71]:
# unique label -> category; starts empty and is filled by the following cells
uniqueLabel_category = {}

In [72]:
# Different spellings of one entity may carry different categories.
# Gather, per unique label, the set of categories seen across all its spellings;
# the size of each set is checked in the next step.
for label, category in label_category.items(): # all labels, similar ones included
    unique_label = label_uniqueLabel[label]
    uniqueLabel_category.setdefault(unique_label, set()).add(category)

# Step \#2. Manual entity check and cleaning

In [73]:
# Resolve the category of every unique label:
# - exactly one candidate category: use it
# - several candidates: print them and keep the set for a manual fix
# Values that are already resolved (no longer a set, e.g. when this cell is run
# a second time) are skipped; without this guard len() would measure the
# category string and report every resolved entry as a conflict.
for unique_label, categories in uniqueLabel_category.items():
    if not isinstance(categories, set):
        continue
    if len(categories) > 1:
        print(categories, unique_label)
    else:
        # replacing the value of an existing key is safe while iterating
        uniqueLabel_category[unique_label] = next(iter(categories))

In [43]:
# Disabled cell: manual category overrides for a few unique labels, kept for reference.
# NOTE(review): while this stays commented out, a unique label reported with
# several categories by the previous cell keeps a set as its value unless a
# later cell assigns it a single category; confirm none remain before the
# JSON export.
# # Manual labeling
# uniqueLabel_category['cloudhashing'] = 'Miner'
# uniqueLabel_category['btcguild.com'] = 'Miner'
# uniqueLabel_category['polmine'] = 'Miner'
# uniqueLabel_category['50btc'] = 'Miner'
# uniqueLabel_category['deepbit'] = 'Miner'
# uniqueLabel_category['xapo'] = 'Exchange'

In [74]:
# Disabled cell: lists the unique labels whose category is 'Old/historic', so that
# each one can be researched by hand and given a concrete category.
# # some entities fall into the "old" or "other" categories, but we want to know what they were doing 
# # so google each of them and understand what they actually are/were doing
# for ul in uniqueLabel_category:
#     if uniqueLabel_category[ul] == 'Old/historic':
#         print(ul)

In [76]:
# Manual work: every entity below was checked by hand and given its category.
# The entries are applied in one update, in their original order, and overwrite
# whatever category the entity had before.
# ('betcoins.net' appeared twice with the same value; it is listed once here.)
# NOTE(review): 'germanplazamarket' and 'greenroadmarket' are the only '*market'
# entries labelled 'Walletprovider' instead of 'Marketplace' - confirm intended.
uniqueLabel_category.update({
    'bitlaunder.com': 'Mixingservice',
    'bitnz.com': 'Exchange',
    'coinurl.com': 'Scam',
    'btcpop.co': 'Exchange',
    'gocelery.com': 'Exchange',
    'strongcoin.com': 'Walletprovider',
    'coinapult.com': 'Walletprovider',
    'doctordmarket': 'Marketplace',
    'paymium.com': 'Exchange',
    'germanplazamarket': 'Walletprovider',
    'greenroadmarket': 'Walletprovider',
    'epay.info': 'Exchange',
    'bitcoinwallet.com': 'Walletprovider',
    'oklink.com': 'Exchange',
    'helixmixer': 'Mixingservice',
    'holytransaction.com': 'Walletprovider',
    'coinkite.com': 'Walletprovider',
    'alphabaymarket': 'Marketplace',
    'nucleusmarket': 'Marketplace',
    'bitcoinfog': 'Mixingservice',
    'coinjar.com': 'Walletprovider',
    'cryptopay.me': 'Walletprovider',
    'coinpayments.net': 'Exchange',
    'cubits.com': 'Exchange',
    'cryptonator': 'Walletprovider',
    'blueskymarketplace': 'Marketplace',
    'blackbankmarket': 'Marketplace',
    'sheepmarketplace': 'Marketplace',
    'middleearthmarketplace': 'Marketplace',
    'pandoraopenmarket': 'Marketplace',
    'abraxasmarket': 'Marketplace',
    'evolutionmarket': 'Marketplace',
    'silkroad2market': 'Marketplace',
    'silkroadmarketplace': 'Marketplace',
    'agoramarket': 'Marketplace',
    'inputs.io': 'Walletprovider',
    'instawallet.org': 'Walletprovider',
    'betcoins.net': 'Gambling',
    'bitaces': 'Gambling',
    'pinballcoin.com': 'Gambling',
    'diceoncrack.com': 'Gambling',
    'btcdice.com': 'Gambling',
    'sealswithclubs.eu': 'Gambling',
    'updown.bt': 'Gambling',
    'betcoindice.tm': 'Gambling',
    'bitcoin-24.com': 'Exchange',
    'allcoin.com': 'Exchange',
    'justcoin.com': 'Exchange',
    'coin-swap.net': 'Exchange',
    'mcxnow.com': 'Scam',
    'mintpal.com': 'Scam',
    'bitmillions.com': 'Gambling',
    'mybitcoin.com': 'Walletprovider',
    'bitmit.net': 'Service/Other',
    'cannabisroadmarket': 'Marketplace',
    'bitelfin.com': 'Gambling',
    'bitcoin-roulette.com': 'Gambling',
    'actioncrypto.com': 'Gambling',
    'allcrypt.com': 'Exchange',
    'bityes.com': 'Exchange',
    'vaultofsatoshi.com': 'Exchange',
    'crypto-trade.com': 'Exchange',
    'coin.mx': 'Exchange',
    'cryptorush.in': 'Exchange',
    'dagensia.eu': 'Exchange',
    'leancy.com': 'Scam',
    'comkort.com': 'Exchange',
    'dispenser.tf': 'Service/Other',
    'playt.in': 'Service/Other',
    'bitcoinica.com': 'Service/Other',
    'coinvault': 'Walletprovider',
    'masterxchange.com': 'Exchange',
    'bitoomba.com': 'Scam',
    'dicebitco.in': 'Gambling',
    'ice-dice.com': 'Gambling',
    'pocketrocketscasino.eu': 'Gambling',
    'betsofbitco.in': 'Gambling',
    'chainroll.com': 'Gambling',
    'btcst.com-pirateat40': 'Scam',
    'coinmkt.com': 'Exchange',
    'minerscenter.com': 'Service/Other',
    'coinhub.cz': 'Exchange',
    'btceur.eu': 'Service/Other',
    'smenarnabitcoin.cz': 'Exchange',
    'brawker.com': 'Exchange',
    'suzukidice.com': 'Gambling',
    'everydice.com': 'Gambling',
    'cryptobounty.com': 'Service/Other',
    'europex.eu': 'Service/Other',
    'birwo.com-old': 'Gambling',
    '10xbitco.in': 'Scam',
    'admiralcoin.com': 'Gambling',
    'just-dice.com': 'Gambling',
    'dadice.com': 'Gambling',
    'mpex.co': 'Exchange',
    'btcexchange.ro': 'Exchange',
    'dgex.com': 'Exchange',
    'btct.com': 'Exchange',
    'babylonmarket': 'Marketplace',
    'cryptomine.io': 'Service/Other',
    'cryptcominer.com': 'Service/Other',
    'bitcoinwebank.com': 'Service/Other',
    'coin-sweeper.com': 'Service/Other',
    'bitcash.cz': 'Service/Other',
    'ponzicoin.co': 'Ponzi Scheme',
})


In [77]:
# Replace legacy category spellings with their current names.
# Values are compared with ==, so a value that is not one of these strings is
# left untouched.
category_renames = (
    ('Services/others', 'Service/Other'),
    ('Walletprovider', 'Wallet Service'),
    ('Mixingservice', 'Mixing Service'),
)
for ul, current in uniqueLabel_category.items():
    for old_name, new_name in category_renames:
        if current == old_name:
            uniqueLabel_category[ul] = new_name
            break


In [78]:
# show the distinct categories currently assigned to the unique labels
{*uniqueLabel_category.values()}

{'Exchange',
 'Gambling',
 'Hack',
 'Marketplace',
 'Miner',
 'Mixing Service',
 'Organization',
 'Ponzi Scheme',
 'Ransomware',
 'Scam',
 'Service/Other',
 'Sextortion',
 'Wallet Service'}

# Step \#3. Write files

In [79]:
# export the label -> unique label mapping
with open('label_uniqueLabel.json', 'w') as out_file:
    out_file.write(json.dumps(label_uniqueLabel))

In [80]:
# export the unique label -> category mapping
with open('uniqueLabel_category.json', 'w') as out_file:
    out_file.write(json.dumps(uniqueLabel_category))

In [81]:
# Rewrite every pack in place:
# - each tag label is replaced by its unique label
# - each tag category is replaced by the cleaned category of that unique label

for file in sorted(listdir(packs_folder)): # listdir order is arbitrary; sort for a reproducible run
    # blockchaininfo.yaml is skipped as in step 1; entries that are not YAML files are not packs
    if file == 'blockchaininfo.yaml' or not file.endswith(('.yaml', '.yml')):
        continue
    print(file)
    with open(packs_folder + file) as fp:
        data = yaml.safe_load(fp)
    for el in data['tags']:
        # clean label
        if 'label' in el:
            unique_label = label_uniqueLabel[el['label'].lower()]
            el['label'] = unique_label
        else:
            # The tag relies on the pack-level label. It must go through the same
            # label -> unique label mapping, because uniqueLabel_category is keyed
            # by unique labels; looking up the raw header label fails (or hits
            # the wrong key) whenever that label was merged into a root.
            header_label = data['label'].lower()
            unique_label = label_uniqueLabel.get(header_label, header_label)
        # clean category
        if 'category' in el:
            el['category'] = uniqueLabel_category[unique_label]
    # NOTE(review): the pack-level 'label' and 'category' values are written back
    # unchanged; tags that rely on them keep the uncleaned values - confirm
    # whether the header should be cleaned as well.
    with open(packs_folder + file, 'w') as fp:
        yaml.dump(data, fp, default_flow_style=False)

walletexplorer.yaml
richest_addresses.yaml
ransomware.yaml
binance.yaml
binance_hack.yaml
sextortion_talos.yaml
miners.yaml
demo.yaml
ponzi_scheme.yaml


In [82]:
# update config.yaml: start by loading its current content
config_path = '../config.yaml'
with open(config_path) as config_file:
    config = yaml.safe_load(config_file)

In [83]:
# Store the distinct categories in sorted order. Iterating a set of strings gives
# an order that can change from one interpreter run to the next, which would
# reshuffle the list in config.yaml on every run without any real change.
config['categories'] = sorted(set(uniqueLabel_category.values()))

In [84]:
# write the updated configuration back to the same file (block style YAML)
with open(config_path, 'w') as config_file:
    yaml.dump(config, config_file, default_flow_style=False)