In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Global seaborn look: doubled font scale on a white grid background.
sns.set(font_scale = 2)
sns.set_style("whitegrid")
# Passed as `linew` / `msize` to TornadoQueries.plot_num_deposits below
# (presumably line width and marker size -- confirm against that method).
LW = 3
MS = 10

In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
from ethprivacy.entity_api import EntityAPI
from ethprivacy.topic_analysis import *
from ethprivacy.tornado_mixer import TornadoQueries

In [None]:
# Master switch: when True, the plotting cells also save their figure as a PDF in img_dir.
export_figs = True

# 1.) Initialize EntityAPI

In [None]:
# Input folder: handed to EntityAPI / TornadoQueries and used to locate labeledAddresses.json.
data_dir = "../data/"
# Output folder: exported figures are written to its "figs" sub-folder.
output_dir = "../results/"

In [None]:
# Folder that receives every exported figure.
# os.path.join avoids the doubled separator ("../results//figs") that plain
# string formatting produces because output_dir already ends with "/".
img_dir = os.path.join(output_dir, "figs")
if export_figs:
    # exist_ok=True replaces the check-then-create pattern and keeps the cell re-runnable.
    os.makedirs(img_dir, exist_ok=True)

In [None]:
# EntityAPI over the data in data_dir; no address_filter is passed here
# (the topic analysis further down builds a second instance with address_filter="ens").
# NOTE(review): only_pos_tx=False presumably keeps transactions regardless of value -- confirm in EntityAPI.
api_all = EntityAPI(data_dir, only_pos_tx=False)

# 2.) Data check

### Activity plots

In [None]:
SECONDS_PER_DAY = 86400

all_events = api_all.events
# Bucket every event into its day (timestamp floored to a multiple of one day).
# NOTE: as in the original cell, this adds the "day" column to api_all.events in place.
all_events["day"] = (all_events["timeStamp"] // SECONDS_PER_DAY) * SECONDS_PER_DAY

# Keep only the 365 days preceding the most recent day in the data.
# The explicit .copy() makes `df` an independent frame, so adding "is_token"
# below neither triggers SettingWithCopyWarning nor depends on view/copy semantics.
first_day_excluded = all_events["day"].max() - 365 * SECONDS_PER_DAY
df = all_events[all_events["day"] > first_day_excluded].copy()
df["is_token"] = df["tx_type"] == "token"

In [None]:
# Number of transactions (non-null hashes) per day, split by the is_token flag.
activity_df = (
    df.groupby(["day", "is_token"])["hash"]
    .count()
    .reset_index(name="count")
)

In [None]:
# Daily transaction count, one line per is_token value.
g = sns.lineplot(data=activity_df, x="day", y="count", hue="is_token")

### Gas price

In [None]:
# Mean gas price per day, as a flat two-column frame (day, gasPrice).
gas_price_df = df.groupby("day", as_index=False)["gasPrice"].mean()

In [None]:
# Daily mean gas price over time.
g = sns.lineplot(data=gas_price_df, x="day", y="gasPrice")

### Address statistics

In [None]:
# Distinct values of the contractAddress column across all events.
contract_addresses = api_all.events["contractAddress"].unique()
print("Unique contract addresses", len(contract_addresses))

In [None]:
# Build each address set once and reuse it for all three counts.
senders = set(api_all.events["from"])
receivers = set(api_all.events["to"])
print("Unique addresses", len(senders.union(receivers)))
print("Unique senders", len(senders))
print("Unique receivers", len(receivers))

In [None]:
# Mapping sender address -> number of events sent by it.
# NOTE(review): not referenced again in the visible part of this notebook.
sent_tx_cnt = dict(api_all.events["from"].value_counts())

### Addresses of interest

In [None]:
# Address collections per data source; the Tornado and Humanity-DAO
# collections are requested explicitly through the two flags.
(
    addresses,
    ens_addresses,
    tornado_addresses,
    humanity_dao_addresses,
) = addresses_of_interest(api_all, with_tornado=True, with_hd=True)

### Sent/received transaction counts for the 3 data sources

In [None]:
# Rebind df to the full event table (it previously held only the last 365 days).
df = api_all.events

In [None]:
# Per-address event counts, indexed by address: once as sender, once as receiver.
num_sent_txs = df["from"].value_counts()
num_received_txs = df["to"].value_counts()

In [None]:
# Average number of sent / received transactions per address, for each data source.
address_sources = [
    ("Twitter", ens_addresses),
    ("Tornado", tornado_addresses),
    ("Humanity-Dao", humanity_dao_addresses),
]
tx_counts_by_direction = [("sent", num_sent_txs), ("received", num_received_txs)]

records = []
for set_name, addr_set in address_sources:
    for direction, tx_counts in tx_counts_by_direction:
        # Only addresses that actually occur in this direction contribute to the mean.
        # The keys are turned into a list because pandas >= 2.0 raises a TypeError
        # when a set is passed as a .loc indexer.
        keys = list(set(addr_set).intersection(tx_counts.index))
        records.append([set_name, tx_counts.loc[keys].mean(), direction])
mean_txs_cnt = pd.DataFrame(records, columns=["Source","Average tx count","Direction"])

In [None]:
# Grouped bars: one group per data source, one bar per direction.
fig, ax = plt.subplots(figsize=(8,6))
g = sns.barplot(
    data=mean_txs_cnt,
    x="Source",
    y="Average tx count",
    hue="Direction",
    ax=ax,
)
if export_figs:
    plt.savefig("%s/avg_txs.pdf" % img_dir, format='pdf', bbox_inches='tight')

### Number of addresses per ENS

In [None]:
# cnt_map: number of rows per ENS name -> how many ENS names have that many rows.
cnt_map = dict(api_all.ens_pairs["name"].value_counts().value_counts())
# more_map: the tail of that distribution (more than 5 rows per name).
# Filtering instead of deleting keys 1..5 one by one means a bucket that does
# not occur in the data no longer raises KeyError.
more_map = {
    addr_cnt: name_cnt
    for addr_cnt, name_cnt in cnt_map.items()
    if addr_cnt > 5
}

In [None]:
# Histogram of ENS names by number of addresses: buckets 1..5 plus a combined "more" bucket.
cols = ['1','2','3','4','5','more']
# .get(k, 0): a bucket that does not occur in the data counts as 0 instead of raising KeyError.
vals = [cnt_map.get(k, 0) for k in range(1, 6)] + [sum(more_map.values())]
plt.figure(figsize=(8,6))
# NOTE(review): only the second bar is green -- presumably a deliberate highlight; confirm.
plt.bar(cols,vals,color=['b','g','b','b','b','b'])
plt.yscale('log')
plt.xlabel("Unique addresses per ENS name")
plt.ylabel("Number of ENS names")
# Tick positions are hard-coded for the current data range.
plt.yticks([10,10**2])
if export_figs:
    plt.savefig("%s/addr_cnt_for_ens.pdf" % img_dir, format='pdf', bbox_inches='tight')

# 3.) Tornado mixers

In [None]:
# Latest event timestamp in the data; passed to every TornadoQueries instance as max_time.
max_time = api_all.events["timeStamp"].max()
print(max_time)

In [None]:
# One TornadoQueries object per mixer denomination, created in the order 0.1, 1, 10, 100.
tq0_1, tq1, tq10, tq100 = [
    TornadoQueries(mixer_str_value=mixer_value, max_time=max_time, data_folder=data_dir)
    for mixer_value in ("0.1", "1", "10", "100")
]

In [None]:
import matplotlib.dates as mdates

In [None]:
# Deposit curves of all four mixers on one shared axis (heuristics hidden).
fig, ax = plt.subplots(figsize=(8,4))
plt.title("Number of total deposits")
for mixer in (tq0_1, tq1, tq10, tq100):
    mixer.plot_num_deposits(show_heuristics=False, linew=LW, msize=MS)
# Show dates as "day month" and move the legend outside the plotting area.
ax.xaxis.set_major_formatter(mdates.DateFormatter("%d %b"))
plt.legend(bbox_to_anchor=(1.02, 1.0))
if export_figs:
    plt.savefig("%s/tornado_active_deposits.pdf" % img_dir, format='pdf', bbox_inches='tight')

In [None]:
# Same four mixers, this time with show_heuristics=True, on a log-scaled y axis
# whose ticks are the four mixer denominations.
fig, ax = plt.subplots(figsize=(8,4))
plt.title("Heuristics")
for mixer in (tq0_1, tq1, tq10, tq100):
    mixer.plot_num_deposits(show_heuristics=True)
ax.xaxis.set_major_formatter(mdates.DateFormatter("%d %b"))
plt.yscale("log")
plt.ylabel("Mixer value (ETH)")
plt.yticks([0.1,1.0,10.0,100.0])
plt.legend(bbox_to_anchor=(1.02, 1.0))
if export_figs:
    plt.savefig("%s/tornado_heuristics.pdf" % img_dir, format='pdf', bbox_inches='tight')

# 4.) Topic Analysis

In [None]:
# Second EntityAPI instance, created with address_filter="ens"
# (presumably restricts the data to ENS-related addresses -- confirm in EntityAPI).
api = EntityAPI(data_dir, only_pos_tx=False, address_filter="ens")

## i.) Load topics

In [None]:
# Two lookups keyed by address: address -> topic and address -> service name.
# os.path.join avoids the doubled separator that "%s/..." produces because
# data_dir already ends with "/".
topic_for_addr, name_for_addr = load_address_topics(os.path.join(data_dir, "labeledAddresses.json"))

In [None]:
# Every labelled address (the keys of the topic lookup).
selected_addresses = list(topic_for_addr)

## ii.) Find these addresses in our data

In [None]:
# Query the API once per labelled address (with a progress bar), one record per address.
info_records = list(map(api.address_info, tqdm(selected_addresses)))
selected_addr_info = pd.DataFrame(info_records)

In [None]:
# Attach topic and service name to each row; an address missing from a lookup raises KeyError.
labelled_addr = selected_addr_info["address"]
selected_addr_info["topic"] = labelled_addr.map(topic_for_addr.__getitem__)
selected_addr_info["name"] = labelled_addr.map(name_for_addr.__getitem__)

In [None]:
# An address "has an event" if it is a contract or appears in any normal/token
# transaction, inbound or outbound.
# The row-wise `a or b or ...` returned the first truthy operand itself, which is
# only a real boolean when every column is boolean. Casting to bool and using
# any(axis=1) always yields a boolean column (it is used as a mask right after)
# and avoids a Python-level call per row.
activity_cols = ["is_contract", "normal_in", "normal_out", "token_in", "token_out"]
selected_addr_info["has_event"] = selected_addr_info[activity_cols].astype(bool).any(axis=1)

In [None]:
# Share of labelled addresses per has_event value.
selected_addr_info["has_event"].value_counts() / len(selected_addr_info)

### Keep only addresses with events

In [None]:
# Keep only the rows flagged by has_event.
# .copy() turns the filtered result into an independent frame: the "name" column
# is overwritten further down, which on a plain slice triggers SettingWithCopyWarning.
selected_addr_info = selected_addr_info[selected_addr_info["has_event"]].copy()
selected_addr_info.shape

### Topic distribution for interacted addresses

In [None]:
# Number of remaining labelled addresses per topic.
selected_addr_info["topic"].value_counts()

In [None]:
# Show the filtered table for manual inspection.
selected_addr_info

## iii.) Discover address connections

Find the set of ENS names that were in contact with these addresses of interest.

In [None]:
# Rename a few raw service labels to their display names before collecting connections.
name_aliases = {"gemini1":"Gemini",'EtherDelta 2':'EtherDelta','bittrex1':'Bittrex'}
selected_addr_info["name"] = selected_addr_info["name"].replace(name_aliases)

In [None]:
# Both results are indexed below as mapping[topic][service name] and merged as sets
# (presumably the ENS names seen on the inbound / outbound side of each service -- confirm).
inbound, outbound = get_in_out_ens_connections(api, selected_addr_info)

### Merge a few services

In [None]:
# List the distinct exchange labels to spot variants of the same service that should be merged.
sorted(selected_addr_info[selected_addr_info["topic"]=="Exchange"]["name"].unique())

In [None]:
# Merge plan: topic -> merged service name -> raw labels folded into that name.
exchange_merges = {
    "Poloniex": ['Poloniex1', 'Poloniex4'],
    "Binance": ['binance2', 'binance3', 'binance4', 'binance5', 'binance6'],
    "Kraken": ['kraken1', 'kraken2', 'kraken4'],
    "Okex": ['okex1', 'okex2'],
}
gaming_merges = {
    "CryptoKitties": ['CryptoKitties', 'CryptoKitties String Auction', 'Cryptokitties Sales'],
}
to_be_merged = {"Exchange": exchange_merges, "Gaming": gaming_merges}

In [None]:
# Fold the raw service labels listed in to_be_merged into one entry per service,
# separately for the inbound and the outbound mapping.
# NOTE: this cell removes the raw labels, so running it twice raises KeyError.
for topic, topic_merges in to_be_merged.items():
    for key, raw_labels in topic_merges.items():
        for ens_dict in [inbound, outbound]:
            # Start from an empty set for EACH direction. Previously the accumulator
            # was created once per service, so the outbound entry also received
            # every inbound name.
            merged = set()
            for val in raw_labels:
                # pop() reads and removes the raw entry; a missing label still raises KeyError.
                merged.update(ens_dict[topic].pop(val))
            ens_dict[topic][key] = merged

In [None]:
# Number of distinct ENS names in the ENS-filtered data; used as the coverage denominator.
uniq_ens_names = api.ens_pairs["name"].unique()
num_uniq_ens = len(uniq_ens_names)
num_uniq_ens

In [None]:
# Coverage tables built from the inbound/outbound mappings and the number of distinct ENS names:
# the default call is used per service name below, the result_type="topic" call per topic.
# Both results are used below as DataFrames with a "frac" column.
total_for_names = calculate_ens_coverage(inbound, outbound, num_uniq_ens)
total_for_topics = calculate_ens_coverage(inbound, outbound, num_uniq_ens, result_type="topic")

In [None]:
# Rank by covered fraction, largest first (rank 1 = highest "frac"; ties share an averaged rank).
total_for_topics["frac_order"] = total_for_topics["frac"].rank(ascending=False)
total_for_names["frac_order"] = total_for_names["frac"].rank(ascending=False)
# Combined "<name> - <topic>" label. Vectorised string concatenation replaces the
# row-wise apply(axis=1): same strings, no Python call per row, and it still
# yields a Series when the frame is empty.
total_for_names["name - topic"] = (
    total_for_names["name"].astype(str) + " - " + total_for_names["topic"].astype(str)
)

In [None]:
# Distinct ENS names vs. total number of rows in ens_pairs.
num_uniq_ens, len(api.ens_pairs)

## iv.) Visualize

### a.) Topic statistics

In [None]:
# Topics ordered by rank. After reset_index the best-ranked topic carries label 0,
# so .drop(0) leaves that first row out of the chart.
# NOTE(review): presumably the top row is an aggregate or uninformative category -- confirm.
df = total_for_topics.sort_values("frac_order").reset_index().drop(0)
x_upper = df["frac"].max() + 0.05  # small margin to the right of the longest bar

fig, ax = plt.subplots(1, 1, figsize=(8, 15))
g = sns.barplot(data=df, y="topic", x="frac", orient="h", ax=ax)
g.set_xlabel("Fraction of collected ENS names")
g.set_xlim(0, x_upper)
g.set_xticks(np.arange(0, x_upper, 0.1))
g.set_ylabel("Service category")
if export_figs:
    plt.savefig("%s/ens_topic_distrib.pdf" % img_dir, format='pdf', bbox_inches='tight')

In [None]:
# One horizontal bar chart per category with its (at most) five best-ranked services.
for category in ["StableCoins","Platform","Defi","Exchange","Gaming","Collectibles","Trading"]:
    df = total_for_names[total_for_names["topic"]==category].sort_values("frac_order").head(5)
    if df.empty:
        # Without rows, df["frac"].max() is NaN and the axis limits / tick range
        # below raise, aborting the remaining categories. Skip instead.
        print("No services found for category %s, skipping plot" % category)
        continue
    x_upper = df["frac"].max() + 0.05  # small margin to the right of the longest bar
    # Figure height grows with the number of bars (one inch per service).
    fig, ax = plt.subplots(1,1,figsize=(8,len(df)))
    plt.title(category)
    g = sns.barplot(data=df, y="name",x="frac", orient="h", ax=ax)
    g.set(xlabel="Fraction of collected ENS names")
    g.set(ylabel="Service name")
    g.set(xlim=(0,x_upper))
    g.set(xticks=np.arange(0,x_upper,0.1))
    if export_figs:
        plt.savefig("%s/ens_%s_distrib.pdf" % (img_dir, category.lower()), format='pdf', bbox_inches='tight')