# Data Extraction
We had to first collect a decent number of random wallets from Etherscan and then retrieve the necessary predictors.

## Attempt 1: Web Scraper

We first attempted to use a website scraper to retrieve data from Etherscan. We learnt how to write a website scraper and implemented it for our use. 

In [None]:
# In command line (example:)
# C:\Users\your name\AppData\Local\Programs\Python\Python39\Scripts>
# pip install requests
# pip install beautifulsoup4

import csv
import sys
import datetime
import requests
from time import sleep
from bs4 import BeautifulSoup

def scraper(num_pages=4, req_delay=0.1):
  timestamp = datetime.datetime.now().strftime ("%Y%m%d_%H%M%S")
  
  print("%d pages to parse with delay of %f seconds between each page" % (num_pages, req_delay))
  api_url = "https://etherscan.io/contractsVerified/"
  
  with open('ExchangeAccounts-'+timestamp+'.csv', 'w') as csvfile:
    fieldnames = ['addr', 'name_tag', 'balance', 'tx_count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for i in range(0, num_pages):
      url = api_url + str('?subcatid=undefined&size=100&start=') + str(i*100) + str('&col=1&order=asc')
      sleep(req_delay)
      response = requests.get(url)
      print("URL: %s, Status: %s" % (url, response.status_code))

      content = response.content
      soup = BeautifulSoup(content, 'html.parser')

      for row in soup.select('table.table-hover tbody tr'):
        cells = row.findAll('td')
        cells = map(lambda x: x.text, cells)
        addr, name_tag, balance, tx_count = cells
        writer.writerow({
          'addr': addr,
          'name_tag': name_tag,
          'balance': balance,
          'tx_count': tx_count,
        })

def main():
  if len(sys.argv) > 2:
    scraper(int(sys.argv[1]), float(sys.argv[2]))
  elif len(sys.argv) == 2:
    scraper(int(sys.argv[1]))
  else:
    scraper()

if __name__ == "__main__":
  main()

This scraper we wrote works for non-password protected websites. However, some websites we are getting our data from are password protected, and would require more work on our scraper, which we have just learnt and are not as familiar with. 

Thus, we changed our method of retrieving data to retrieving through APIs. 

## Attempt 2: APIs
We went to Etherscan, and collected a bunch of wallets that were tagged to CEXs in Etherscan, and other random user wallets. We then called some APIs, to retrieve predictors that we thought might be relevant, like Ether Balance, Transaction Count etc.

In [8]:
import numpy as np
import pandas as pd
import requests

In [1]:
API_KEY = "H7Y6I5GQ2RN5PJ8VSAYFNEX2HQDGMPWWIZ"

In [15]:
wallets = pd.read_csv('wallets.csv')
walletDF = pd.DataFrame(wallets)
walletDF

Unnamed: 0.1,Unnamed: 0,WID,Label,Eth Balance,Txn Count,Exchange,ERC 20 Txn,ERC 20 Txn In,ERC 20 Txn Out,ERC721 Txn Out,ERC721 Txn In,ERC721 Txn
0,0,0x4dc98c79a52968a6c20ce9a7a08d5e8d1c2d5605,,0.00477614 Ether,,Y,,,,,,
1,1,0xbb3fd383d1c5540e52ef0a7bcb9433375793aeaf,,5.25604866 Ether,,Y,,,,,,
2,2,0x1ccbdff6336b1027995a27a77b41fa87eb6608a3,,0.06097497 Ether,,Y,,,,,,
3,3,0x05f51aab068caa6ab7eeb672f88c180f67f17ec7,ABCC,0 Ether,,Y,,,,,,
4,4,0x2ddd202174a72514ed522e77972b461b03155525,Alcumex Exchange,0 Ether,,Y,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
839,839,0xfba17aca0797f72bd8152c5e519100c0c4caf0ad,,,,N,,,,,,
840,840,0xfbbe05954c6b138999548171c272a1b109d89471,,,,N,,,,,,
841,841,0xfefa8d5093ffd962a094fdba722a2c30d1c7a822,,,,N,,,,,,
842,842,0xff64a8933e05c9d585ab72db95d207ebee9be5a8,,,,N,,,,,,


The Ether balance for the non CEX wallets are all empty, so we have to call one of Etherscan's API to populate it.
We are using a timeout to implement a delay every 5 API calls, so we do not exceed our free limit.

In [18]:
import time
count = 0
for index, row in walletDF.iterrows():
    if pd.isna(row['Eth Balance']):
        count += 1
        baseURL = "https://api.etherscan.io/api?module=account&action=balance&address=" + row["WID"] + "&tag=latest&apikey=" + API_KEY
        response = requests.get(baseURL)
        balance = int (response.json()['result']) / 10**18
        walletDF.loc[index, 'Eth Balance'] = balance

        if count == 5:
            time.sleep(2)
            count = 0

walletDF

Unnamed: 0.1,Unnamed: 0,WID,Label,Eth Balance,Txn Count,Exchange,ERC 20 Txn,ERC 20 Txn In,ERC 20 Txn Out,ERC721 Txn Out,ERC721 Txn In,ERC721 Txn
0,0,0x4dc98c79a52968a6c20ce9a7a08d5e8d1c2d5605,,0.00477614 Ether,522.0,Y,,,,,,
1,1,0xbb3fd383d1c5540e52ef0a7bcb9433375793aeaf,,5.25604866 Ether,10000.0,Y,,,,,,
2,2,0x1ccbdff6336b1027995a27a77b41fa87eb6608a3,,0.06097497 Ether,7.0,Y,,,,,,
3,3,0x05f51aab068caa6ab7eeb672f88c180f67f17ec7,ABCC,0 Ether,10000.0,Y,,,,,,
4,4,0x2ddd202174a72514ed522e77972b461b03155525,Alcumex Exchange,0 Ether,709.0,Y,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
839,839,0xfba17aca0797f72bd8152c5e519100c0c4caf0ad,,0.031192,967.0,N,,,,,,
840,840,0xfbbe05954c6b138999548171c272a1b109d89471,,0.001795,255.0,N,,,,,,
841,841,0xfefa8d5093ffd962a094fdba722a2c30d1c7a822,,0.035719,32.0,N,,,,,,
842,842,0xff64a8933e05c9d585ab72db95d207ebee9be5a8,,30106.172274,22.0,N,,,,,,


Now, we need to get the Transaction Count for all our wallets.

In [None]:
newData = pd.read_csv('test.csv')
newDF = pd.DataFrame(newData)
count = 0
for index, row in newDF.iterrows():
    if pd.isna(row['Txn Count']):
        count += 1
        baseURL = "https://api.etherscan.io/api?module=account&action=txlist&address=" + str(row["WID"]) + "&startblock=0&endblock=99999999&sort=asc&apikey=" + API_KEY
        response = requests.get(baseURL)
        try:
            txnCount = len(response.json()['result'])
            row['Txn Count'] = txnCount
        
        except:
            print(row['WID'])
        if count == 5:
            time.sleep(2)
            count = 0

newDF

In [None]:
wallet = '0xE107a20CE3836329a3C310E9A2DA812CFB177F72'
baseURL = "https://api.etherscan.io/api?module=account&action=txlist&address=" + wallet + "&startblock=0&endblock=99999999&sort=asc&apikey=" + API_KEY
response = requests.get(baseURL)
txnCount = response.json()['result']
print(len(txnCount))

In [21]:
for index, row in walletDF.iterrows():
    if pd.isna(row['Txn Count']):
        
        wallet = row['WID']
        baseURL = "https://api.etherscan.io/api?module=account&action=txlist&address=" + wallet + "&startblock=0&endblock=99999999&sort=asc&apikey=" + API_KEY
        response = requests.get(baseURL)
        txnCount = response.json()['result']
        try:
            print(len(txnCount))
            walletDF.loc[index, 'Txn Count'] = len(txnCount)
        
        except:
            print("error with wallet:", wallet)

walletDF


10000
5928
4852
1562
1376
10000
7736
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
10000
32
435
10000
817
99
135
162
5
10000
10000
8352
9156
30
28
10000
10000
395
1088
7984
3832
10000
34
10000
10000
1344
10000
2053
10000
3843
284
10000
26
1
1
1
10000
10000
10
23
11
4
8
22
530
4961
110
4716
7631
10000
11
9
50
10000
10000
22
1275
881
1700
1315
1493
10000
10000
10000
10000
33
8010
10000
10000
10000
40
22
29
17
14
682
10000
10000
182
10000
10000
10000
10000
10000
10000
493
426
10000
6840
112
10000
10000
1552
10000
10000
7965
10000
10000
10000
10000
10000
10000
10000
10000
10000
257
10000
10000
10000
10000
3856
10000
10000
62
10000
10000
108
10000
10000
10000
10000
87
10000
10000
10000
10000
10000
10000
1118
9812
10000
475
10000
10000
10000
10000
10000
10000
10000
10000
2202
10000
10000
10000
10000
10000
10000
213
95
10000
10000
336
10000
13
175
97
983
10000
10000
10000
10000
10000
10000
9336
10000
10000
10000
10000
567
100

Unnamed: 0.1,Unnamed: 0,WID,Label,Eth Balance,Txn Count,Exchange,ERC 20 Txn,ERC 20 Txn In,ERC 20 Txn Out,ERC721 Txn Out,ERC721 Txn In,ERC721 Txn
0,0,0x4dc98c79a52968a6c20ce9a7a08d5e8d1c2d5605,,0.00477614 Ether,522.0,Y,,,,,,
1,1,0xbb3fd383d1c5540e52ef0a7bcb9433375793aeaf,,5.25604866 Ether,10000.0,Y,,,,,,
2,2,0x1ccbdff6336b1027995a27a77b41fa87eb6608a3,,0.06097497 Ether,7.0,Y,,,,,,
3,3,0x05f51aab068caa6ab7eeb672f88c180f67f17ec7,ABCC,0 Ether,10000.0,Y,,,,,,
4,4,0x2ddd202174a72514ed522e77972b461b03155525,Alcumex Exchange,0 Ether,709.0,Y,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
839,839,0xfba17aca0797f72bd8152c5e519100c0c4caf0ad,,0.031192,967.0,N,,,,,,
840,840,0xfbbe05954c6b138999548171c272a1b109d89471,,0.001795,255.0,N,,,,,,
841,841,0xfefa8d5093ffd962a094fdba722a2c30d1c7a822,,0.035719,32.0,N,,,,,,
842,842,0xff64a8933e05c9d585ab72db95d207ebee9be5a8,,30106.172274,22.0,N,,,,,,


Getting the ERC20 Transaction data.

In [22]:
for index, row in walletDF.iterrows():
    if pd.isna(row['ERC 20 Txn']):
        txnIn = 0
        txnOut = 0
        wallet = row['WID']
        baseURL = "https://api.etherscan.io/api?module=account&action=tokentx&address=" + wallet + "&startblock=0&endblock=99999999&sort=asc&apikey=" + API_KEY
        response = requests.get(baseURL)
        try:
            for txn in response.json()['result']:
                if txn['to'] == wallet:
                    txnIn += 1
                if txn['from'] == wallet:
                    txnOut += 1
            
            walletDF.loc[index, 'ERC 20 Txn In'] = txnIn
            walletDF.loc[index, 'ERC 20 Txn Out'] = txnOut
            walletDF.loc[index, 'ERC 20 Txn'] = txnIn + txnOut
        except:
            print("ERROR", wallet)
walletDF

ERROR 0x0557f90d7ab4bd350fb48eb07ffda16e7eb8bbab
ERROR 0x138bbd5bb806e6948a5de58a65fc6eeb17f94e39
ERROR 0x3d82740f83d909ae99c7e0a4ca47670eb35b8b93
ERROR 0x4267353801347d8e4e2c0b42b763dd279a919bb6
ERROR 0x44b3cde3bf40e8655a85dcc8748412f34c7ddf05
ERROR 0x4c337a27538acee6ef2ee3b78176bb8f3d77ffdd
ERROR 0x526b507569ca21fd7cfa440aa68fee5c7e4cf16e
ERROR 0x6ac41850480d96b9986f782b0f621a520bdc43dd
ERROR 0x7e70eb323cb0e3fcb91c4cc71abe45dbc980865c
ERROR 0x8fbe49da7aeadf2bf92e772f52b8449ff1b286eb
ERROR 0xa286e0c0fd25bbeadc028f1c073769bdd665a08a
ERROR 0xa7e15ef7c01b58ebe5ef74aa73625ae4b11fe754
ERROR 0xb37909514ac6fb59d5c67cae946ff0d7f6a44835
ERROR 0xd7eeec44ec2bc82985c4e6cb86838c6e4453ccb4
ERROR 0xf16b1cae2b26234e943848f98c2e02a2d479dcea


Unnamed: 0.1,Unnamed: 0,WID,Label,Eth Balance,Txn Count,Exchange,ERC 20 Txn,ERC 20 Txn In,ERC 20 Txn Out,ERC721 Txn Out,ERC721 Txn In,ERC721 Txn
0,0,0x4dc98c79a52968a6c20ce9a7a08d5e8d1c2d5605,,0.00477614 Ether,522.0,Y,636.0,242.0,394.0,,,
1,1,0xbb3fd383d1c5540e52ef0a7bcb9433375793aeaf,,5.25604866 Ether,10000.0,Y,10000.0,7902.0,2098.0,,,
2,2,0x1ccbdff6336b1027995a27a77b41fa87eb6608a3,,0.06097497 Ether,7.0,Y,3.0,1.0,2.0,,,
3,3,0x05f51aab068caa6ab7eeb672f88c180f67f17ec7,ABCC,0 Ether,10000.0,Y,10001.0,4287.0,5714.0,,,
4,4,0x2ddd202174a72514ed522e77972b461b03155525,Alcumex Exchange,0 Ether,709.0,Y,626.0,35.0,591.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
839,839,0xfba17aca0797f72bd8152c5e519100c0c4caf0ad,,0.031192,967.0,N,446.0,227.0,219.0,,,
840,840,0xfbbe05954c6b138999548171c272a1b109d89471,,0.001795,255.0,N,24.0,15.0,9.0,,,
841,841,0xfefa8d5093ffd962a094fdba722a2c30d1c7a822,,0.035719,32.0,N,0.0,0.0,0.0,,,
842,842,0xff64a8933e05c9d585ab72db95d207ebee9be5a8,,30106.172274,22.0,N,104.0,104.0,0.0,,,


Getting the ERC721 Transaction data.

In [24]:
for index, row in walletDF.iterrows():
    
    if pd.isna(row['ERC721 Txn']):
        txnout = 0
        txnin = 0
        wallet = row['WID']
        baseURL = "https://api.etherscan.io/api?module=account&action=tokennfttx&address="+ wallet + "&startblock=0&endblock=99999999&sort=asc&apikey=" + API_KEY
        response = requests.get(baseURL)
        try: 
            for txn in response.json()['result']:
                if txn['from'] == wallet:
                    txnout += 1
                if txn['to'] == wallet:
                    txnin += 1
            walletDF.loc[index, 'ERC721Txn Out'] = txnout
            walletDF.loc[index, 'ERC721Txn In'] = txnin
            walletDF.loc[index, 'ERC721Txn'] = txnout + txnin
        
        except:
            print("Error", row['WID'])
walletDF        

Error 0xaecbe94703df39b49ac440feb177c7f1f782c064
Error 0xf0c80fb9fb22bef8269cb6feb9a51130288a671f
Error 0x4df5f3610e2471095a130d7d934d551f3dde01ed
Error 0x92dbd8e0a46edd62aa42d1f7902d0e496bddc15a
Error 0x0bb5de248dbbd31ee6c402c3c4a70293024acf74
Error 0xed5cdb0d02152046e6f234ad578613831b9184d4
Error 0x85b931a32a0725be14285b66f1a22178c672d69b
Error 0x708396f17127c42383e3b9014072679b2f60b82f
Error 0x56eddb7aa87536c09ccc2793473599fd21a8b17f
Error 0x9696f59e4d72e237be84ffd425dcad154bf96976
Error 0x345d8e3a1f62ee6b1d483890976fd66168e390f2
Error 0xc3c8e0a39769e2308869f7461364ca48155d1d9e
Error 0x892e9e24aea3f27f4c6e9360e312cce93cc98ebe
Error 0xfe9e8709d3215310075d67e3ed32a380ccf451c8
Error 0x4e9ce36e442e55ecd9025b9a6e0d88485d628a67
Error 0x7a91a362d4f2c9c4627688d5b7090bbb12e5715f
Error 0x28ebe764b8f9a853509840645216d3c2c0fd774b
Error 0x0b73f67a49273fc4b9a65dbd25d7d0918e734e63
Error 0x482f02e8bc15b5eabc52c6497b425b3ca3c821e8
Error 0x30a2ebf10f34c6c4874b0bdd5740690fd2f3b70c
Error 0x3f7e77b62767

Unnamed: 0.1,Unnamed: 0,WID,Label,Eth Balance,Txn Count,Exchange,ERC 20 Txn,ERC 20 Txn In,ERC 20 Txn Out,ERC721 Txn Out,ERC721 Txn In,ERC721 Txn,ERC721Txn Out,ERC721Txn In,ERC721Txn
0,0,0x4dc98c79a52968a6c20ce9a7a08d5e8d1c2d5605,,0.00477614 Ether,522.0,Y,636.0,242.0,394.0,,,,0.0,0.0,0.0
1,1,0xbb3fd383d1c5540e52ef0a7bcb9433375793aeaf,,5.25604866 Ether,10000.0,Y,10000.0,7902.0,2098.0,,,,0.0,0.0,0.0
2,2,0x1ccbdff6336b1027995a27a77b41fa87eb6608a3,,0.06097497 Ether,7.0,Y,3.0,1.0,2.0,,,,0.0,0.0,0.0
3,3,0x05f51aab068caa6ab7eeb672f88c180f67f17ec7,ABCC,0 Ether,10000.0,Y,10001.0,4287.0,5714.0,,,,0.0,0.0,0.0
4,4,0x2ddd202174a72514ed522e77972b461b03155525,Alcumex Exchange,0 Ether,709.0,Y,626.0,35.0,591.0,,,,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839,839,0xfba17aca0797f72bd8152c5e519100c0c4caf0ad,,0.031192,967.0,N,446.0,227.0,219.0,,,,,,
840,840,0xfbbe05954c6b138999548171c272a1b109d89471,,0.001795,255.0,N,24.0,15.0,9.0,,,,,,
841,841,0xfefa8d5093ffd962a094fdba722a2c30d1c7a822,,0.035719,32.0,N,0.0,0.0,0.0,,,,0.0,4.0,4.0
842,842,0xff64a8933e05c9d585ab72db95d207ebee9be5a8,,30106.172274,22.0,N,104.0,104.0,0.0,,,,0.0,0.0,0.0


In [None]:
walletDF.to_csv('data.csv')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=696215f1-387b-461d-aa4c-0ccf546cb4fd' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>