In [None]:
import datetime
import os

import pandas as pd
import requests

from TDD import TDD

# Etherscan API key. Taken from the ETHERSCAN_API_KEY environment variable so
# a real key never has to be committed with the notebook; the original
# placeholder string is kept as the fallback when the variable is not set.
api_key = os.environ.get('ETHERSCAN_API_KEY', 'API-KEY')

# Endpoint every Etherscan query in this notebook is sent to.
url = "https://api.etherscan.io/api"

### **Functions**

In [1]:
def first_last_info(address):
    """Describe the first and last transaction on the first ``txlist`` page.

    Sends one Etherscan ``account``/``txlist`` request (page 1, offset 100,
    ascending sort) and reads the first and last entries of ``result``.

    NOTE(review): because only page 1 with offset 100 is requested, the "last"
    entry is the last row of that page — presumably not the wallet's most
    recent transaction when it has more rows than one page; confirm against
    the Etherscan API documentation.

    Args:
        address: wallet address, in any letter case.

    Returns:
        ``[address, first_date, last_date, first_from, first_to, last_from,
        last_to]``. Dates are the raw ``timeStamp`` values. A counterparty
        equal to ``address`` (compared case-insensitively) is replaced by the
        string ``'self'``. When the result list is empty, every field after
        ``address`` is ``None``.

    Raises:
        RuntimeError: if ``result`` is not a list (the API put a message there
            instead of transactions).
    """
    params = {
        "module": "account",
        "action": "txlist",
        "address": address,
        "page": 1,
        "offset": 100,
        "startblock": 0,
        "endblock": 99999999,
        "sort": "asc",
        "apikey": api_key
    }

    result = requests.get(url, params=params).json()['result']

    # A non-list result cannot be indexed as transactions; fail with the
    # payload in the message rather than an opaque indexing error.
    if not isinstance(result, list):
        raise RuntimeError(f"Etherscan txlist query failed for {address}: {result!r}")

    # Wallet without any listed transaction: nothing to summarise.
    if not result:
        return [address, None, None, None, None, None, None]

    first_txn, last_txn = result[0], result[-1]

    # The outputs recorded in this notebook show API addresses in lowercase
    # while the input wallets are mixed-case, so compare case-insensitively.
    wallet = str(address).lower()

    def _counterparty(value):
        """Return 'self' when ``value`` is the queried wallet, else ``value``."""
        return 'self' if str(value or '').lower() == wallet else value

    return [
        address,
        first_txn['timeStamp'],
        last_txn['timeStamp'],
        _counterparty(first_txn['from']),
        _counterparty(first_txn['to']),
        _counterparty(last_txn['from']),
        _counterparty(last_txn['to']),
    ]

In [None]:
def get_transaction_history(address):
    """Return the ``result`` field of an Etherscan ``txlist`` query.

    One GET request is sent to ``url`` for page 1 (offset 100, ascending
    sort) of the ``account``/``txlist`` action for ``address``; the parsed
    JSON's ``'result'`` value is returned unchanged.
    """
    query = dict(
        module="account",
        action="txlist",
        address=address,
        page=1,
        offset=100,
        startblock=0,
        endblock=99999999,
        sort="asc",
        apikey=api_key,
    )
    payload = requests.get(url, params=query).json()
    return payload['result']

def get_Erc20_transaction_history(address):
    """Return the ``result`` field of an Etherscan ``tokentx`` query.

    Identical to the ``txlist`` request used elsewhere in this notebook
    except for the ``action`` value: page 1, offset 100, ascending sort.
    The parsed JSON's ``'result'`` value is returned unchanged.
    """
    query = dict(
        module="account",
        action="tokentx",
        address=address,
        page=1,
        offset=100,
        startblock=0,
        endblock=99999999,
        sort="asc",
        apikey=api_key,
    )
    payload = requests.get(url, params=query).json()
    return payload['result']

def get_wallet_age(history: list[dict]):
    """Return the number of days between the first history entry and today.

    The ``timeStamp`` of ``history[0]`` is parsed as an integer epoch value
    and converted to a date with ``datetime.datetime.fromtimestamp``; the
    result is today's date minus that date, in whole days. An empty history
    yields ``0``.
    """
    if len(history) == 0:
        return 0

    born_on = datetime.datetime.fromtimestamp(int(history[0]['timeStamp'])).date()
    return (datetime.date.today() - born_on).days

def to_and_from(history:list[dict],address):
    """Count how many history entries were sent by ``address`` and how many were not.

    The sender comparison is case-insensitive: the outputs recorded in this
    notebook show API addresses in lowercase while input wallets are
    mixed-case, so an exact string comparison never matched those wallets.

    Args:
        history: transaction dicts, each with a ``'from'`` key.
        address: wallet address, in any letter case.

    Returns:
        ``(from_count, to_count)`` — note the order: entries whose ``'from'``
        is ``address`` come first, all remaining entries second.
    """
    wallet = str(address).lower()
    from_count = sum(1 for txn in history if str(txn['from']).lower() == wallet)
    # Every entry not sent by the wallet is counted on the "to" side.
    to_count = len(history) - from_count
    return from_count, to_count

In [None]:
def fetch(address,nested_list):
    """Query the on-chain features for one wallet and append them as a row.

    The appended row is
    ``[address, txn_count, reg_age, erc_age, reg_to, reg_from, erc_to,
    erc_from, tdd]``, matching the header order used by the query cells
    (``..., 'to_count', 'from_count', 'erc_to', 'erc_from', 'TDD'``).

    Args:
        address: wallet address to query.
        nested_list: list that receives the new row (mutated in place).

    Returns:
        None.
    """
    reg_hist = get_transaction_history(address)
    erc20_hist = get_Erc20_transaction_history(address)

    reg_age = get_wallet_age(reg_hist)
    erc_age = get_wallet_age(erc20_hist)

    # to_and_from returns (from_count, to_count); unpack in that order so the
    # values land under the matching column. Rows built before this fix carry
    # the two counts under the opposite labels.
    reg_from,reg_to = to_and_from(reg_hist,address)
    erc_from,erc_to = to_and_from(erc20_hist,address)

    # TDD is a project helper; it is unpacked here as (tdd, txn_count).
    tdd,txn_count = TDD(address)

    row = [address,txn_count,reg_age,erc_age,reg_to,reg_from,erc_to,erc_from,tdd]
    nested_list.append(row)

## **Alpha ETH**

In [None]:
# Alpha-round ETH votes and the 'Eth Infra' sheet of the sybil review workbook.
Alpha_Eth = pd.read_csv('voters/alpha_eth.csv')
Infra_sybils = pd.read_excel(
    'research_data/alpha_round_sybils.xlsx',
    sheet_name='Eth Infra',
)
# Votes without a supporter wallet cannot be attributed, so drop them.
Alpha_Eth = Alpha_Eth.dropna(subset=['supporterwallet'])

In [None]:
# Sheet rows that are both flagged as suspected sybils and manually reviewed,
# with the address column renamed to the key used by the vote table.
known_sybils = Infra_sybils[
    Infra_sybils['Suspected sybil?'].eq(True) & Infra_sybils['Reviewed Manually'].eq(True)
].rename(columns={'Source Address': 'supporterwallet'})

# Left join keeps every reviewed sybil row; sheet/vote bookkeeping columns are dropped.
merged = known_sybils.merge(Alpha_Eth, how='left', on='supporterwallet')
merged = merged.drop(columns=['ID', 'Token', 'Amount','Etherscan','projectid','txnhash','communityid'])

In [None]:
# Number of merged rows per known-sybil wallet, as a two-column frame.
address_counts = merged['supporterwallet'].value_counts()
Address_info_sybil = address_counts.rename_axis('Address').reset_index(name='Count')

In [None]:
# Rows (votes) per wallet.
funding_counts = Alpha_Eth['supporterwallet'].value_counts()

# One row per (wallet, project title) pair; counting those rows per wallet
# gives the number of distinct project titles each wallet funded.
count_by_address_project = (
    Alpha_Eth
    .groupby(['supporterwallet', 'project_title'])
    .size()
    .reset_index(name='count')
)
no_grants_funded = count_by_address_project['supporterwallet'].value_counts()

# Turn both count series into frames keyed by wallet and join them.
Address_info1 = funding_counts.rename_axis('supporterwallet').reset_index(name='Funding_count')
Address_info2 = no_grants_funded.rename_axis('supporterwallet').reset_index(name='No_Grants_Funded')
Address_info_eth = Address_info1.merge(Address_info2)

# Sanity check: one row per unique wallet.
len(Address_info_eth) == len(Alpha_Eth['supporterwallet'].unique())

True

In [None]:
# Summary statistics of the per-wallet counts; used to pick the <=5 cut-offs below.
Address_info_eth.describe()

Unnamed: 0,Funding_count,No_Grants_Funded
count,4777.0,4777.0
mean,4.382248,4.244714
std,5.236653,5.001276
min,1.0,1.0
25%,1.0,1.0
50%,2.0,2.0
75%,5.0,5.0
max,44.0,22.0


In [None]:
# Keep wallets with at most 5 votes and at most 5 distinct projects.
Address_info_eth_filtered = Address_info_eth[
    (Address_info_eth['No_Grants_Funded'] <= 5)
    & (Address_info_eth['Funding_count'] <= 5)
].reset_index(drop=True)

In [None]:
# Bring the vote rows back for the retained wallets, minus bookkeeping columns.
filtered = (
    Address_info_eth_filtered
    .merge(Alpha_Eth, how='left', on='supporterwallet')
    .drop(columns=['projectid','txnhash','exchangerate','updatedat'])
)

# Per-wallet columns carried into the similarity features.
data_points = [
    'supporterwallet', 'Funding_count', 'No_Grants_Funded', 'token',
    'amount', 'createdat', 'passportscore', 'passportpass',
]

### Querying Eth Infra

In [None]:
# headers = ['supporterwallet','txn_count','Wallet_Age','Wallet_Age(Erc20)','to_count','from_count','erc_to','erc_from','TDD']
# contents = []

# count = 0
# for i in merged['supporterwallet'].unique():
#     print(count)
#     fetch(i,contents)
#     count+=1

#sybil_alpha_sybil_data = pd.DataFrame(contents,columns=headers)
#sybil_alpha_sybil_data.to_csv('queried_data/eth_infra_sybil_data.csv',index=False)

In [None]:
# headers = ['supporterwallet','first_date','last_date','first_from','first_to','last_from','last_to']
# contents = []

# count = 0
# for i in eth_infra_q['supporterwallet']:
#     print(count)
#     info = first_last_info(i)
#     contents.append(info)
#     count+=1

# data = pd.DataFrame(contents,columns=headers)

In [None]:
# count = 0
# for i in filtered['supporterwallet'].unique():
#     print(count)
#     fetch(i,contents)
#     count+=1

# alpha_sybil_data = pd.DataFrame(contents,columns=headers)

# eth_infra_q = pd.merge(data,alpha_sybil_data,on='supporterwallet')

# alpha_sybil_data.to_csv('queried_data/eth_infra_data.csv',index=False)

In [None]:
# Load the cached query results instead of re-running the commented-out
# Etherscan loops above.
eth_infra_q = pd.read_csv('queried_data/eth_infra_data.csv')

In [None]:
# Duplicate the wallet column so it can serve as the group key while
# 'supporterwallet' is still available to aggregate.
filtered['address'] = filtered['supporterwallet']

# Normalise each title: lowercase, split on whitespace, sort the words, join with '-'.
filtered['project_title_sorted'] = filtered['project_title'].apply(lambda x: '-'.join(sorted(x.lower().split())))

# One row per wallet: its normalised titles concatenated with '_' into a single
# signature string.
# NOTE(review): titles are joined in row order, not sorted, so two wallets that
# funded the same projects in a different order would get different strings —
# confirm whether that is intended.
df_result = filtered.groupby('address').agg({'supporterwallet': 'first',
                                       'project_title_sorted': '_'.join}).reset_index()

# Sort by the signature string in descending (lexicographic) order — this is
# not a sort by the number of titles — and keep only the two needed columns.
df_result = df_result.sort_values(by='project_title_sorted', ascending=False)[['supporterwallet','project_title_sorted']]

# First row of the selected columns for each wallet.
cut_filtered = filtered[data_points].drop_duplicates(subset=['supporterwallet'])

# Attach the project signature to the per-wallet row.
cultivate_data_infra = pd.merge(cut_filtered,df_result,on='supporterwallet',how='left')

## **Beta Round**

In [None]:
# Beta-round votes.
beta = pd.read_csv('voters/beta_round_votes.csv')

In [None]:
# Align the beta column names with the ones used for the alpha round.
beta = beta.rename(
    columns={'voter': 'supporterwallet', 'grantAddress': 'project_addresses'}
)

In [None]:
# Rows (votes) per wallet.
funding_counts = beta['supporterwallet'].value_counts()

# One row per (wallet, project address) pair; counting those rows per wallet
# gives the number of distinct projects each wallet funded.
count_by_address_project = (
    beta
    .groupby(['supporterwallet', 'project_addresses'])
    .size()
    .reset_index(name='count')
)
no_grants_funded = count_by_address_project['supporterwallet'].value_counts()

# Turn both count series into frames keyed by wallet and join them.
Address_info_beta_1 = funding_counts.rename_axis('supporterwallet').reset_index(name='Funding_count')
Address_info_beta_2 = no_grants_funded.rename_axis('supporterwallet').reset_index(name='No_Grants_Funded')
Address_info_beta = Address_info_beta_1.merge(Address_info_beta_2)

# Sanity check: one row per unique wallet.
len(Address_info_beta) == len(beta['supporterwallet'].unique())

True

In [None]:
# Keep wallets with at most 5 votes and at most 5 distinct projects.
# Author's note: the cut-offs were formerly <=5 and <10.
Address_info_beta_filtered = Address_info_beta[
    (Address_info_beta['No_Grants_Funded'] <= 5)
    & (Address_info_beta['Funding_count'] <= 5)
].reset_index(drop=True)

In [None]:
# Bring the vote rows back for the retained wallets, minus bookkeeping columns.
filtered_beta = (
    Address_info_beta_filtered
    .merge(beta, how='left', on='supporterwallet')
    .drop(columns=[
        'id', 'transaction', 'blockNumber', 'projectId',
        'applicationId', 'roundId', 'amountUSD', 'amountRoundToken',
    ])
)

In [None]:
# Display the filtered beta votes (one row per vote of a retained wallet).
filtered_beta

Unnamed: 0,supporterwallet,Funding_count,No_Grants_Funded,project_addresses,token,amount,createdAt
0,0x6fA59B3D434Bf77709449179cE657230f299B684,5,5,0xe126b3E5d052f1F575828f61fEBA4f4f2603652a,0x6B175474E89094C44Da98b954EedeAC495271d0F,2000000000000000000,2023-05-09 01:46:47
1,0x6fA59B3D434Bf77709449179cE657230f299B684,5,5,0x18aa467E40E1deFB1956708830A343c1D01d3D7C,0x6B175474E89094C44Da98b954EedeAC495271d0F,2000000000000000000,2023-05-09 01:46:47
2,0x6fA59B3D434Bf77709449179cE657230f299B684,5,5,0x99b36fDbC582D113aF36A21EBa06BFEAb7b9bE12,0x6B175474E89094C44Da98b954EedeAC495271d0F,2000000000000000000,2023-05-09 01:46:47
3,0x6fA59B3D434Bf77709449179cE657230f299B684,5,5,0x5683921269B041fbE66882dd6CADd48B7e5b5817,0x6B175474E89094C44Da98b954EedeAC495271d0F,2000000000000000000,2023-05-09 01:46:47
4,0x6fA59B3D434Bf77709449179cE657230f299B684,5,5,0x08a3c2A819E3de7ACa384c798269B3Ce1CD0e437,0x6B175474E89094C44Da98b954EedeAC495271d0F,2000000000000000000,2023-05-09 01:46:47
...,...,...,...,...,...,...,...
26215,0xb2F4E2c5a7b9BBDD2C86642cd91bcdAe8A14A9E0,1,1,0x3A5bd1E37b099aE3386D13947b6a90d97675e5e3,0x0000000000000000000000000000000000000000,18144745550000000,2023-05-03 19:37:23
26216,0x36dbB4D3644a7008B0fE6D48F2f88dAb2c28a12E,1,1,0x3A5bd1E37b099aE3386D13947b6a90d97675e5e3,0x0000000000000000000000000000000000000000,18147700000000000,2023-05-03 19:37:23
26217,0x713ACc1b8F040509593B8A64fA45714FF2bA4100,1,1,0x3A5bd1E37b099aE3386D13947b6a90d97675e5e3,0x0000000000000000000000000000000000000000,18110000000000000,2023-05-03 19:37:23
26218,0xB675695293e7CFAd87bF2c98ae5c6b05EB98189e,1,1,0x3A5bd1E37b099aE3386D13947b6a90d97675e5e3,0x0000000000000000000000000000000000000000,17145450000000000,2023-05-03 19:37:23


In [None]:
# Number of distinct wallets that survived the filter.
len(filtered_beta['supporterwallet'].unique())

11253

### Querying beta

In [None]:
# headers = ['supporterwallet','first_date','last_date','first_from','first_to','last_from','last_to']
# contents = []

# count = 0
# for i in filtered_beta['supporterwallet'].unique():
#     print(count)
#     info = first_last_info(i)
#     contents.append(info)
#     count+=1
# data = pd.DataFrame(contents,columns=headers)

In [None]:
# headers = ['supporterwallet','txn_count','Wallet_Age','Wallet_Age(Erc20)','to_count','from_count','erc_to','erc_from','TDD']
# contents = []

# count = 1000
# for i in filtered_beta['supporterwallet'].unique()[1000:]:
#     print(count)
#     fetch(i,contents)
#     count+=1


# beta_data = pd.DataFrame(contents,columns=headers)
# df = pd.merge(data,beta_data,on='supporterwallet',how='right')
# df.to_csv('queried_data/beta_data.csv',index=False)

In [None]:
# Load the cached query results instead of re-running the commented-out
# Etherscan loops above.
beta_data = pd.read_csv('queried_data/beta_data.csv')

In [None]:
# Display the cached on-chain features for the beta wallets.
beta_data

Unnamed: 0,supporterwallet,first_date,last_date,first_from,first_to,last_from,last_to,txn_count,Wallet_Age,Wallet_Age(Erc20),to_count,from_count,erc_to,erc_from,TDD
0,0x73f432dA307525B6590c858bf458116322D50d16,1616995739,1627382441,0x6c03d2e5340c256838cf8b96e05e23a1a911fe0f,0x73f432da307525b6590c858bf458116322d50d16,0x73f432da307525b6590c858bf458116322d50d16,0xd9e1ce17f2641f24ae83637ab66a2cca9c378b9f,418,766,727,0,100,0,100,0.387560
1,0x8F59D415FFd78575d9C674d32aC114a41aFd3D30,1655442455,1661061630,0x8f59d415ffd78575d9c674d32ac114a41afd3d30,0xd945f759d422ae30a6166838317b937de08380e3,0x8f59d415ffd78575d9c674d32ac114a41afd3d30,0xb6728bff344e2109dbb3a0c153cad65ec4bdb736,481,321,319,0,100,0,100,0.222453
2,0xb63DF1cCb8f816426AccEaEBB6B18518b296CaD9,1629658843,1680359255,0xdfd5293d8e347dfe59e90efd55b2956a1343963d,0xb63df1ccb8f816426acceaebb6b18518b296cad9,0xb63df1ccb8f816426acceaebb6b18518b296cad9,0x2bd938cf96430b7b0879f76b010b589aeec2127c,80,620,392,0,100,0,16,4.487500
3,0xadc1d13ACf57EA12cdF14679177Fd669221ADF2c,1604104915,1634061330,0x4508f5e7d98582cd202f212090cefe542a29f9dd,0xadc1d13acf57ea12cdf14679177fd669221adf2c,0xadc1d13acf57ea12cdf14679177fd669221adf2c,0x967da4048cd07ab37855c090aaf366e4ce1b9f48,290,915,916,0,100,0,100,0.703448
4,0x3a4974C33f81bf1bBB7aA6C6Bcf1b0a616C86F30,1636936663,1669075499,0x21a31ee1afc51d94c2efccaa2092ad1028285549,0x3a4974c33f81bf1bbb7aa6c6bcf1b0a616c86f30,0x3a4974c33f81bf1bbb7aa6c6bcf1b0a616c86f30,0x495f947276749ce646f68ac8c248420045cb7b5e,230,535,551,0,100,0,84,0.369565
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12753,0xd5b43B05D998dA3b2179c51EC42fF8E5860047B3,1519565006,1683535547,0x37f7ff1864f5d5cba2b63592656049f2d7d06fb4,0xd5b43b05d998da3b2179c51ec42ff8e5860047b3,0xd5b43b05d998da3b2179c51ec42ff8e5860047b3,0x12bb5bbbfe596dbc489d209299b8302c3300fa40,43,1903,1903,0,53,0,53,43.558140
12754,0x0af06723a312b8343E4D8EB7252B386fc977118B,1679242043,1683535643,0x4976a4a02f38326660d17bf34b431dc6e2eb2327,0x0af06723a312b8343e4d8eb7252b386fc977118b,0x0af06723a312b8343e4d8eb7252b386fc977118b,0x12bb5bbbfe596dbc489d209299b8302c3300fa40,7,55,54,0,13,0,3,6.714286
12755,0x29A44fAd7B272a69237446A041A287e94fFaAE0D,1679242139,1683535679,0x28c6c06298d514db089934071355e5743bf21d60,0x29a44fad7b272a69237446a041a287e94ffaae0d,0x29a44fad7b272a69237446a041a287e94ffaae0d,0x12bb5bbbfe596dbc489d209299b8302c3300fa40,7,55,54,0,14,0,3,6.571429
12756,0xAae12BEF2916C851b3925274879269917171Fa53,1666638275,1683535679,0xc098b2a3aa256d2140208c3de6543aaef5cd3a94,0xaae12bef2916c851b3925274879269917171fa53,0xaae12bef2916c851b3925274879269917171fa53,0x12bb5bbbfe596dbc489d209299b8302c3300fa40,81,201,187,0,96,0,15,2.259259


In [None]:
# Per-wallet columns carried from the beta vote table into the feature table.
beta_points =['supporterwallet','Funding_count','No_Grants_Funded']

In [None]:
# Duplicate the wallet column so it can serve as the group key while
# 'supporterwallet' is still available to aggregate.
filtered_beta['address'] = filtered_beta['supporterwallet']

# Same normalisation as for the alpha titles: lowercase, split on whitespace,
# sort the parts, join with '-'. The column holds project addresses here.
filtered_beta['project_title_sorted'] = filtered_beta['project_addresses'].apply(lambda x: '-'.join(sorted(x.lower().split())))

# One row per wallet: its project addresses concatenated with '_' into a single
# signature string.
# NOTE(review): addresses are joined in row order, not sorted, so the same set
# of projects funded in a different order gives a different string — confirm
# whether that is intended.
df_result = filtered_beta.groupby('address').agg({'supporterwallet': 'first',
                                       'project_title_sorted': '_'.join}).reset_index()

# Sort by the signature string in descending (lexicographic) order — this is
# not a sort by the number of projects — and keep only the two needed columns.
df_result = df_result.sort_values(by='project_title_sorted', ascending=False)[['supporterwallet','project_title_sorted']]

# First row of the selected columns for each wallet.
cut_filtered_beta = filtered_beta[beta_points].drop_duplicates(subset=['supporterwallet'])

# Attach the project signature to the per-wallet row.
cultivate_data_beta = pd.merge(cut_filtered_beta,df_result,on='supporterwallet',how='left')

In [None]:
# Check column dtypes and non-null counts of the per-wallet beta table.
cultivate_data_beta.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11253 entries, 0 to 11252
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   supporterwallet       11253 non-null  object
 1   Funding_count         11253 non-null  int64 
 2   No_Grants_Funded      11253 non-null  int64 
 3   project_title_sorted  11253 non-null  object
dtypes: int64(2), object(2)
memory usage: 439.6+ KB


## **Grouping**

In [None]:
# Join each round's per-wallet table with its cached on-chain features on
# 'supporterwallet' (no `how` given, so pandas' default join type applies).
stats_data_infra = pd.merge(cultivate_data_infra,eth_infra_q,on='supporterwallet')
stats_data_beta = pd.merge(cultivate_data_beta,beta_data,on='supporterwallet')
# Keep one row per wallet in the beta table.
# NOTE(review): the infra table is not de-duplicated here; if eth_infra_q can
# contain repeated wallets, a wallet would later match its own duplicate row in
# the similarity step — confirm whether infra needs the same treatment.
stats_data_beta.drop_duplicates(subset='supporterwallet',inplace=True)

In [None]:
# Count missing values per column of the merged beta table.
stats_data_beta.isnull().sum()

supporterwallet         0
Funding_count           0
No_Grants_Funded        0
project_title_sorted    0
first_date              0
last_date               0
first_from              0
first_to                0
last_from               0
last_to                 0
txn_count               0
Wallet_Age              0
Wallet_Age(Erc20)       0
to_count                0
from_count              0
erc_to                  0
erc_from                0
TDD                     0
dtype: int64

In [None]:
# Reorder the columns of both tables by sorted column name.
stats_data_infra = stats_data_infra.reindex(sorted(stats_data_infra.columns), axis=1)
stats_data_beta = stats_data_beta.reindex(sorted(stats_data_beta.columns), axis=1)

In [None]:
# Check column order, dtypes and non-null counts after the reorder.
stats_data_beta.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11253 entries, 0 to 11261
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Funding_count         11253 non-null  int64  
 1   No_Grants_Funded      11253 non-null  int64  
 2   TDD                   11253 non-null  float64
 3   Wallet_Age            11253 non-null  int64  
 4   Wallet_Age(Erc20)     11253 non-null  int64  
 5   erc_from              11253 non-null  int64  
 6   erc_to                11253 non-null  int64  
 7   first_date            11253 non-null  int64  
 8   first_from            11253 non-null  object 
 9   first_to              11253 non-null  object 
 10  from_count            11253 non-null  int64  
 11  last_date             11253 non-null  int64  
 12  last_from             11253 non-null  object 
 13  last_to               11253 non-null  object 
 14  project_title_sorted  11253 non-null  object 
 15  supporterwallet    

In [None]:
# Replace any remaining missing values with 0.0 in both feature tables.
stats_data_beta = stats_data_beta.fillna(0.0)
stats_data_infra = stats_data_infra.fillna(0.0)

In [None]:
# Work on copies so the label encoding below leaves the merged tables untouched.
stats_data_i = stats_data_infra.copy()
stats_data_b = stats_data_beta.copy()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Text columns of the infra table that are replaced by integer label codes.
# NOTE(review): these codes are later used as numeric features for cosine
# similarity; the codes are arbitrary integers, so numeric closeness between
# two codes presumably carries no meaning — confirm this is acceptable.
columns_to_encode = ['token', 'project_title_sorted', 'first_from', 'first_to', 'last_from', 'last_to']

# A fresh encoder is fitted per column, so codes are independent across columns.
for col in columns_to_encode:
    le = LabelEncoder()
    stats_data_i[col] = le.fit_transform(stats_data_i[col])

# Same encoding for the beta table (no 'token' in this list).
columns_to_encode = ['project_title_sorted', 'first_from', 'first_to', 'last_from', 'last_to']

for col in columns_to_encode:
    le = LabelEncoder()
    stats_data_b[col] = le.fit_transform(stats_data_b[col])
    # Progress output: name of the column just encoded.
    print(col)

project_title_sorted
first_from
first_to
last_from
last_to


In [None]:
# Columns of the infra table used as the feature vector for cosine similarity.
# 'supporterwallet' is the identifier and is deliberately not part of the list.
feature_infra = ['Funding_count', 'No_Grants_Funded', 'token','amount', 'passportscore', 'passportpass',
            'project_title_sorted','first_from','first_to', 'last_from', 'last_to', 'txn_count', 'Wallet_Age',
            'Wallet_Age(Erc20)', 'to_count', 'from_count', 'erc_to', 'erc_from']

###  Eth Infra

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

data = np.array(stats_data_i[feature_infra])

# Prepend the wallet address as column 0 so a row index maps back to a wallet.
supporterwallet_col = stats_data_i['supporterwallet'].values.reshape(-1, 1)
data = np.hstack((supporterwallet_col, data))

# Pairwise cosine similarity on the feature columns only. The stacked array
# also holds the wallet strings, so cast the numeric slice to float explicitly
# instead of relying on implicit conversion.
similarity_matrix = cosine_similarity(data[:, 1:].astype(float))

# Rows at or above this similarity are treated as near-identical.
threshold = 0.9999

# Wallets that have at least one other near-identical row, in first-seen order.
similar_supporters_infra = []
# Companion set for O(1) "already recorded" checks; scanning the list itself
# on every candidate is quadratic in the number of flagged wallets.
seen_supporters = set()

for i in range(len(similarity_matrix)):
    similar_supporter_indices = np.where(similarity_matrix[i] >= threshold)[0]
    # More than one index means a match besides the row itself.
    if len(similar_supporter_indices) > 1:
        similar_supporter_wallets = [data[j][0] for j in similar_supporter_indices]
        for supporter in similar_supporter_wallets:
            if supporter not in seen_supporters:
                seen_supporters.add(supporter)
                similar_supporters_infra.append(supporter)

In [None]:
# Same similarity computation as the previous cell, but keeping whole rows so
# each group of near-identical wallets can be printed for inspection.
data = stats_data_i[feature_infra].astype(float)

# Add supporterwallet column to data
supporterwallet_col = stats_data_i['supporterwallet'].values.reshape(-1, 1)
data = np.hstack((supporterwallet_col, data))

# Calculate cosine similarity matrix
similarity_matrix = cosine_similarity(data[:, 1:])

# Set threshold for grouping together similar rows
threshold = 0.9999

# Initialize list to store similar rows
similar_rows = []

# Loop through similarity matrix and group together similar rows.
# Each group is the neighbour list of one row; only exactly identical lists
# are skipped, so overlapping groups are not merged into one cluster.
for i in range(len(similarity_matrix)):
    similar_row_indices = np.where(similarity_matrix[i] >= threshold)[0]
    if len(similar_row_indices) > 1:
        similar_row_values = [tuple(data[j]) for j in similar_row_indices]
        if similar_row_values not in similar_rows:
            similar_rows.append(similar_row_values)

# Print out the similar rows: wallet first, then its feature values
for i, row_group in enumerate(similar_rows):
    print(f"Similar Row Group {i}:")
    for row in row_group:
        print(row[0], row[1:])
    print()


Similar Row Group 0:
0x992129ad0e5c642f4d088ba6b51da823588b09e1 (4.0, 4.0, 1.0, 0.0019, 22.24, 1.0, 310.0, 1221.0, 90.0, 151.0, 493.0, 29.0, 160.0, 158.0, 29.0, 8.0, 1.0, 2.0)
0x8be31a599d12b6bf92a21adc211b0065505915af (4.0, 4.0, 1.0, 0.0019407, 22.22, 1.0, 310.0, 1221.0, 90.0, 151.0, 493.0, 33.0, 160.0, 158.0, 33.0, 9.0, 1.0, 2.0)

Similar Row Group 1:
0xfbbd5070addc38f53f95bc5259c2d0f9e90a92eb (3.0, 3.0, 1.0, 0.00065, 35.69, 1.0, 437.0, 1382.0, 43.0, 149.0, 1260.0, 46.0, 675.0, 704.0, 46.0, 26.0, 3.0, 5.0)
0xfa6b1a941c8f9515ab2122b47b4c806efb13b7b9 (3.0, 3.0, 1.0, 0.00065, 35.69, 1.0, 437.0, 1382.0, 43.0, 149.0, 1260.0, 50.0, 675.0, 704.0, 50.0, 26.0, 3.0, 6.0)
0xa879b4fef26dfd1cd35843ddd6071f2bc63ebfb7 (3.0, 3.0, 1.0, 0.00065, 35.69, 1.0, 437.0, 1382.0, 43.0, 149.0, 1260.0, 45.0, 675.0, 704.0, 45.0, 25.0, 3.0, 6.0)
0xb03ae245b2bb3526fdb511954b3775f11db12458 (3.0, 3.0, 1.0, 0.00065, 30.71, 1.0, 437.0, 1382.0, 43.0, 149.0, 1260.0, 44.0, 675.0, 704.0, 44.0, 26.0, 3.0, 7.0)

Similar Row

In [None]:
# Display the wallets flagged by the infra similarity pass.
similar_supporters_infra

['0x992129ad0e5c642f4d088ba6b51da823588b09e1',
 '0x8be31a599d12b6bf92a21adc211b0065505915af',
 '0xfbbd5070addc38f53f95bc5259c2d0f9e90a92eb',
 '0xfa6b1a941c8f9515ab2122b47b4c806efb13b7b9',
 '0xa879b4fef26dfd1cd35843ddd6071f2bc63ebfb7',
 '0xb03ae245b2bb3526fdb511954b3775f11db12458',
 '0xfdee0c33f3493a1b83cc63bb5fbfa766572fbc03',
 '0xc900edd003feea41c0efb57262ba3d70608e7e22',
 '0xfc888fadd4063cb98041a59a232183a6d4306da2',
 '0xe6a1ff1cdb17130212e8a1ff596b8beadf112735',
 '0xcbc0012dd50f9e3920d55ea25aa6a22104c1c9ca',
 '0x90b5c0a0f414c7824d54802c5ab0c9fb10fb82d8',
 '0x508d4589ba566f94edb89a4ab15466ee128dffc3',
 '0x7e646d9f5ee2947c66e87aa414f8a5e7b26dd380',
 '0x5f0eb11a1232a771fcb91f7f796a5ee4cb217217',
 '0x68c882487920d8c114b9f80c4bbf882f60befc5b',
 '0xa2b1c8798cac94cab5a7b493a473979a724e1a1c',
 '0x2e884770f9e11ba6b649039e34f7504ada6ad8bb',
 '0x29d6f812e973dcbbc2e28d3c24c4baf371afd893',
 '0xc6d0b227ffcb0c7bb7eef6cde19e173a6b3ea7d5',
 '0xbdbe9e680d06fa764976d7ef2eee8cece745051d',
 '0x4e088ee46

In [None]:
# Persist the flagged infra wallets as a single-column CSV ('address').
infra_identified_sybils = pd.DataFrame(similar_supporters_infra,columns=['address'])
infra_identified_sybils.to_csv('identified_sybils/eth_infra.csv',index=False)

### Beta

In [None]:
# Columns of the beta table used as the feature vector for cosine similarity.
# Unlike feature_infra, this list includes 'TDD' and omits 'first_from',
# 'last_from' and 'last_to'.
feature_beta = ['Funding_count', 'No_Grants_Funded', 'TDD', 'Wallet_Age',
       'Wallet_Age(Erc20)', 'erc_from', 'erc_to',
       'first_to', 'from_count',
       'project_title_sorted', 'to_count', 'txn_count']

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

data = np.array(stats_data_b[feature_beta])

# Prepend the wallet address as column 0 so a row index maps back to a wallet.
supporterwallet_col = stats_data_b['supporterwallet'].values.reshape(-1, 1)
data = np.hstack((supporterwallet_col, data))

# Pairwise cosine similarity on the feature columns only. The stacked array
# also holds the wallet strings, so cast the numeric slice to float explicitly
# instead of relying on implicit conversion.
similarity_matrix = cosine_similarity(data[:, 1:].astype(float))

# Rows at or above this similarity are treated as near-identical.
threshold  = 0.9999999

# Wallets that have at least one other near-identical row, in first-seen order.
similar_supporters_beta = []
# Companion set for O(1) "already recorded" checks; scanning the list itself
# on every candidate is quadratic in the number of flagged wallets.
seen_supporters = set()

for i in range(len(similarity_matrix)):
    similar_supporter_indices = np.where(similarity_matrix[i] >= threshold)[0]
    # More than one index means a match besides the row itself.
    if len(similar_supporter_indices) > 1:
        similar_supporter_wallets = [data[j][0] for j in similar_supporter_indices]
        for supporter in similar_supporter_wallets:
            if supporter not in seen_supporters:
                seen_supporters.add(supporter)
                similar_supporters_beta.append(supporter)

In [None]:
# Display the wallets flagged by the beta similarity pass.
similar_supporters_beta

['0xf90Eb69ce07a40F4542E87597F9DEDB82ddC5e19',
 '0xF86ed52D5403eF8bf2AaC52d90Ad34BEECe24013',
 '0x4A692123F3713C89D34aA8a5ad926CE54C523377',
 '0x04D6e709A9eB0e7c04D515Fa73f37F2ef873E502',
 '0xd8c2B741cb31e5352dC32fb3F0819D03A2c4c01c',
 '0xD8Fd11d65f3f9805e40526892f3034C75a2580D3',
 '0xc098b08b4D7dC5deea18Bc917ab2d5aE542910D0',
 '0xBFbd5547568D9F31Fd4E6A1Dd0fce342255D4210',
 '0xFF7254BD5af2e04C7Cb12E4399e8D7cd1eB66491',
 '0xFEEDFA7135BFB755875f9665F4f934Bf3dF21dD5',
 '0xE50274A0fE6A3466bBCE6B85012ac30C22B02632',
 '0xcaB97B599D25a84F9a0AFd85751Ed387f18380fF',
 '0xb62462465E2697F2275F668E775710Fa24fC9394',
 '0x633D4Cd6bB4d0c227792537Ad006705A4191cE02',
 '0x7BE839eA6214F231f0Bb893EE6dC6b62506f7f6c',
 '0x9234938dB2d9bD6D944d27f4F7b17187c81a5d7A',
 '0x00Bf528C0e7Ce508E6c20642FDAE6B8728E4122b',
 '0xCa5ED8B92856bF76e025d31A9439d3eC1f1F644c',
 '0xa73B1c9e679A973Ac944D451Eb497A7448e9CA22',
 '0xb8ecE7d8E375056a1429d9109DaeF89384F7387a',
 '0xB33F3Cbf89ca501CdD903bFcEFeee6B66d7A6C42',
 '0x300C018D3

In [None]:
# Same similarity computation as the beta cell above, but keeping whole rows so
# each group of near-identical wallets can be printed for inspection.
data = stats_data_b[feature_beta]

# Add supporterwallet column to data
supporterwallet_col = stats_data_b['supporterwallet'].values.reshape(-1, 1)
data = np.hstack((supporterwallet_col, data))

# Calculate cosine similarity matrix
similarity_matrix = cosine_similarity(data[:, 1:])

# Set threshold for grouping together similar rows
threshold = 0.9999999

# Initialize list to store similar rows
similar_rows = []

# Loop through similarity matrix and group together similar rows.
# Each group is the neighbour list of one row; only exactly identical lists
# are skipped, so overlapping groups are not merged into one cluster.
for i in range(len(similarity_matrix)):
    similar_row_indices = np.where(similarity_matrix[i] >= threshold)[0]
    if len(similar_row_indices) > 1:
        similar_row_values = [tuple(data[j]) for j in similar_row_indices]
        if similar_row_values not in similar_rows:
            similar_rows.append(similar_row_values)

# Print out the similar rows: wallet first, then its feature values
for i, row_group in enumerate(similar_rows):
    print(f"Similar Row Group {i}:")
    for row in row_group:
        print(row[0], row[1:])
    print()


Similar Row Group 0:
0xf90Eb69ce07a40F4542E87597F9DEDB82ddC5e19 (5.0, 5.0, 16.214285714285715, 234.0, 0.0, 0.0, 0.0, 10529.0, 19.0, 1291.0, 0.0, 14.0)
0xF86ed52D5403eF8bf2AaC52d90Ad34BEECe24013 (5.0, 5.0, 16.214285714285715, 234.0, 0.0, 0.0, 0.0, 10502.0, 19.0, 1291.0, 0.0, 14.0)

Similar Row Group 1:
0x4A692123F3713C89D34aA8a5ad926CE54C523377 (5.0, 5.0, 2.3902439024390243, 113.0, 0.0, 0.0, 0.0, 3263.0, 45.0, 3329.0, 0.0, 41.0)
0x04D6e709A9eB0e7c04D515Fa73f37F2ef873E502 (5.0, 5.0, 2.45, 113.0, 0.0, 0.0, 0.0, 3263.0, 45.0, 3329.0, 0.0, 40.0)

Similar Row Group 2:
0xd8c2B741cb31e5352dC32fb3F0819D03A2c4c01c (4.0, 4.0, 19.866666666666667, 313.0, 233.0, 1.0, 0.0, 9144.0, 20.0, 843.0, 0.0, 15.0)
0xD8Fd11d65f3f9805e40526892f3034C75a2580D3 (4.0, 4.0, 17.470588235294116, 313.0, 233.0, 1.0, 0.0, 9155.0, 22.0, 843.0, 0.0, 17.0)

Similar Row Group 3:
0xc098b08b4D7dC5deea18Bc917ab2d5aE542910D0 (4.0, 4.0, 21.428571428571427, 313.0, 0.0, 0.0, 0.0, 8141.0, 22.0, 397.0, 0.0, 14.0)
0xBFbd5547568D9F31Fd4

In [None]:
# Persist the flagged beta wallets as a single-column CSV ('address').
beta_identified_sybils = pd.DataFrame(similar_supporters_beta,columns=['address'])
beta_identified_sybils.to_csv('identified_sybils/beta.csv',index=False)