In [1]:
import pandas as pd
import json
import re
import csv
import pyTigerGraph as tg


# Notebook to explore and generate a graph for the Domain Record datasets

* Mangle data to make loading into the graph easier
* Instantiate the graph schema and graph queries (using TigerGraph - www.tigergraph.com )
* Perform community detection
* Collect community statistics to be used as features


In [2]:
!wc -l "../data/processed/better_whois_data.csv" 1

    9344 ../data/processed/better_whois_data.csv
wc: 1: open: No such file or directory
    9344 total


In [3]:
# benign_whois_data.txt

!head "../data/raw/benign_whois_data.txt" 1


==> ../data/raw/benign_whois_data.txt <==
{"griffithspartners.com.au": {"domain_name": "COM.AU", "name_servers": [ "T.AU", "R.AU", "Q.AU", "S.AU" ], "registrant_contact_name": "CEO", "registrant_name": null, "registrar": "Afilias Australia Pty Ltd", "status": [ "serverDeleteProhibited https://afilias.com.au/get-au/whois-status-codes#serverDeleteProhibited", "serverRenewProhibited https://afilias.com.au/get-au/whois-status-codes#serverRenewProhibited", "serverTransferProhibited https://afilias.com.au/get-au/whois-status-codes#serverTransferProhibited", "serverUpdateProhibited https://afilias.com.au/get-au/whois-status-codes#serverUpdateProhibited" ], "updated_date": "2022-03-17 14:34:56"}}
{"gestaoconcurso.com.br": {"admin_c": null, "billing_c": null, "country": null, "creation_date": null, "domain_name": null, "email": null, "expiration_date": null, "name_server": null, "nic_hdl_br": null, "nslastaa": null, "nsstat": null, "owner_c": null, "person": null, "registrant_id": null, "regi

In [4]:
# Process the benign data file - row by row processing of json / dictionary file - Cumbersome but cautious

b = '../data/raw/benign_whois_data.txt'

records = []
error_lines = []

# load the lines into a list, there are some error lines that needs to be dealt with
with open(b) as f: # open file
    for line in f.read().splitlines(): # for each line
        try:
            obj = json.loads(re.sub('.*}{', '{', line)) # load the line, which looks like: {"domain.name": {...values...}}
            records.append(obj)
        except:
            error_lines.append(line)
        
print(f"Number of line errors: {len(error_lines)}")  
print(f"Number of lines: {len(records)}")     


Number of line errors: 0
Number of lines: 4001


In [5]:
record_dict = {}
for item in records:
    name = [x for x in item.keys()][0]
    record_dict[name] = item[name]

In [6]:
# Look for and process any errors -  There should now be none 
# (initial stages of project there were some errors in the data)

error_list = []
for x in record_dict:
    if type(record_dict[x]) != dict:
        error_list.append(x)
        
for x in error_list:
    del record_dict[x]

In [7]:
benign_df = pd.DataFrame.from_dict(record_dict, orient='index')
benign_df.reset_index(inplace=True)

In [8]:
benign_df.head(3)

Unnamed: 0,index,domain_name,name_servers,registrant_contact_name,registrant_name,registrar,status,updated_date,admin_c,billing_c,...,registrant_zip,billing_id,billing_phone_number,billing_postal_code,billing_state_province,admin_country_code,registrar_country_code,registrar_organization_id,registrar_postal_code,registrar_street
0,griffithspartners.com.au,COM.AU,"[T.AU, R.AU, Q.AU, S.AU]",CEO,,Afilias Australia Pty Ltd,[serverDeleteProhibited https://afilias.com.au...,2022-03-17 14:34:56,,,...,,,,,,,,,,
1,gestaoconcurso.com.br,,,,,,,,,,...,,,,,,,,,,
2,21fss.com,"[21FSS.COM, 21fss.com]","[NS33.DOMAINCONTROL.COM, NS34.DOMAINCONTROL.COM]",,,"GoDaddy.com, LLC",[clientDeleteProhibited https://icann.org/epp#...,"[2021-01-19 22:26:36, 2021-01-10 00:51:38]",,,...,,,,,,,,,,


In [9]:
# Visual check of column names

pd.options.display.max_rows = 4000
print(str(list(benign_df.columns)))

['index', 'domain_name', 'name_servers', 'registrant_contact_name', 'registrant_name', 'registrar', 'status', 'updated_date', 'admin_c', 'billing_c', 'country', 'creation_date', 'email', 'expiration_date', 'name_server', 'nic_hdl_br', 'nslastaa', 'nsstat', 'owner_c', 'person', 'registrant_id', 'saci', 'tech_c', 'address', 'city', 'dnssec', 'emails', 'name', 'org', 'referral_url', 'state', 'whois_server', 'zipcode', 'country_code', 'fax', 'phone', 'admin_address', 'admin_name', 'admin_organization', 'registrant_address', 'registrant_organization', 'registrar_address', 'registrar_name', 'tech_address', 'tech_name', 'tech_organization', 'registrar_city', 'registrar_country', 'registrar_zip_code', 'transfer_date', 'domain_status', 'registrant_number', 'registrar_url', 'organization', 'registrar_iana', 'registrant_org', 'registrant_city', 'registrant_country', 'registrant_street', 'registrant_type', 'tech_email', 'tech_fax', 'tech_org', 'tech_phone', 'admin_account_name', 'admin_company_nam

In [10]:
# Process the malicious data set (just like we did th
e benign dataset)

b = '../data/raw/malicious_whois_data.txt'

records = []
error_lines = []

# load the lines into a list, there are some error lines that needs to be dealt with
with open(b) as f: # open file
    for line in f.read().splitlines(): # for each line
        try:
            obj = json.loads(re.sub('.*}{', '{', line)) # load the line, which looks like: {"domain.name": {...values...}}
            records.append(obj)
        except:
            error_lines.append(line)
        
print(f"Number of line errors: {len(error_lines)}")  
print(f"Number of lines: {len(records)}")     


Number of line errors: 0
Number of lines: 2415


In [11]:
# create the dictionary with the data
record_dict = {}
for item in records:
    name = [x for x in item.keys()][0]
    record_dict[name] = item[name]

In [12]:
# Process any errors

error_list = []
for x in record_dict:
    if type(record_dict[x]) != dict:
        error_list.append(x)
        
for x in error_list:
    del record_dict[x]

In [13]:
mal_df = pd.DataFrame.from_dict(record_dict, orient='index')
mal_df.reset_index(inplace=True)

In [14]:
mal_df.head(3)

Unnamed: 0,index,address,city,country,creation_date,dnssec,domain_name,emails,expiration_date,name,...,tech_phone_ext,registrar_zip_code,registrant_type,admin_account_name,admin_company_name,admin_family_name,registrant_company_name,tech_account_name,tech_company_name,tech_family_name
0,reseptors.com,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,KN,"[2021-12-08 08:53:26, 2021-12-08T08:53:26]",unsigned,RESEPTORS.COM,domainabuse@tucows.com,"[2022-12-08 08:53:26, 2022-12-08T08:53:26]",REDACTED FOR PRIVACY,...,,,,,,,,,,
1,contentcdns.net,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,KN,"[2022-02-17 23:18:15, 2022-02-17T23:18:15]",unsigned,CONTENTCDNS.NET,domainabuse@tucows.com,"[2023-02-17 23:18:15, 2023-02-17T23:18:15]",REDACTED FOR PRIVACY,...,,,,,,,,,,
2,izocab.com,"220013, Belarus, Minsk, ul.YA.Kolasa, d.31, kv...",Minsk,BY,2015-12-30 00:30:35,"[unsigned, Unsigned]",IZOCAB.COM,"[abuse@reg.ru, info@brale.ru]",2022-12-30 00:30:35,Vladimir Nikolskii,...,,,,,,,,,,


#########

In [15]:
mal_df.columns 

Index(['index', 'address', 'city', 'country', 'creation_date', 'dnssec',
       'domain_name', 'emails', 'expiration_date', 'name',
       ...
       'tech_phone_ext', 'registrar_zip_code', 'registrant_type',
       'admin_account_name', 'admin_company_name', 'admin_family_name',
       'registrant_company_name', 'tech_account_name', 'tech_company_name',
       'tech_family_name'],
      dtype='object', length=119)

In [16]:
benign_df.columns

Index(['index', 'domain_name', 'name_servers', 'registrant_contact_name',
       'registrant_name', 'registrar', 'status', 'updated_date', 'admin_c',
       'billing_c',
       ...
       'registrant_zip', 'billing_id', 'billing_phone_number',
       'billing_postal_code', 'billing_state_province', 'admin_country_code',
       'registrar_country_code', 'registrar_organization_id',
       'registrar_postal_code', 'registrar_street'],
      dtype='object', length=156)

In [17]:
mal_df['malicious'] = 1
benign_df['malicious'] = 0

In [18]:
combined_df = pd.concat([mal_df,benign_df], axis=0)

In [19]:
len(combined_df['index'])

6342

In [20]:
DomainRecord_cols = ['index', 'dnssec', 'name', 'malicious']

combined_df.loc[:,DomainRecord_cols].to_csv("../data/external/combined_whois_data.csv")

In [21]:
list_of_explode_columns = ['country', 'emails','whois_server', 'domain_status', 'registrar', 
                           'name_servers']

In [22]:
for _col in list_of_explode_columns:
    combined_df.loc[:,['index',_col]].explode(_col).to_csv(
    f"../data/external/combined_whois_data_{_col}.csv")

In [23]:
### Entropy data import

In [24]:
# Read in the file
with open('../data/raw/benign_entropy_data.txt', 'r') as file :
  filedata = file.read()

# Replace the target string
filedata = filedata.replace('{', '')
filedata = filedata.replace('}', '')
filedata = filedata.replace(':', ',')

# Write the file out again
with open('../data/external/benign_entropy_data.txt', 'w') as file:
  file.write(filedata)

In [25]:
# Read in the file
with open('../data/raw/malicious_entropy_data.txt', 'r') as file :
  filedata = file.read()

# Replace the target string
filedata = filedata.replace('{', '')
filedata = filedata.replace('}', '')
filedata = filedata.replace(':', ',')

# Write the file out again
with open('../data/external/malicious_entropy_data.txt', 'w') as file:
  file.write(filedata)

In [26]:
### Load IP Address Org details

import pandas as pd
import json

b = '../data/raw/benign_ip_data.txt'

records = []
error_lines = []

# load the lines into a list, there are some error lines that needs to be dealt with
with open(b) as f: # open file
    for line in f.read().splitlines(): # for each line
        try:
            obj = json.loads(re.sub('.*}{', '{', line)) # load the line, which looks like: {"domain.name": {...values...}}
            records.append(obj)
        except:
            error_lines.append(line)
        
print(f"Number of line errors: {len(error_lines)}")  
print(f"Number of lines: {len(records)}")

Number of line errors: 0
Number of lines: 6670


In [27]:
b = '../data/raw/malicious_ip_data.txt'

# load the lines into a list, there are some error lines that needs to be dealt with
with open(b) as f: # open file
    for line in f.read().splitlines(): # for each line
        try:
            obj = json.loads(re.sub('.*}{', '{', line)) # load the line, which looks like: {"domain.name": {...values...}}
            records.append(obj)
        except:
            error_lines.append(line)
        
print(f"Number of line errors: {len(error_lines)}")  
print(f"Number of lines: {len(records)}")

Number of line errors: 0
Number of lines: 9085


In [28]:
record_dict = {}
for item in records:
    name = [x for x in item.keys()][0]
    record_dict[name] = item[name]

In [29]:
error_list = []
for x in record_dict:
    if type(record_dict[x]) != dict:
        error_list.append(x)
        
for x in error_list:
    del record_dict[x]

In [30]:
org_df = pd.DataFrame.from_dict(record_dict, orient='index')
org_df.reset_index(inplace=True)

In [31]:
org_df['A_Org'] = org_df['A'].apply(pd.Series)['Org']
org_df['MX_Org'] = org_df['MX'].apply(pd.Series)['Org']

In [32]:
org_df.sample(10)

Unnamed: 0,index,A,MX,A_Org,MX_Org
6199,3.baiduyuna.tk,"{'CC': 'US', 'Org': 'Cloudflare, Inc.'}","{'CC': 'NA', 'Org': 'NA'}","Cloudflare, Inc.",
6020,tiloktarifh.top,"{'CC': 'NA', 'Org': 'NA'}","{'CC': 'NA', 'Org': 'NA'}",,
4260,compostobiro.click,"{'CC': 'NA', 'Org': 'NA'}","{'CC': 'NA', 'Org': 'NA'}",,
3972,whatsappespiarapp.com,"{'CC': 'US', 'Org': 'Cloudflare, Inc.'}","{'CC': 'GB', 'Org': 'Carlos Alberto Weand Ortiz'}","Cloudflare, Inc.",Carlos Alberto Weand Ortiz
818,americanrestroom.org,"{'CC': 'US', 'Org': 'WEBSITEWELCOME.COM'}","{'CC': 'US', 'Org': 'Microsoft Corporation'}",WEBSITEWELCOME.COM,Microsoft Corporation
5304,www.contorig2.com,"{'CC': 'US', 'Org': 'Namecheap, Inc.'}","{'CC': 'NA', 'Org': 'NA'}","Namecheap, Inc.",
798,employer.gov,"{'CC': 'US', 'Org': 'US Department of Labor - ...","{'CC': 'NA', 'Org': 'NA'}",US Department of Labor - OASAM,
5868,bumoyez.com,"{'CC': 'NA', 'Org': 'NA'}","{'CC': 'NA', 'Org': 'NA'}",,
5136,osdnvnauurt.xyz,"{'CC': 'NA', 'Org': 'NA'}","{'CC': 'NA', 'Org': 'NA'}",,
4759,jrmcsdjriesibcuuhbgosbpuaebssiae.top,"{'CC': 'US', 'Org': 'Cloudflare, Inc.'}","{'CC': 'NA', 'Org': 'NA'}","Cloudflare, Inc.",


In [33]:
org_df.loc[org_df['MX_Org']!="NA",["index","MX_Org"]].to_csv("../data/external/combined_mx_org.csv")
org_df.loc[org_df['A_Org']!="NA",["index","A_Org"]].to_csv("../data/external/combined_A_org.csv")

In [34]:
!docker ps

CONTAINER ID   IMAGE                                     COMMAND                  CREATED       STATUS      PORTS                                                                                                                               NAMES
38e46715231e   docker.tigergraph.com/tigergraph:latest   "/bin/sh -c '/usr/sb…"   3 weeks ago   Up 7 days   0.0.0.0:9000->9000/tcp, :::9000->9000/tcp, 0.0.0.0:14240->14240/tcp, :::14240->14240/tcp, 0.0.0.0:14022->22/tcp, :::14022->22/tcp   tigergraph


In [35]:
# To start tigergraph docker
# https://docs.tigergraph.com/tigergraph-server/current/getting-started/docker
#  docker run -d -p 14022:22 -p 9000:9000 -p 14240:14240 --name tigergraph --ulimit nofile=1000000:1000000 -v /Users/nb311848/Documents/mystuff/repos/artemis/data/external:/home/tigergraph/data -t docker.tigergraph.com/tigergraph:latest


In [50]:
conn = tg.TigerGraphConnection()

In [51]:
# Clear the Tigergraph server/graph as we want to reproduce the graph creation from scratch
## CAUTION: this wipes everything inside your TigerGraph container !!

print(conn.gsql('drop all', options=[]))

Dropping all, about 1 minute ...
Abort all active loading jobs
Try to abort all loading jobs on graph Artemis, it may take a while ...
[ABORT_SUCCESS] No active Loading Job to abort.
Resetting GPE...
Successfully reset GPE and GSE
Stopping GPE GSE
Successfully stopped GPE GSE in 0.009 seconds
Clearing graph store...
Successfully cleared graph store
Starting GPE GSE RESTPP
Successfully started GPE GSE RESTPP in 0.174 seconds
Everything is dropped.


In [52]:
# Read in the GSQL code that creates the Graph / Schema / Loading Jobs and Queries on TigerGraph

text_file = open("../src/scripts/DBImportExport_Artemis.gsql", "r")
 
#read whole file to a string
artemis_graph_gsql = text_file.read()
 
#close file
text_file.close()
 
# print(artemis_graph_gsql)

In [53]:
# Run the GSQL script and print the results (Check for any errors)

print(conn.gsql(artemis_graph_gsql, options=[]))

Stopping GPE GSE RESTPP
Successfully stopped GPE GSE RESTPP in 30.522 seconds
Starting GPE GSE RESTPP
Successfully started GPE GSE RESTPP in 0.140 seconds
The graph Artemis is created.
Successfully created schema change jobs: [create_artemis_graph].

Current graph version 0
Trying to add vertex DomainRecord.
Trying to add vertex DomainName.
Trying to add vertex Country.
Trying to add vertex City.
Trying to add vertex Emails.
Trying to add vertex Organisation.
Trying to add vertex Nameserver.
Trying to add vertex Registrar.
Trying to add vertex DomainRecordStatus.
Trying to add vertex WhoisServer.
Trying to add edge DomainRecord_DomainName.
Trying to add edge Country_City.
Trying to add edge DomainRecord_Country.
Trying to add edge DomainRecord_Emails.
Trying to add edge DomainRecord_Organisation.
Trying to add edge DomainRecord_Nameserver.
Trying to add edge DomainRecord_Registrar.
Trying to add edge DomainRecord_DomainRecordStatus.
Trying to add edge DomainRecord_WhoisServer.
Trying t

In [54]:
# Generic loading job string

loading_job = """RUN LOADING JOB load_job_whois_data{field} USING MyDataSource="/home/tigergraph/data/combined_whois_data{field}.csv\""""

In [55]:
# Execute the loading job string for the listed data files

print(conn.gsql(loading_job.format(field=""),graphname='Artemis', options=[]))

for x in list_of_explode_columns:
    print(conn.gsql(loading_job.format(field=f"_{x}"),graphname='Artemis', options=[]))                       

[2A
[2K
[2K
[Tip: Use "CTRL + C" to stop displaying the loading status update, then use "SHOW LOADING STATUS jobid" to track the loading progress again]
[Tip: Manage loading jobs with "ABORT/RESUME LOADING JOB jobid"]
Starting the following job, i.e.
JobName: load_job_whois_data, jobid: Artemis.load_job_whois_data.file.m1.1649746745425
Loading log: '/home/tigergraph/tigergraph/log/restpp/restpp_loader_logs/Artemis/Artemis.load_job_whois_data.file.m1.1649746745425.log'

Job "Artemis.load_job_whois_data.file.m1.1649746745425" loading status
[WAITING] m1 ( Finished: 0 / Total: 0 )
Job "Artemis.load_job_whois_data.file.m1.1649746745425" loading status
[FINISHED] m1 ( Finished: 1 / Total: 1 )
[LOADED]
+-----------------------------------------------------------------------------------------+
|                                     FILENAME |   LOADED LINES |   AVG SPEED |   DURATION|
|/home/tigergraph/data/combined_whois_data.csv |           6343 |      9 kl/s |     0.64 s|
+--------------

[2A
[2K
[2K
[7A
[2K
[2K
[2K
[2K
[2K
[2K
[2K
[Tip: Use "CTRL + C" to stop displaying the loading status update, then use "SHOW LOADING STATUS jobid" to track the loading progress again]
[Tip: Manage loading jobs with "ABORT/RESUME LOADING JOB jobid"]
Starting the following job, i.e.
JobName: load_job_whois_data_name_servers, jobid: Artemis.load_job_whois_data_name_servers.file.m1.1649746798785
Loading log: '/home/tigergraph/tigergraph/log/restpp/restpp_loader_logs/Artemis/Artemis.load_job_whois_data_name_servers.file.m1.1649746798785.log'

Job "Artemis.load_job_whois_data_name_servers.file.m1.1649746798785" loading status
[WAITING] m1 ( Finished: 0 / Total: 0 )
Job "Artemis.load_job_whois_data_name_servers.file.m1.1649746798785" loading status
[RUNNING] m1 ( Finished: 1 / Total: 1 )
[LOADED]
+------------------------------------------------------------------------------------------------------+
|                                                  FILENAME |   LOADED LINES |   A

In [56]:
# Load entropy data
loading_job = 'RUN LOADING JOB load_job_entropy USING MyDataSource="/home/tigergraph/data/benign_entropy_data.txt"'
print(conn.gsql(loading_job, graphname='Artemis', options=[]))

loading_job = 'RUN LOADING JOB load_job_entropy USING MyDataSource="/home/tigergraph/data/malicious_entropy_data.txt"'
print(conn.gsql(loading_job, graphname='Artemis', options=[]))

[2A
[2K
[2K
[Tip: Use "CTRL + C" to stop displaying the loading status update, then use "SHOW LOADING STATUS jobid" to track the loading progress again]
[Tip: Manage loading jobs with "ABORT/RESUME LOADING JOB jobid"]
Starting the following job, i.e.
JobName: load_job_entropy, jobid: Artemis.load_job_entropy.file.m1.1649746809454
Loading log: '/home/tigergraph/tigergraph/log/restpp/restpp_loader_logs/Artemis/Artemis.load_job_entropy.file.m1.1649746809454.log'

Job "Artemis.load_job_entropy.file.m1.1649746809454" loading status
[WAITING] m1 ( Finished: 0 / Total: 0 )
Job "Artemis.load_job_entropy.file.m1.1649746809454" loading status
[FINISHED] m1 ( Finished: 1 / Total: 1 )
[LOADED]
+-----------------------------------------------------------------------------------------+
|                                     FILENAME |   LOADED LINES |   AVG SPEED |   DURATION|
|/home/tigergraph/data/benign_entropy_data.txt |           4001 |      6 kl/s |     0.62 s|
+-----------------------------

In [57]:
# Load MX / A org data
loading_job = 'RUN LOADING JOB load_job_A_org USING MyDataSource="/home/tigergraph/data/combined_A_org.csv"'
print(conn.gsql(loading_job, graphname='Artemis', options=[]))

loading_job = 'RUN LOADING JOB load_job_MX_org USING MyDataSource="/home/tigergraph/data/combined_MX_org.csv"'
print(conn.gsql(loading_job, graphname='Artemis', options=[]))

[2A
[2K
[2K
[Tip: Use "CTRL + C" to stop displaying the loading status update, then use "SHOW LOADING STATUS jobid" to track the loading progress again]
[Tip: Manage loading jobs with "ABORT/RESUME LOADING JOB jobid"]
Starting the following job, i.e.
JobName: load_job_A_org, jobid: Artemis.load_job_A_org.file.m1.1649746825508
Loading log: '/home/tigergraph/tigergraph/log/restpp/restpp_loader_logs/Artemis/Artemis.load_job_A_org.file.m1.1649746825508.log'

Job "Artemis.load_job_A_org.file.m1.1649746825508" loading status
[WAITING] m1 ( Finished: 0 / Total: 0 )
Job "Artemis.load_job_A_org.file.m1.1649746825508" loading status
[FINISHED] m1 ( Finished: 1 / Total: 1 )
[LOADED]
+------------------------------------------------------------------------------------+
|                                FILENAME |   LOADED LINES |   AVG SPEED |   DURATION|
|/home/tigergraph/data/combined_A_org.csv |           4158 |      5 kl/s |     0.73 s|
+------------------------------------------------------

In [58]:
# Install all queries on TigerGraph (This makes query execution much faster)

print(conn.gsql('INSTALL QUERY ALL', options=[]))

Start installing queries, about 1 minute ...
delete_co_edges query: curl -X GET 'http://127.0.0.1:9000/query/Artemis/delete_co_edges'. Add -H "Authorization: Bearer TOKEN" if authentication is enabled.
tg_louvain query: curl -X GET 'http://127.0.0.1:9000/query/Artemis/tg_louvain?v_type=VALUE&e_type=VALUE&[wt_attr=VALUE]&[max_iter=VALUE]&[result_attr=VALUE]&[file_path=VALUE]&[print_info=VALUE]'. Add -H "Authorization: Bearer TOKEN" if authentication is enabled.
community_stuff query: curl -X GET 'http://127.0.0.1:9000/query/Artemis/community_stuff'. Add -H "Authorization: Bearer TOKEN" if authentication is enabled.
tg_label_prop query: curl -X GET 'http://127.0.0.1:9000/query/Artemis/tg_label_prop?v_type=VALUE&e_type=VALUE&max_iter=VALUE&output_limit=VALUE&[print_accum=VALUE]&[file_path=VALUE]&[attr=VALUE]'. Add -H "Authorization: Bearer TOKEN" if authentication is enabled.
community_features_calc query: curl -X GET 'http://127.0.0.1:9000/query/Artemis/community_features_calc'. Add -H "

In [59]:
conn.graphname = 'Artemis'

In [80]:
# Run co_edge creation

print(conn.runInstalledQuery("community_stuff", timeout = 30000))
print(conn.runInstalledQuery("delete_co_loop_edges", timeout = 30000))

[]
[]


In [63]:
# run label propogation

# tg_label_prop (SET<STRING> v_type, SET<STRING> e_type, INT max_iter, INT output_limit, 
#  BOOL print_accum = TRUE, STRING file_path = "", STRING attr = "") 

# have to use query strings until pyTigerGrpah allows lists for sets

params = "v_type=DomainRecord&e_type=co_registrar&e_type=c_org&e_type=co_nameserver&max_iter=10000&output_limit=0&print_accum=1&attr=community"

result = conn.runInstalledQuery("tg_label_prop", params=params, timeout = 30000)

In [64]:
# Extract the community features

result = conn.runInstalledQuery("community_features_calc", timeout = 30000)
community_features = conn.runInstalledQuery("community_features_calc", timeout = 30000)


In [65]:
community_features[0]['(@@group_entropy_final)']

[{'community': 2097169,
  'min_entropy': 2.40564,
  'max_entropy': 3.2389,
  'avg_entropy': 2.82897,
  'domain_count': 3,
  'malicious_ratio': 0},
 {'community': 4194347,
  'min_entropy': 1.79248,
  'max_entropy': 2.80735,
  'avg_entropy': 2.37383,
  'domain_count': 3,
  'malicious_ratio': 0},
 {'community': 14680090,
  'min_entropy': 1.92193,
  'max_entropy': 3.375,
  'avg_entropy': 2.51619,
  'domain_count': 3,
  'malicious_ratio': 0},
 {'community': 172,
  'min_entropy': 1.6494,
  'max_entropy': 3.23593,
  'avg_entropy': 2.46496,
  'domain_count': 7,
  'malicious_ratio': 0.57143},
 {'community': 5242927,
  'min_entropy': 1.58496,
  'max_entropy': 3.02717,
  'avg_entropy': 2.44505,
  'domain_count': 4,
  'malicious_ratio': 0},
 {'community': 10485770,
  'min_entropy': 2.25163,
  'max_entropy': 3.32782,
  'avg_entropy': 2.75377,
  'domain_count': 4,
  'malicious_ratio': 0},
 {'community': 10,
  'min_entropy': 2.41938,
  'max_entropy': 3.37878,
  'avg_entropy': 2.85947,
  'domain_count

In [66]:
community_features_df = pd.DataFrame(community_features[0]['(@@group_entropy_final)'])

In [67]:
community_features_df.sample(3)

Unnamed: 0,community,min_entropy,max_entropy,avg_entropy,domain_count,malicious_ratio
9,53,2.28104,2.9477,2.71424,3,0.0
86,2097197,1.79248,3.02206,2.52104,4,0.0
108,23068862,2.25163,3.45656,2.94963,19,0.0


In [68]:
DomainRecordsGSQL = """INTERPRET QUERY () FOR GRAPH Artemis {
   t = select dr from DomainRecord:dr;
   print(t);
}"""

domain_records = conn.runInterpretedQuery(DomainRecordsGSQL)


In [69]:
domain_records[0]['(t)']

[{'v_id': 'creatorslive.co.za',
  'v_type': 'DomainRecord',
  'attributes': {'id': 'creatorslive.co.za',
   'dnssec': '',
   'name': '',
   'entropy': 3.25163,
   'MaliciousFlag': False,
   'community': -1}},
 {'v_id': 'zaupdates.com',
  'v_type': 'DomainRecord',
  'attributes': {'id': 'zaupdates.com',
   'dnssec': 'unsigned',
   'name': 'Redacted for Privacy',
   'entropy': 2.9477,
   'MaliciousFlag': False,
   'community': 27263047}},
 {'v_id': 'hasuc.cn',
  'v_type': 'DomainRecord',
  'attributes': {'id': 'hasuc.cn',
   'dnssec': 'unsigned',
   'name': '上海和呈仪器制造有限公司',
   'entropy': 2.32193,
   'MaliciousFlag': False,
   'community': 20971685}},
 {'v_id': 'wellturkey.ru',
  'v_type': 'DomainRecord',
  'attributes': {'id': 'wellturkey.ru',
   'dnssec': '',
   'name': '',
   'entropy': 2.92193,
   'MaliciousFlag': False,
   'community': 3}},
 {'v_id': 'chatirwebdesign.com',
  'v_type': 'DomainRecord',
  'attributes': {'id': 'chatirwebdesign.com',
   'dnssec': 'unsigned',
   'name': 'Do

In [70]:
dr_dict = {}

for x in domain_records[0]['(t)']:
    dr_dict[x['v_id']] = x['attributes']['community']

In [71]:
dr_dict

{'creatorslive.co.za': -1,
 'zaupdates.com': 27263047,
 'hasuc.cn': 20971685,
 'wellturkey.ru': 3,
 'chatirwebdesign.com': 16777277,
 'editorialkokinos.com': 5,
 'protektor.de': -1,
 'catalog.tools': 29360310,
 'everythingautomotiverepairs.com': 16777393,
 'easywebbrowsing.com': 18874492,
 'zakarpatpost.net': 10,
 '90750d.com': 16777393,
 'jzgushi.cn': 17825983,
 'zwof.cn': -1,
 'kgrkllo.icu': 17825983,
 'meanwellusa.com': 15728788,
 'wamtimes.com': 16777393,
 'crimezzz.net': 17,
 'ic.gov': -1,
 'privacyenforcement.net': 16777393,
 'nikonhunting.com': 20,
 'diannaobos.com': 18874566,
 'clady.cn': -1,
 'sakva.ru': 22020121,
 'falco.nl': -1,
 'mohalicallgirlsagency.in': 16777393,
 'gulfconstructiononline.com': 15728788,
 'exlyrics.com': 27263047,
 'mossycreekflyfishing.com': 16777393,
 'chinaecec.com': 29,
 'ultirecruit.com': 15728788,
 'islamic-world.net': 11534412,
 'esl.tv': 32,
 'underarmour.com.my': -1,
 'st-george-realestate.com': 16777393,
 's-pravdoy.ru': 20971715,
 'inotdj.com':

In [72]:
domain_record_df = pd.DataFrame.from_dict(dr_dict, orient='index').reset_index()
domain_record_df.columns = ['DomainRecord','community']

In [73]:
domain_record_df.sample(3)

Unnamed: 0,DomainRecord,community
3752,cdcli.org,15728788
817,jajoinc.net,-1
6294,ijhbrphodechmcrdjudccegmicembsir.live,-1


In [74]:
graph_features_df = domain_record_df.merge(community_features_df, how = 'left', on='community')

In [75]:
graph_features_df.sample(10)

Unnamed: 0,DomainRecord,community,min_entropy,max_entropy,avg_entropy,domain_count,malicious_ratio
3626,www.ifnprhfyflwgthmewfnm.com,28311620,1.37095,4.3232,3.5706,246,0.97561
3934,jockeysridgestatepark.com,1048769,2.25163,3.6897,2.94919,7,0.0
2037,avidityscience.com,17825941,1.52193,3.65376,2.87125,47,0.02128
6007,bunkerbranding.com,29360276,0.81128,3.91613,2.93267,54,0.07407
6255,encounterilin.top,1048711,2.15564,4.32928,2.96577,262,0.96183
6303,nbforresthome.org,17825941,1.52193,3.65376,2.87125,47,0.02128
3398,onlinecheapautoinsur.com,2097171,3.0,3.93214,3.50544,3,0.33333
3635,bunmud42.top,23068804,1.5,4.51378,2.87389,257,0.97665
5770,readster.de,-1,0.0,4.02895,2.84008,1207,0.33803
2156,capeusa.org,16777393,0.9183,3.87314,2.99605,778,0.02699


In [78]:
graph_features_df.to_csv('../data/processed/graph_community_features.csv', index=False)

In [77]:
len(community_features_df)

156