In [1]:
import pandas as pd
import json
import re
import csv
import pyTigerGraph as tg
from sklearn.model_selection import train_test_split


# Notebook to explore and generate a graph for the Domain Record datasets

* Wrangle the raw data to make loading into the graph easier
* Instantiate the graph schema and graph queries (using TigerGraph - www.tigergraph.com)
* Perform community detection
* Collect community statistics to be used as features


In [5]:
# Count the lines of the processed WHOIS dataset (shell `wc -l`).
!wc -l "../data/processed/better_whois_data.csv" 

    9344 ../data/processed/better_whois_data.csv


In [6]:
# benign_whois_data.txt
# Count the lines of the raw benign WHOIS dump (shell `wc -l`).

!wc -l "../data/raw/benign_whois_data.txt" 


    4000 ../data/raw/benign_whois_data.txt


In [7]:
# Process the benign data file - row by row processing of json / dictionary file - Cumbersome but cautious

b = '../data/raw/benign_whois_data.txt'

records = []      # parsed objects, one per readable line, in file order
error_lines = []  # raw text of every line that failed to parse as JSON

# Load the lines into a list; unparsable lines are collected instead of aborting the run.
with open(b) as f:
    for line in f.read().splitlines():
        try:
            # Each line looks like: {"domain.name": {...values...}}
            # The greedy regex collapses everything up to the last "}{" into a single "{",
            # so when objects are glued together only the final one on the line is parsed.
            obj = json.loads(re.sub('.*}{', '{', line))
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass. Only parse failures are
            # treated as bad lines; anything else (e.g. KeyboardInterrupt) propagates.
            error_lines.append(line)
        else:
            records.append(obj)

print(f"Number of line errors: {len(error_lines)}")
print(f"Number of lines: {len(records)}")


Number of line errors: 0
Number of lines: 4001


In [8]:
# Collapse the parsed records into one mapping keyed by the first key of each record
# (the domain name); a key seen more than once keeps the value from its last record.
record_dict = {}
for entry in records:
    domain = list(entry)[0]
    record_dict[domain] = entry[domain]

In [9]:
# Look for and remove any entries whose value is not a plain dict - there should now be none
# (in the initial stages of the project there were some errors in the data).

# Collect the offending keys first: a dict cannot be resized while it is being iterated.
error_list = [domain for domain, value in record_dict.items() if type(value) is not dict]

for domain in error_list:
    del record_dict[domain]

In [10]:
# One row per domain; reset_index() moves the domain out of the row index
# into a regular column named 'index'.
benign_df = pd.DataFrame.from_dict(record_dict, orient='index').reset_index()

In [11]:
# Preview the first three benign rows.
benign_df.head(3)

Unnamed: 0,index,domain_name,name_servers,registrant_contact_name,registrant_name,registrar,status,updated_date,admin_c,billing_c,...,registrant_zip,billing_id,billing_phone_number,billing_postal_code,billing_state_province,admin_country_code,registrar_country_code,registrar_organization_id,registrar_postal_code,registrar_street
0,griffithspartners.com.au,COM.AU,"[T.AU, R.AU, Q.AU, S.AU]",CEO,,Afilias Australia Pty Ltd,[serverDeleteProhibited https://afilias.com.au...,2022-03-17 14:34:56,,,...,,,,,,,,,,
1,gestaoconcurso.com.br,,,,,,,,,,...,,,,,,,,,,
2,21fss.com,"[21FSS.COM, 21fss.com]","[NS33.DOMAINCONTROL.COM, NS34.DOMAINCONTROL.COM]",,,"GoDaddy.com, LLC",[clientDeleteProhibited https://icann.org/epp#...,"[2021-01-19 22:26:36, 2021-01-10 00:51:38]",,,...,,,,,,,,,,


In [13]:
# Count the lines of the raw malicious WHOIS dump (shell `wc -l`).
!wc -l "../data/raw/malicious_whois_data.txt" 

    2415 ../data/raw/malicious_whois_data.txt


In [14]:
# Process the malicious data set (just like we did the benign dataset)

b = '../data/raw/malicious_whois_data.txt'

records = []      # parsed objects, one per readable line, in file order
error_lines = []  # raw text of every line that failed to parse as JSON

# Load the lines into a list; unparsable lines are collected instead of aborting the run.
with open(b) as f:
    for line in f.read().splitlines():
        try:
            # Each line looks like: {"domain.name": {...values...}}
            # The greedy regex collapses everything up to the last "}{" into a single "{",
            # so when objects are glued together only the final one on the line is parsed.
            obj = json.loads(re.sub('.*}{', '{', line))
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass. Only parse failures are
            # treated as bad lines; anything else (e.g. KeyboardInterrupt) propagates.
            error_lines.append(line)
        else:
            records.append(obj)

print(f"Number of line errors: {len(error_lines)}")
print(f"Number of lines: {len(records)}")


Number of line errors: 0
Number of lines: 2415


In [15]:
# Create the dictionary with the data: keyed by the first key of each record
# (the domain name); a key seen more than once keeps the value from its last record.
record_dict = {}
for entry in records:
    domain = list(entry)[0]
    record_dict[domain] = entry[domain]

In [16]:
# Process any errors: remove entries whose value is not a plain dict.

# Collect the offending keys first: a dict cannot be resized while it is being iterated.
error_list = [domain for domain, value in record_dict.items() if type(value) is not dict]

for domain in error_list:
    del record_dict[domain]

In [17]:
# One row per domain; reset_index() moves the domain out of the row index
# into a regular column named 'index'.
mal_df = pd.DataFrame.from_dict(record_dict, orient='index').reset_index()

In [18]:
# Preview the first three malicious rows.
mal_df.head(3)

Unnamed: 0,index,address,city,country,creation_date,dnssec,domain_name,emails,expiration_date,name,...,tech_phone_ext,registrar_zip_code,registrant_type,admin_account_name,admin_company_name,admin_family_name,registrant_company_name,tech_account_name,tech_company_name,tech_family_name
0,reseptors.com,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,KN,"[2021-12-08 08:53:26, 2021-12-08T08:53:26]",unsigned,RESEPTORS.COM,domainabuse@tucows.com,"[2022-12-08 08:53:26, 2022-12-08T08:53:26]",REDACTED FOR PRIVACY,...,,,,,,,,,,
1,contentcdns.net,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,KN,"[2022-02-17 23:18:15, 2022-02-17T23:18:15]",unsigned,CONTENTCDNS.NET,domainabuse@tucows.com,"[2023-02-17 23:18:15, 2023-02-17T23:18:15]",REDACTED FOR PRIVACY,...,,,,,,,,,,
2,izocab.com,"220013, Belarus, Minsk, ul.YA.Kolasa, d.31, kv...",Minsk,BY,2015-12-30 00:30:35,"[unsigned, Unsigned]",IZOCAB.COM,"[abuse@reg.ru, info@brale.ru]",2022-12-30 00:30:35,Vladimir Nikolskii,...,,,,,,,,,,


#########

In [20]:
# Attach the target label to each source frame: 1 = malicious, 0 = benign.
for frame, label in ((mal_df, 1), (benign_df, 0)):
    frame['malicious'] = label

In [21]:
# Stack the malicious rows on top of the benign rows (row-wise concat).
# The row labels are not renumbered, so each frame's own 0..n labels repeat in the result.
combined_df = pd.concat((mal_df, benign_df))

In [22]:
# Number of rows in the combined (malicious + benign) frame.
len(combined_df['index'])

6342

In [23]:
# Reproduce the train/test split of the primary dataset so that the graph files
# written further down can be partitioned by the 'domain' column of each side.
random_seed = 0
y_col = 'malicious'

primary_dataset = pd.read_csv('../data/processed/better_whois_data.csv', low_memory=False)

# Target column vs. everything else.
y = primary_dataset[y_col]
X = primary_dataset.drop(columns=y_col)

# Fixed random_state keeps the 80/20 split stable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_seed,
)

print(f"Total Rows: {len(primary_dataset)}")
print(f"Train rows: {len(X_train)}")

Total Rows: 9128
Train rows: 7302


In [24]:
# Columns exported for the DomainRecord vertices.
DomainRecord_cols = ['index', 'dnssec', 'name', 'malicious']

# Row masks: does the domain appear in the train / test side of the split?
whois_in_train = combined_df['index'].isin(X_train['domain'])
whois_in_test = combined_df['index'].isin(X_test['domain'])

# Full export, followed by the train-only and test-only subsets.
combined_df.loc[:, DomainRecord_cols].to_csv("../data/external/combined_whois_data.csv")
combined_df.loc[whois_in_train, DomainRecord_cols].to_csv("../data/external/combined_whois_data_train.csv")
combined_df.loc[whois_in_test, DomainRecord_cols].to_csv("../data/external/combined_whois_data_test.csv")

In [25]:
# WHOIS columns exported below as one exploded CSV per column
# (list-valued cells become one row per element).
list_of_explode_columns = [
    'country',
    'emails',
    'whois_server',
    'domain_status',
    'registrar',
    'name_servers',
]

In [27]:
for _col in list_of_explode_columns:
    combined_df.loc[:,['index',_col]].explode(_col).to_csv(
    f"../data/external/combined_whois_data_{_col}.csv")
    
    combined_df.loc[combined_df['index'].isin(X_train['domain']),['index',_col]].explode(_col).to_csv(
    f"../data/external/combined_whois_data_{_col}_train.csv")
    
    combined_df.loc[combined_df['index'].isin(X_test['domain']),['index',_col]].explode(_col).to_csv(
    f"../data/external/combined_whois_data_{_col}_test.csv")
    
    file1 = f"../data/external/combined_whois_data_{_col}.csv"
    file2 = f"../data/external/combined_whois_data_{_col}_train.csv"
    file3 = f"../data/external/combined_whois_data_{_col}_test.csv"
    
    !wc -l $file1 $file2 $file3

    6367 ../data/external/combined_whois_data_country.csv
    5513 ../data/external/combined_whois_data_country_train.csv
    1690 ../data/external/combined_whois_data_country_test.csv
   13570 total
    9319 ../data/external/combined_whois_data_emails.csv
    8091 ../data/external/combined_whois_data_emails_train.csv
    2450 ../data/external/combined_whois_data_emails_test.csv
   19860 total
    6343 ../data/external/combined_whois_data_whois_server.csv
    5494 ../data/external/combined_whois_data_whois_server_train.csv
    1684 ../data/external/combined_whois_data_whois_server_test.csv
   13521 total
    6358 ../data/external/combined_whois_data_domain_status.csv
    5508 ../data/external/combined_whois_data_domain_status_train.csv
    1686 ../data/external/combined_whois_data_domain_status_test.csv
   13552 total
    6343 ../data/external/combined_whois_data_registrar.csv
    5494 ../data/external/combined_whois_data_registrar_train.csv
    1684 ../data/external/combined_whois_dat

In [None]:
### Entropy data import

In [30]:
# Read in the file
with open('../data/raw/benign_entropy_data.txt', 'r') as file :
  filedata = file.read()

# Replace the target string
filedata = filedata.replace('{', '')
filedata = filedata.replace('}', '')
filedata = filedata.replace(':', ',')

# Write the file out again
with open('../data/external/benign_entropy_data.txt', 'w') as file:
  file.write(filedata)

entropy_train = []
entropy_test = []

for f in filedata.split("\n"):
    domain = f[1:f.find('"', 1)]
    if domain in X_train['domain'].values:
        entropy_train.append(f)
    else:
        entropy_test.append(f)
        
with open('../data/external/benign_entropy_data_train.txt', 'w') as file:
    for item in entropy_train:
        file.write("%s\n" % item)
        
with open('../data/external/benign_entropy_data_test.txt', 'w') as file:
    for item in entropy_test:
        file.write("%s\n" % item)
        
file1 = '../data/external/benign_entropy_data.txt'
file2 = '../data/external/benign_entropy_data_train.txt'
file3 = '../data/external/benign_entropy_data_test.txt'

!wc -l $file1 $file2 $file3

    4001 ../data/external/benign_entropy_data.txt
    3572 ../data/external/benign_entropy_data_train.txt
     430 ../data/external/benign_entropy_data_test.txt
    8003 total


In [31]:
# Read in the file
with open('../data/raw/malicious_entropy_data.txt', 'r') as file :
  filedata = file.read()

# Replace the target string
filedata = filedata.replace('{', '')
filedata = filedata.replace('}', '')
filedata = filedata.replace(':', ',')

# Write the file out again
with open('../data/external/malicious_entropy_data.txt', 'w') as file:
  file.write(filedata)

entropy_train = []
entropy_test = []

for f in filedata.split("\n"):
    domain = f[1:f.find('"', 1)]
    if domain in X_train['domain'].values:
        entropy_train.append(f)
    else:
        entropy_test.append(f)

with open('../data/external/malicious_entropy_data_train.txt', 'w') as file:
    for item in entropy_train:
        file.write("%s\n" % item)
        
with open('../data/external/malicious_entropy_data_test.txt', 'w') as file:
    for item in entropy_test:
        file.write("%s\n" % item)
        
file1 = '../data/external/malicious_entropy_data.txt'
file2 = '../data/external/malicious_entropy_data_train.txt'
file3 = '../data/external/malicious_entropy_data_test.txt'

!wc -l $file1 $file2 $file3

    2415 ../data/external/malicious_entropy_data.txt
    1941 ../data/external/malicious_entropy_data_train.txt
     475 ../data/external/malicious_entropy_data_test.txt
    4831 total


In [32]:
### Load IP Address Org details

import pandas as pd
import json

b = '../data/raw/benign_ip_data.txt'

records = []      # parsed objects, one per readable line, in file order
error_lines = []  # raw text of every line that failed to parse as JSON

# Load the lines into a list; unparsable lines are collected instead of aborting the run.
with open(b) as f:
    for line in f.read().splitlines():
        try:
            # Each line looks like: {"domain.name": {...values...}}
            # The greedy regex collapses everything up to the last "}{" into a single "{",
            # so when objects are glued together only the final one on the line is parsed.
            obj = json.loads(re.sub('.*}{', '{', line))
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass. Only parse failures are
            # treated as bad lines; anything else (e.g. KeyboardInterrupt) propagates.
            error_lines.append(line)
        else:
            records.append(obj)

print(f"Number of line errors: {len(error_lines)}")
print(f"Number of lines: {len(records)}")

Number of line errors: 0
Number of lines: 6670


In [33]:
b = '../data/raw/malicious_ip_data.txt'

# `records` and `error_lines` are deliberately NOT reset here: the malicious IP records are
# appended to the lists already filled from the benign file, so the counts printed below are
# cumulative. Re-running this cell on its own appends the malicious records a second time.
with open(b) as f:
    for line in f.read().splitlines():
        try:
            # Each line looks like: {"domain.name": {...values...}}
            # The greedy regex collapses everything up to the last "}{" into a single "{",
            # so when objects are glued together only the final one on the line is parsed.
            obj = json.loads(re.sub('.*}{', '{', line))
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass. Only parse failures are
            # treated as bad lines; anything else (e.g. KeyboardInterrupt) propagates.
            error_lines.append(line)
        else:
            records.append(obj)

print(f"Number of line errors: {len(error_lines)}")
print(f"Number of lines: {len(records)}")

Number of line errors: 0
Number of lines: 9085


In [34]:
# Collapse the parsed IP records into one mapping keyed by the first key of each record
# (the domain name); a key seen more than once keeps the value from its last record.
record_dict = {}
for entry in records:
    domain = list(entry)[0]
    record_dict[domain] = entry[domain]

In [35]:
# Remove entries whose value is not a plain dict.
# Collect the offending keys first: a dict cannot be resized while it is being iterated.
error_list = [domain for domain, value in record_dict.items() if type(value) is not dict]

for domain in error_list:
    del record_dict[domain]

In [36]:
# One row per domain; reset_index() moves the domain out of the row index
# into a regular column named 'index'.
org_df = pd.DataFrame.from_dict(record_dict, orient='index').reset_index()

In [37]:
# The 'A' and 'MX' cells hold dicts such as {'CC': ..., 'Org': ...}; expand each dict
# into columns and keep only its 'Org' entry as A_Org / MX_Org.
for record_type in ('A', 'MX'):
    org_df[f'{record_type}_Org'] = org_df[record_type].apply(pd.Series)['Org']

In [38]:
# Eyeball ten random rows (no random_state is passed, so the rows differ on every run).
org_df.sample(10)

Unnamed: 0,index,A,MX,A_Org,MX_Org
590,comtsrmatters.com,"{'CC': 'NA', 'Org': 'NA'}","{'CC': 'NA', 'Org': 'NA'}",,
3497,hidakashimpo.co.jp,"{'CC': 'JP', 'Org': 'NA'}","{'CC': 'JP', 'Org': 'IRT-JPNIC-JP'}",,IRT-JPNIC-JP
2353,bocoranhkg.com,"{'CC': 'US', 'Org': 'Cloudflare, Inc.'}","{'CC': 'NA', 'Org': 'NA'}","Cloudflare, Inc.",
5175,fio.linosheart.com,"{'CC': 'NA', 'Org': 'NA'}","{'CC': 'NA', 'Org': 'NA'}",,
1305,piracyproxy.me,"{'CC': 'US', 'Org': 'Cloudflare, Inc.'}","{'CC': 'NA', 'Org': 'NA'}","Cloudflare, Inc.",
3582,stopkill.com,"{'CC': 'VG', 'Org': 'Confluence Networks Inc'}","{'CC': 'NA', 'Org': 'NA'}",Confluence Networks Inc,
2506,fly12go.com,"{'CC': 'FR', 'Org': 'OVH Sp. z o. o.'}","{'CC': 'NA', 'Org': 'NA'}",OVH Sp. z o. o.,
912,teplo.guru,"{'CC': 'DE', 'Org': 'Hetzner Online GmbH - Con...","{'CC': 'DE', 'Org': 'Hetzner Online GmbH - Con...",Hetzner Online GmbH - Contact Role,Hetzner Online GmbH - Contact Role
2315,conversaacademy.in,"{'CC': 'US', 'Org': 'Cloudflare, Inc.'}","{'CC': 'NA', 'Org': 'NA'}","Cloudflare, Inc.",
3016,dzkwjr.icu,"{'CC': 'US', 'Org': 'PSINet, Inc.'}","{'CC': 'NA', 'Org': 'NA'}","PSINet, Inc.",


In [40]:
org_df.loc[org_df['MX_Org']!="NA",["index","MX_Org"]].to_csv("../data/external/combined_mx_org.csv")
org_df.loc[(org_df['MX_Org']!="NA") & (org_df['index'].isin(X_train['domain'])) ,["index","MX_Org"]].to_csv("../data/external/combined_mx_org_train.csv")
org_df.loc[(org_df['MX_Org']!="NA") & (org_df['index'].isin(X_test['domain'])),["index","MX_Org"]].to_csv("../data/external/combined_mx_org_test.csv")

org_df.loc[org_df['A_Org']!="NA",["index","A_Org"]].to_csv("../data/external/combined_A_org.csv")
org_df.loc[(org_df['MX_Org']!="NA") & (org_df['index'].isin(X_train['domain'])),["index","A_Org"]].to_csv("../data/external/combined_A_org_train.csv")
org_df.loc[(org_df['MX_Org']!="NA") & (org_df['index'].isin(X_train['domain'])),["index","A_Org"]].to_csv("../data/external/combined_A_org_test.csv")

file1 = "../data/external/combined_mx_org.csv"
file2 = "../data/external/combined_mx_org_train.csv"
file3 = "../data/external/combined_mx_org_test.csv"

!wc -l $file1 $file2 $file3

file1 = "../data/external/combined_A_org.csv"
file2 = "../data/external/combined_A_org_train.csv"
file3 = "../data/external/combined_A_org_test.csv"

!wc -l $file1 $file2 $file3

    2387 ../data/external/combined_mx_org.csv
    2158 ../data/external/combined_mx_org_train.csv
     713 ../data/external/combined_mx_org_test.csv
    5258 total
    4158 ../data/external/combined_A_org.csv
    2158 ../data/external/combined_A_org_train.csv
    2158 ../data/external/combined_A_org_test.csv
    8474 total


In [41]:
# List the running Docker containers to check that the TigerGraph container is up.
!docker ps

CONTAINER ID   IMAGE                                     COMMAND                  CREATED       STATUS      PORTS                                                                                                                               NAMES
38e46715231e   docker.tigergraph.com/tigergraph:latest   "/bin/sh -c '/usr/sb…"   3 weeks ago   Up 8 days   0.0.0.0:9000->9000/tcp, :::9000->9000/tcp, 0.0.0.0:14240->14240/tcp, :::14240->14240/tcp, 0.0.0.0:14022->22/tcp, :::14022->22/tcp   tigergraph


In [None]:
# To start tigergraph docker
# https://docs.tigergraph.com/tigergraph-server/current/getting-started/docker
#  docker run -d -p 14022:22 -p 9000:9000 -p 14240:14240 --name tigergraph --ulimit nofile=1000000:1000000 -v <path-to-this-repo>/data/external:/home/tigergraph/data -t docker.tigergraph.com/tigergraph:latest
#  (replace <path-to-this-repo> with the absolute path of your local checkout)


In [42]:
# Open a pyTigerGraph connection; no host, graph name or credentials are passed here,
# so the library's defaults apply.
conn = tg.TigerGraphConnection()


In [None]:
# print(conn.gsql('CLEAR GRAPH STORE -HARD'))

In [43]:
# Clear the TigerGraph server/graph as we want to reproduce the graph creation from scratch
## CAUTION: this wipes everything inside your TigerGraph container !!

drop_output = conn.gsql('drop all', options=[])
print(drop_output)

Dropping all, about 1 minute ...
Abort all active loading jobs
Try to abort all loading jobs on graph Artemis, it may take a while ...
[ABORT_SUCCESS] No active Loading Job to abort.
Resetting GPE...
Successfully reset GPE and GSE
Stopping GPE GSE
Successfully stopped GPE GSE in 0.005 seconds
Clearing graph store...
Successfully cleared graph store
Starting GPE GSE RESTPP
Successfully started GPE GSE RESTPP in 0.363 seconds
Everything is dropped.


In [44]:
# Read in the GSQL code that creates the Graph / Schema / Loading Jobs and Queries on TigerGraph

# The context manager closes the file even if reading raises.
with open("../src/scripts/DBImportExport_Artemis.gsql", "r") as text_file:
    # read whole file to a string
    artemis_graph_gsql = text_file.read()

# print(artemis_graph_gsql)

In [45]:
# Run the GSQL script and print the results (check the printed output for any errors)

schema_output = conn.gsql(artemis_graph_gsql, options=[])
print(schema_output)

Stopping GPE GSE RESTPP
Successfully stopped GPE GSE RESTPP in 30.236 seconds
Starting GPE GSE RESTPP
Successfully started GPE GSE RESTPP in 0.078 seconds
The graph Artemis is created.
Successfully created schema change jobs: [create_artemis_graph].

Current graph version 0
Trying to add vertex DomainRecord.
Trying to add vertex DomainName.
Trying to add vertex Country.
Trying to add vertex City.
Trying to add vertex Emails.
Trying to add vertex Organisation.
Trying to add vertex Nameserver.
Trying to add vertex Registrar.
Trying to add vertex DomainRecordStatus.
Trying to add vertex WhoisServer.
Trying to add edge DomainRecord_DomainName.
Trying to add edge Country_City.
Trying to add edge DomainRecord_Country.
Trying to add edge DomainRecord_Emails.
Trying to add edge DomainRecord_Organisation.
Trying to add edge DomainRecord_Nameserver.
Trying to add edge DomainRecord_Registrar.
Trying to add edge DomainRecord_DomainRecordStatus.
Trying to add edge DomainRecord_WhoisServer.
Trying t

In [46]:
# set train / test switch
# The value is interpolated into the data file names used by the loading jobs below
# (e.g. combined_whois_data_{train_test}.csv).
train_test = 'train'

In [47]:
# Generic loading job template.
# {field1} is the suffix of the loading job name, {field2} the suffix of the CSV file name.

loading_job = (
    'RUN LOADING JOB load_job_whois_data{field1} '
    'USING MyDataSource="/home/tigergraph/data/combined_whois_data{field2}.csv"'
)

In [48]:
# Execute the loading job template for the listed data files

# Base DomainRecord file first (no column suffix on the job name) ...
base_job = loading_job.format(field1="", field2=f"_{train_test}")
print(conn.gsql(base_job, graphname='Artemis', options=[]))

# ... then one job per exploded column.
for explode_col in list_of_explode_columns:
    column_job = loading_job.format(field1=f"_{explode_col}", field2=f"_{explode_col}_{train_test}")
    print(conn.gsql(column_job, graphname='Artemis', options=[]))

[2A
[2K
[2K
[Tip: Use "CTRL + C" to stop displaying the loading status update, then use "SHOW LOADING STATUS jobid" to track the loading progress again]
[Tip: Manage loading jobs with "ABORT/RESUME LOADING JOB jobid"]
Starting the following job, i.e.
JobName: load_job_whois_data, jobid: Artemis.load_job_whois_data.file.m1.1649836384888
Loading log: '/home/tigergraph/tigergraph/log/restpp/restpp_loader_logs/Artemis/Artemis.load_job_whois_data.file.m1.1649836384888.log'

Job "Artemis.load_job_whois_data.file.m1.1649836384888" loading status
[WAITING] m1 ( Finished: 0 / Total: 0 )
Job "Artemis.load_job_whois_data.file.m1.1649836384888" loading status
[FINISHED] m1 ( Finished: 1 / Total: 1 )
[LOADED]
+-----------------------------------------------------------------------------------------------+
|                                           FILENAME |   LOADED LINES |   AVG SPEED |   DURATION|
|/home/tigergraph/data/combined_whois_data_train.csv |           5494 |     26 kl/s |     0.21 

In [49]:
# Load entropy data: the same loading job is run once per source file (benign, then malicious).
# Note that `loading_job` is rebound here, replacing the generic template defined earlier.
for entropy_source in ("benign", "malicious"):
    loading_job = f'RUN LOADING JOB load_job_entropy USING MyDataSource="/home/tigergraph/data/{entropy_source}_entropy_data_{train_test}.txt"'
    print(conn.gsql(loading_job, graphname='Artemis', options=[]))

[2A
[2K
[2K
[Tip: Use "CTRL + C" to stop displaying the loading status update, then use "SHOW LOADING STATUS jobid" to track the loading progress again]
[Tip: Manage loading jobs with "ABORT/RESUME LOADING JOB jobid"]
Starting the following job, i.e.
JobName: load_job_entropy, jobid: Artemis.load_job_entropy.file.m1.1649836430676
Loading log: '/home/tigergraph/tigergraph/log/restpp/restpp_loader_logs/Artemis/Artemis.load_job_entropy.file.m1.1649836430676.log'

Job "Artemis.load_job_entropy.file.m1.1649836430676" loading status
[WAITING] m1 ( Finished: 0 / Total: 0 )
Job "Artemis.load_job_entropy.file.m1.1649836430676" loading status
[FINISHED] m1 ( Finished: 1 / Total: 1 )
[LOADED]
+-----------------------------------------------------------------------------------------------+
|                                           FILENAME |   LOADED LINES |   AVG SPEED |   DURATION|
|/home/tigergraph/data/benign_entropy_data_train.txt |           3572 |     34 kl/s |     0.10 s|
+-----------

In [50]:
# Load MX / A org data
loading_job = f'RUN LOADING JOB load_job_A_org USING MyDataSource="/home/tigergraph/data/combined_A_org_{train_test}.csv"'
print(conn.gsql(loading_job, graphname='Artemis', options=[]))

# The export step writes this file as "combined_mx_org_*.csv" (lower-case "mx"); the path must
# match that spelling exactly or the load fails on a case-sensitive filesystem.
loading_job = f'RUN LOADING JOB load_job_MX_org USING MyDataSource="/home/tigergraph/data/combined_mx_org_{train_test}.csv"'
print(conn.gsql(loading_job, graphname='Artemis', options=[]))

[2A
[2K
[2K
[Tip: Use "CTRL + C" to stop displaying the loading status update, then use "SHOW LOADING STATUS jobid" to track the loading progress again]
[Tip: Manage loading jobs with "ABORT/RESUME LOADING JOB jobid"]
Starting the following job, i.e.
JobName: load_job_A_org, jobid: Artemis.load_job_A_org.file.m1.1649836460860
Loading log: '/home/tigergraph/tigergraph/log/restpp/restpp_loader_logs/Artemis/Artemis.load_job_A_org.file.m1.1649836460860.log'

Job "Artemis.load_job_A_org.file.m1.1649836460860" loading status
[WAITING] m1 ( Finished: 0 / Total: 0 )
Job "Artemis.load_job_A_org.file.m1.1649836460860" loading status
[FINISHED] m1 ( Finished: 1 / Total: 1 )
[LOADED]
+------------------------------------------------------------------------------------------+
|                                      FILENAME |   LOADED LINES |   AVG SPEED |   DURATION|
|/home/tigergraph/data/combined_A_org_train.csv |           2158 |     10 kl/s |     0.20 s|
+------------------------------------

In [51]:
# Install all queries on TigerGraph (this makes query execution much faster)

install_output = conn.gsql('INSTALL QUERY ALL', options=[])
print(install_output)

Start installing queries, about 1 minute ...
delete_co_edges query: curl -X GET 'http://127.0.0.1:9000/query/Artemis/delete_co_edges'. Add -H "Authorization: Bearer TOKEN" if authentication is enabled.
tg_louvain query: curl -X GET 'http://127.0.0.1:9000/query/Artemis/tg_louvain?v_type=VALUE&e_type=VALUE&[wt_attr=VALUE]&[max_iter=VALUE]&[result_attr=VALUE]&[file_path=VALUE]&[print_info=VALUE]'. Add -H "Authorization: Bearer TOKEN" if authentication is enabled.
community_stuff query: curl -X GET 'http://127.0.0.1:9000/query/Artemis/community_stuff'. Add -H "Authorization: Bearer TOKEN" if authentication is enabled.
tg_label_prop query: curl -X GET 'http://127.0.0.1:9000/query/Artemis/tg_label_prop?v_type=VALUE&e_type=VALUE&max_iter=VALUE&output_limit=VALUE&[print_accum=VALUE]&[file_path=VALUE]&[attr=VALUE]'. Add -H "Authorization: Bearer TOKEN" if authentication is enabled.
community_features_calc query: curl -X GET 'http://127.0.0.1:9000/query/Artemis/community_features_calc'. Add -H "

In [52]:
# Point the connection at the Artemis graph; the query calls below do not pass a graph
# name themselves (presumably they fall back to this attribute - confirm in pyTigerGraph docs).
conn.graphname = 'Artemis'

In [53]:
# Run co_edge creation: execute the two installed queries in order and print each response.

for query_name in ("community_stuff", "delete_co_loop_edges"):
    print(conn.runInstalledQuery(query_name, timeout = 30000))

[]
[]


In [54]:
# Run label propagation

# tg_label_prop (SET<STRING> v_type, SET<STRING> e_type, INT max_iter, INT output_limit,
#  BOOL print_accum = TRUE, STRING file_path = "", STRING attr = "")

# Have to use a query string until pyTigerGraph allows lists for sets; a SET parameter
# is passed by repeating its key (see the three e_type entries).
# NOTE(review): "c_org" does not follow the "co_*" naming of the other edge types -
# confirm it matches the edge name in the graph schema.
label_prop_args = [
    ("v_type", "DomainRecord"),
    ("e_type", "co_registrar"),
    ("e_type", "c_org"),
    ("e_type", "co_nameserver"),
    ("max_iter", "10000"),
    ("output_limit", "0"),
    ("print_accum", "1"),
    ("attr", "community"),
]
params = "&".join(f"{key}={value}" for key, value in label_prop_args)

result = conn.runInstalledQuery("tg_label_prop", params=params, timeout = 30000)

In [55]:
# Extract the community features.
# The installed query is executed once; `result` is kept as an alias of the same
# response so that any code still reading `result` sees the identical payload.
community_features = conn.runInstalledQuery("community_features_calc", timeout = 30000)
result = community_features


In [56]:
# Inspect the per-community records returned under the '(@@group_entropy_final)' key
# of the first result set.
community_features[0]['(@@group_entropy_final)']

[{'community': 25165892,
  'min_entropy': 2.75,
  'max_entropy': 3.02717,
  'avg_entropy': 2.90829,
  'domain_count': 3,
  'malicious_ratio': 0},
 {'community': 5242919,
  'min_entropy': 1.92193,
  'max_entropy': 3.12193,
  'avg_entropy': 2.62975,
  'domain_count': 8,
  'malicious_ratio': 0.375},
 {'community': 4194365,
  'min_entropy': 2.5,
  'max_entropy': 3.53422,
  'avg_entropy': 3.15224,
  'domain_count': 4,
  'malicious_ratio': 0},
 {'community': 20971672,
  'min_entropy': 0.9183,
  'max_entropy': 3.09307,
  'avg_entropy': 2.29476,
  'domain_count': 3,
  'malicious_ratio': 0},
 {'community': 6291494,
  'min_entropy': 1.92193,
  'max_entropy': 3.39275,
  'avg_entropy': 2.80343,
  'domain_count': 5,
  'malicious_ratio': 0},
 {'community': 10485777,
  'min_entropy': 2.52164,
  'max_entropy': 3.65376,
  'avg_entropy': 3.17485,
  'domain_count': 6,
  'malicious_ratio': 0},
 {'community': 4194452,
  'min_entropy': 2.40564,
  'max_entropy': 3.50689,
  'avg_entropy': 2.99722,
  'domain_c

In [57]:
# Turn the list of per-community dicts into a DataFrame (one row per community).
group_entropy_rows = community_features[0]['(@@group_entropy_final)']
community_features_df = pd.DataFrame(group_entropy_rows)

In [58]:
# Eyeball three random communities (no random_state is passed, so the rows differ on every run).
community_features_df.sample(3)

Unnamed: 0,community,min_entropy,max_entropy,avg_entropy,domain_count,malicious_ratio
2,4194365,2.5,3.53422,3.15224,4,0.0
143,22020112,1.9183,3.5,2.93286,13,0.0
35,2097163,2.0,2.9183,2.47998,3,0.0


In [59]:
# Persist the per-community statistics; the DataFrame index is written too,
# as the first (unnamed) CSV column.
community_features_df.to_csv("../data/processed/community_features.csv")

In [60]:
# Interpreted GSQL query that selects every DomainRecord vertex and prints the set;
# the printed set comes back under the key '(t)' of the first result entry.
DomainRecordsGSQL = """INTERPRET QUERY () FOR GRAPH Artemis {
   t = select dr from DomainRecord:dr;
   print(t);
}"""

# Run the query without installing it first.
domain_records = conn.runInterpretedQuery(DomainRecordsGSQL)


In [61]:
# Inspect the returned DomainRecord vertices (v_id, v_type and attributes per vertex).
domain_records[0]['(t)']

[{'v_id': 'libbyslibrary.com',
  'v_type': 'DomainRecord',
  'attributes': {'id': 'libbyslibrary.com',
   'dnssec': 'unsigned',
   'name': 'Registration Private',
   'entropy': 2.71929,
   'MaliciousFlag': False,
   'community': 20971573}},
 {'v_id': 'inventrade.ru',
  'v_type': 'DomainRecord',
  'attributes': {'id': 'inventrade.ru',
   'dnssec': '',
   'name': '',
   'entropy': 2.92193,
   'MaliciousFlag': False,
   'community': 29360186}},
 {'v_id': 'nichido-garo.co.jp',
  'v_type': 'DomainRecord',
  'attributes': {'id': 'nichido-garo.co.jp',
   'dnssec': '',
   'name': '',
   'entropy': 3.25163,
   'MaliciousFlag': False,
   'community': -1}},
 {'v_id': 'louboutin-shoes.us',
  'v_type': 'DomainRecord',
  'attributes': {'id': 'louboutin-shoes.us',
   'dnssec': '',
   'name': '',
   'entropy': 3.32323,
   'MaliciousFlag': False,
   'community': 126}},
 {'v_id': 'mswordcoverpages.com',
  'v_type': 'DomainRecord',
  'attributes': {'id': 'mswordcoverpages.com',
   'dnssec': 'unsigned',
 

In [62]:
# Map each DomainRecord vertex id (the domain) to the community stored in its attributes.
dr_dict = {
    vertex['v_id']: vertex['attributes']['community']
    for vertex in domain_records[0]['(t)']
}

In [63]:
# Display the domain -> community mapping.
dr_dict

{'libbyslibrary.com': 20971573,
 'inventrade.ru': 29360186,
 'nichido-garo.co.jp': -1,
 'louboutin-shoes.us': 126,
 'mswordcoverpages.com': 20971573,
 'tgs-toulouse.fr': 28311563,
 'naturpark-zillertal.at': -1,
 'blogsabda.com': 25165956,
 'nido.de': -1,
 'lessavonsdejoya.com': 12583028,
 'lendify.se': -1,
 'visitwinterhaven.info': 20971573,
 'loganmathieu.com': 20971573,
 'wwwguppies.com': 19922955,
 'maghrebtransport.com': 19922955,
 'momsdreamjob.com': 19922955,
 'nwataxidermy.com': 23068694,
 'zjkonline.com': -1,
 'ribeirinhas.com': 9437192,
 'g311.cn': -1,
 'sscfeo0.icu': 20971590,
 'zkftkg.icu': 20971590,
 'jjcqdx.icu': 20971590,
 'lvssn.com': 1048623,
 'teuniz.net': 23068766,
 'petspolicy.us': 31457395,
 'nextcaller.com': 20971573,
 'cheapjerseysnflshop.co': 18874451,
 'newturfers.com': 20971541,
 'sdodo.com': 1048605,
 'sibregionservice.ru': 22020127,
 'ziilabs.com': 18874451,
 'myriad.net': 23068823,
 'yapc.eu': -1,
 'steptools.com': 19922978,
 'clrhome.org': 31457395,
 'sport

In [64]:
# Two-column frame: the domain (moved out of the index) and its community id.
domain_record_df = (
    pd.DataFrame.from_dict(dr_dict, orient='index')
    .reset_index()
    .set_axis(['DomainRecord', 'community'], axis=1)
)

In [65]:
# Eyeball three random domain/community pairs (no random_state, so the rows differ on every run).
domain_record_df.sample(3)

Unnamed: 0,DomainRecord,community
2481,maxcijhgfdfhj.xyz,18874451
3691,albertocerriteno.com,20971573
1299,gaudela.net,7340032


In [66]:
# Attach the per-community statistics to every domain. The left join keeps every domain row,
# even when its community has no entry in community_features_df.
graph_features_df = pd.merge(
    domain_record_df,
    community_features_df,
    how='left',
    on='community',
)

In [67]:
# Eyeball ten random rows of the merged features (no random_state, so the rows differ on every run).
graph_features_df.sample(10)

Unnamed: 0,DomainRecord,community,min_entropy,max_entropy,avg_entropy,domain_count,malicious_ratio
359,shdonline.ru,25165892,2.75,3.02717,2.90829,3,0.0
3823,micsem.org,20971573,0.9183,3.87314,3.00456,681,0.02056
727,legend4000.duckdns.org,25165883,1.58496,3.91613,3.06634,51,0.43137
1477,machinadynamica.com,19922978,0.9183,3.88018,2.88184,171,0.0117
3310,p2jnahbccw.shop,18874451,1.79248,3.77356,3.15129,148,0.75
2121,l46t3vgvmtx5wxe6.onion,-1,0.0,4.02895,2.8266,1020,0.31961
3443,whatthekpop.com,30408712,1.79248,3.91613,2.96982,52,0.05769
2708,biz-logo.com,20971573,0.9183,3.87314,3.00456,681,0.02056
2916,ladypanard.com,6291482,2.25163,3.32486,2.87236,13,0.0
3903,211103bei.top,20971590,1.79248,3.32486,2.50353,135,0.07407


In [70]:
# Save the per-domain graph features. The file suffix follows the `train_test` switch used by the
# loading jobs, so features computed on a test-loaded graph cannot overwrite the train file.
graph_features_df.to_csv(f'../data/processed/graph_community_features_{train_test}.csv', index=False)

In [69]:
# Number of communities that have feature rows.
len(community_features_df)

145

In [None]:
# Load in test data and assume community label and community features (do not redo communities)