In [99]:
import pandas as pd
import src.scripts.artemis as artemis
from time import sleep
import numpy as np

In [100]:
def replace_quotes(val: str) -> str:
    """Return ``val`` with every double-quote character removed.

    Args:
        val: Text that may contain ``"`` characters.

    Returns:
        The same text with all ``"`` characters stripped out.
    """
    # Splitting on the quote and re-joining drops each occurrence.
    pieces = val.split('"')
    return ''.join(pieces)

# Malicious set: read the ThreatFox CSV, derive a quote-free `domain` column
# from `ioc_value`, and label every row with malicious=1.
m_df = (
    pd.read_csv("../data/external/threatfox_malicious_domains.csv")
    .assign(
        domain=lambda frame: frame["ioc_value"].map(replace_quotes),
        malicious=1,
    )
)

In [101]:
# Benign set: read the Majestic Million sample, align its `Domain` column name
# with the malicious frame, and label every row with malicious=0.
b_df = (
    pd.read_csv("../data/raw/majestic_million_small.csv")
    .rename(columns={'Domain': 'domain'})
    .assign(malicious=0)
)

In [102]:
# Keep only the two shared columns and stack benign rows on top of malicious
# ones, renumbering the index from 0.
cols = ['domain', 'malicious']
frames = [frame.loc[:, cols] for frame in (b_df, m_df)]
domains_df = pd.concat(frames, ignore_index=True)

In [103]:
# Preview the combined frame (benign rows first, then malicious).
domains_df

Unnamed: 0,domain,malicious
0,google.com,0
1,facebook.com,0
2,youtube.com,0
3,twitter.com,0
4,instagram.com,0
...,...,...
16866,retechnolodgy.top,1
16867,rpoznahu.top,1
16868,aluditos.top,1
16869,cosmokosmo.best,1


In [104]:
# Fixed-seed draw of 100 domains so the evaluation subset is repeatable.
test_df = domains_df.sample(n=100, random_state=5)

In [105]:
# Peek at the first five sampled rows.
test_df.head(n=5)

Unnamed: 0,domain,malicious
157,loc.gov,0
6694,sznews.com,0
6677,ng.ru,0
11503,harringtonsavingss.com,1
8095,freeforums.net,0


In [106]:
# Class balance (benign=0 vs malicious=1) of the sampled evaluation set.
test_df['malicious'].value_counts()

0    60
1    40
Name: malicious, dtype: int64

In [107]:
# Pull the sampled domains and their ground-truth labels out as plain lists.
domains = test_df.domain.tolist()
y = test_df.malicious.tolist()
# Materialise the (domain, label) pairs. A bare zip() is a one-shot iterator,
# so re-running the prediction cell without re-running this one would loop
# over nothing and leave `results` empty.
zipped = list(zip(domains, y))

In [108]:
# Run Artemis over every sampled domain, collecting one prediction per domain
# in `results` (same order as `zipped`) and logging each row to a CSV file.
results = []
with open('testing_clf.csv', 'w') as f:
    # The header is written exactly once, before any rows.
    f.write('domain,y,prediction\n')
    # The loop variable is `label`, not `y`, so the label list `y` built
    # earlier is not overwritten by the last element.
    for domain, label in zipped:
        try:
            prediction = artemis.predict(domain, True)
        except Exception as err:
            # Record a missing prediction instead of silently re-using the
            # previous domain's value (or hitting NameError on the first
            # iteration). `Exception` rather than a bare except keeps
            # Ctrl-C / kernel interrupt working.
            print(f"Prediction failed for {domain}: {err!r}")
            prediction = None
        results.append(prediction)
        # Missing predictions are logged as an empty field.
        pred_field = '' if prediction is None else prediction
        f.write(f"{domain},{label},{pred_field}\n")
        print("\n\n\n")
        # Brief pause between domains; presumably throttles the lookups that
        # artemis.predict performs — TODO confirm.
        sleep(0.5)


Received WHOIS data for loc.gov: 
{
  "domain_name": "LOC.GOV",
  "registrar": null,
  "whois_server": null,
  "referral_url": null,
  "updated_date": null,
  "creation_date": null,
  "expiration_date": null,
  "name_servers": null,
  "status": "ACTIVE",
  "emails": "security@loc.gov",
  "dnssec": null,
  "name": null,
  "org": null,
  "address": null,
  "city": null,
  "state": null,
  "zipcode": null,
  "country": null
}
loc.gov: {'A': {'IP': '104.16.54.16', 'Count': 2}, 'MX': {'IP': '140.147.31.2', 'Count': 4}}
Received IP addresses of type <class 'dict'> for loc.gov.
Checking record type: A for {'A': {'IP': '104.16.54.16', 'Count': 2}, 'MX': {'IP': '140.147.31.2', 'Count': 4}}
104.16.54.16 not NA!
Current Results: {'CC': 'US', 'Org': 'Cloudflare, Inc.'}
Checking record type: MX for {'A': {'IP': '104.16.54.16', 'Count': 2}, 'MX': {'IP': '140.147.31.2', 'Count': 4}}
140.147.31.2 not NA!
Current Results: {'CC': 'US', 'Org': 'Library of Congress'}
Received DNS data of <class 'dict'>for

In [109]:
# Attach the collected predictions; `results` is in the same row order as test_df.
test_df = test_df.assign(prediction=results)

In [110]:
# 1 where the prediction equals the ground-truth label, else 0
# (a missing prediction never compares equal, so it scores 0).
test_df['correct'] = test_df['malicious'].eq(test_df['prediction']).astype(int)

In [111]:
# Show the evaluation frame, now including the `prediction` and `correct` columns.
test_df

Unnamed: 0,domain,malicious,prediction,correct
157,loc.gov,0,0.0,1
6694,sznews.com,0,0.0,1
6677,ng.ru,0,0.0,1
11503,harringtonsavingss.com,1,1.0,1
8095,freeforums.net,0,0.0,1
...,...,...,...,...
14690,site19.primonet.co.uk,1,0.0,0
10703,campoeroca.bond,1,1.0,1
14768,dev.zemp.com,1,0.0,0
16519,wheelformforsu.top,1,0.0,0


In [117]:
# Overall hit count: `correct` is 1 for each matching prediction, so its sum
# is the number of correctly classified domains.
val = test_df.correct.sum()
# Report against the actual number of evaluated rows rather than a hard-coded
# 100, so the message stays true if the sample size changes.
print(f"Artemis is correctly identifying {val} out of {len(test_df)} domains as either malicious or benign")



Artemis is correctly identifying 72 out of 100 domains as either malicious or benign


In [118]:
# Hits among the truly malicious rows versus the number of malicious rows
# (labels are 0/1, so summing `malicious` counts them).
malicious_rows = test_df.loc[test_df['malicious'] == 1]
mal_right = malicious_rows['correct'].sum()
mal_total = test_df['malicious'].sum()
print(f"Artemis correctly identified {mal_right} out of {mal_total} malicious domains.")

Artemis correctly identified 13 out of 40 malicious domains.


In [119]:
# Inspect only the rows whose ground-truth label is malicious.
test_df.loc[test_df['malicious'] == 1]

Unnamed: 0,domain,malicious,prediction,correct
11503,harringtonsavingss.com,1,1.0,1
10154,yourgroceries.top,1,1.0,1
14253,wccb.co.zw,1,0.0,0
15323,journalijdrresearchmarket.net,1,,0
14413,app3.maintorna.com,1,1.0,1
15455,mobilegenie.pk,1,0.0,0
14744,projetagro.com,1,0.0,0
11791,www.syklkgepvimdqafwcyyf.com,1,1.0,1
12520,waitingdate.com,1,1.0,1
13997,parkinihol.top,1,1.0,1
