In [70]:
import pandas as pd
import src.scripts.artemis as artemis
from time import sleep
import numpy as np

In [71]:
def replace_quotes(val):
    """Return ``val`` with every double-quote character removed.

    Args:
        val: A string, possibly wrapped in or containing ``"`` characters.

    Returns:
        The same text with all ``"`` characters stripped out.
    """
    # Splitting on the quote character and re-joining drops every occurrence.
    pieces = val.split('"')
    return ''.join(pieces)

# Load the ThreatFox export, derive a quote-free `domain` column from
# `ioc_value`, and label every row as malicious (1).
m_df = pd.read_csv("../data/external/threatfox_malicious_domains.csv")
m_df = m_df.assign(
    domain=m_df.ioc_value.apply(replace_quotes),
    malicious=1,
)

In [72]:
# Load the Majestic Million subset, normalise the column name to `domain`,
# and label every row as not malicious (0).
b_df = (
    pd.read_csv("../data/raw/majestic_million_small.csv")
    .rename(columns={'Domain': 'domain'})
    .assign(malicious=0)
)

In [73]:
# Stack the 0-labelled rows on top of the 1-labelled rows, keeping only the
# shared columns and renumbering the index from zero.
cols = ['domain', 'malicious']
labelled_frames = [frame[cols] for frame in (b_df, m_df)]
domains_df = pd.concat(labelled_frames, ignore_index=True)

In [74]:
# Preview the combined table of labelled domains (0 = majestic list, 1 = threatfox list).
domains_df

Unnamed: 0,domain,malicious
0,google.com,0
1,facebook.com,0
2,youtube.com,0
3,twitter.com,0
4,instagram.com,0
...,...,...
16866,retechnolodgy.top,1
16867,rpoznahu.top,1
16868,aluditos.top,1
16869,cosmokosmo.best,1


In [86]:
# Draw a fixed-seed random sample of 100 domains to evaluate the classifier on.
test_df = domains_df.sample(n=100, random_state=4)

In [87]:
# Peek at the first rows of the evaluation sample.
test_df.head()

Unnamed: 0,domain,malicious
6387,gmx.net,0
9732,furusato-tax.jp,0
16118,kosmolitopor.space,1
2901,humanesociety.org,0
5149,u.to,0


In [88]:
# Class balance of the evaluation sample (count of rows per label).
test_df['malicious'].value_counts()

0    58
1    42
Name: malicious, dtype: int64

In [89]:
# Pair each sampled domain with its ground-truth label, in test_df row order.
domains = test_df.domain.tolist()
y = test_df.malicious.tolist()
# Materialise the pairs as a list: a bare zip() is a one-shot iterator, so a
# second pass over it (e.g. re-running the prediction cell) would silently
# yield nothing and leave the results empty.
zipped = list(zip(domains, y))

In [90]:
# Score every (domain, label) pair from `zipped` with the Artemis classifier.
# `results` receives exactly one entry per domain, in iteration order, so it
# can later be attached to the sampled frame as a column. Each outcome is also
# logged to testing_clf.csv as a `domain,y,prediction` row.
results = []
with open('testing_clf.csv', 'w') as f:
    # Header is written once, up front, rather than on failures.
    f.write('domain,y,prediction\n')
    # `label` (not `y`) so the loop does not clobber the label list `y`.
    for domain, label in zipped:
        try:
            # NOTE(review): the meaning of the second positional argument is
            # not visible here; presumably a verbose flag, given the WHOIS
            # dump in the cell output. Confirm against artemis.predict.
            prediction = artemis.predict(domain, True)
        except Exception as exc:
            # Record a missing prediction instead of reusing the previous
            # domain's value (or hitting NameError on the first domain).
            prediction = None
            print(f"Prediction failed for {domain}: {exc!r}")
        results.append(prediction)
        f.write(f'{domain},{label},{prediction}\n')
        print("\n\n\n")
        # Short pause between lookups.
        sleep(0.5)


Received WHOIS data for gmx.net: 
{
  "domain_name": [
    "GMX.NET",
    "gmx.net"
  ],
  "registrar": "PSI-USA, Inc. dba Domain Robot",
  "whois_server": "whois.psi-usa.info",
  "referral_url": null,
  "updated_date": [
    "2021-12-09 08:01:34",
    "2022-01-27 10:06:32"
  ],
  "creation_date": "1998-12-09 05:00:00",
  "expiration_date": [
    "2022-12-08 05:00:00",
    "2023-04-07 04:27:45"
  ],
  "name_servers": [
    "NS-GMX.UI-DNS.BIZ",
    "NS-GMX.UI-DNS.COM",
    "NS-GMX.UI-DNS.DE",
    "NS-GMX.UI-DNS.ORG",
    "ns-gmx.ui-dns.org",
    "ns-gmx.ui-dns.de",
    "ns-gmx.ui-dns.com",
    "ns-gmx.ui-dns.biz"
  ],
  "status": [
    "clientTransferProhibited https://icann.org/epp#clientTransferProhibited",
    "clientTransferProhibited https://www.icann.org/epp#clientTransferProhibited"
  ],
  "emails": "domain-abuse@psi-usa.info",
  "dnssec": "signedDelegation",
  "name": "REDACTED FOR PRIVACY",
  "org": "1&1 Mail & Media GmbH",
  "address": "REDACTED FOR PRIVACY",
  "city": "REDACT

In [91]:
# Attach the classifier outputs as a new column. A plain list is assigned
# positionally, so `results` must hold one entry per row of test_df, in the
# same row order.
test_df['prediction'] = results

In [92]:
# Flag rows where the prediction matches the label (1) or not (0).
# Rows with a missing prediction never compare equal, so they count as 0.
is_match = test_df['malicious'].eq(test_df['prediction'])
test_df['correct'] = np.where(is_match, 1, 0)

In [93]:
# Inspect the per-domain labels, predictions and correctness flags.
test_df

Unnamed: 0,domain,malicious,prediction,correct
6387,gmx.net,0,0.0,1
9732,furusato-tax.jp,0,0.0,1
16118,kosmolitopor.space,1,0.0,0
2901,humanesociety.org,0,0.0,1
5149,u.to,0,0.0,1
...,...,...,...,...
16033,magentoinfo.name,1,,0
3736,winamp.com,0,0.0,1
9932,jschina.com.cn,0,0.0,1
3070,bahn.de,0,0.0,1


In [94]:
# Number of correctly classified domains in the 100-row sample.
test_df['correct'].sum()




73

In [95]:
# Number of malicious-labelled domains that were classified correctly.
test_df.loc[test_df['malicious'] == 1, 'correct'].sum()

15

In [96]:
# Total number of malicious-labelled domains in the sample.
test_df['malicious'].sum()

42