In [303]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import pandas as pd
from joblib import dump
import numpy as np

from src.scripts.artemis_data import get_ns_cols, get_email_cols
from src.scripts.dga.dga_functions import dga_prediction

In [304]:
# Read the CSV at the relative path below into `df`; every later cell derives
# its data from this frame.
file = "../data/processed/final_whois_data.csv"
df = pd.read_csv(file)


In [305]:
# Display the column index of the freshly loaded frame.
df.columns

Index(['domain', 'redacted', 'name_servers_1', 'name_servers_2',
       'name_servers_3', 'name_servers_4', 'registrant_contact_name',
       'registrar', 'updated_date', 'country', 'dnssec', 'emails', 'org',
       'state', 'whois_server', 'address', 'city', 'emails_1', 'emails_2',
       'emails_3', 'expiration_date', 'name', 'zipcode', 'creation_date',
       'name_servers_5', 'name_servers_6', 'name_servers_7', 'entropy',
       'dns_rec_a_cc', 'dns_rec_a_org', 'dns_rec_mx_cc', 'dns_rec_mx_org',
       'malicious', 'community', 'min_entropy', 'max_entropy', 'avg_entropy',
       'domain_count', 'malicious_ratio', 'days_between_creations',
       'days_since_creation', 'days_between_updates', 'days_since_update',
       'days_until_expiration', 'has_multiple_domain_names',
       'multiple_domain_names_match', 'number_name_servers',
       'num_different_ns_domains', 'main_name_server_domain',
       'serverDeleteProhibited', 'clientDeleteProhibited',
       'serverRenewProhibited',

In [306]:
# Separate the target column from the features: `data_df` keeps every column
# except the target, `labels` holds the target values as an array.
label = 'malicious'
data_df = df.drop(columns=[label])
labels = df[label].values

In [307]:
# Attach a `dga_probability` column to data_df.
#
#   redo = True   score every (domain, entropy) pair with dga_prediction() and
#                 write the result to dga_probabilities.csv as a cache.
#   redo = False  reuse the cached CSV instead of re-scoring.
#
# `labels` was taken from df before this cell runs, so data_df must leave this
# cell with exactly the same rows in the same order. A plain inner merge does not
# guarantee that: it drops rows whose domain is missing from the cache and
# multiplies rows when the cache holds a domain more than once.
redo = False
if redo:
    dga_probs = []
    domain_entropy_pairs = zip(data_df.domain.tolist(), data_df.entropy.tolist())
    for count, (domain, entropy) in enumerate(domain_entropy_pairs, start=1):
        dga_probs.append(dga_prediction(domain, entropy))
        if count % 10 == 0:
            # Lightweight progress indicator: print every 10th row number.
            print(count, end=" ")
    data_df['dga_probability'] = dga_probs
    dga_df = data_df[['domain', 'dga_probability']]
    dga_df.to_csv('dga_probabilities.csv', index=False)
else:
    dga_probs = pd.read_csv('dga_probabilities.csv')

    # Fail loudly on a stale cache rather than silently losing rows.
    uncached = ~data_df['domain'].isin(dga_probs['domain'])
    if uncached.any():
        raise ValueError(
            f"{int(uncached.sum())} domain(s) in data_df are missing from "
            "dga_probabilities.csv; set redo = True to rebuild the cache"
        )

    # One cache row per domain + left join => row count and row order of
    # data_df are unchanged, so the rows stay aligned with `labels`.
    dga_probs = dga_probs.drop_duplicates(subset='domain')
    data_df = data_df.merge(dga_probs, on='domain', how='left')

In [308]:
# Display the feature frame, which now includes the dga_probability column.
data_df

Unnamed: 0,domain,redacted,name_servers_1,name_servers_2,name_servers_3,name_servers_4,registrant_contact_name,registrar,updated_date,country,...,main_name_server_domain,serverDeleteProhibited,clientDeleteProhibited,serverRenewProhibited,clientRenewProhibited,clientTransferProhibited,num_emails,email_domains,num_email_domains,dga_probability
0,griffithspartners.com.au,0.0,T.AU,R.AU,Q.AU,S.AU,1,afilias australia pty ltd,2022-03-17,zz,...,t,1,0,1,0,0,0,email_nan,1,0.000000
1,21fss.com,0.0,NS33.DOMAINCONTROL.COM,NS34.DOMAINCONTROL.COM,,,0,"godaddy.com, llc",2021-01-19,us,...,domaincontrol,0,1,0,1,1,1,godaddy.com,1,0.000000
2,conniedunndesigns.com,0.0,NS1.DREAMHOST.COM,NS2.DREAMHOST.COM,NS3.DREAMHOST.COM,,0,dreamhost,2021-08-09,us,...,dreamhost,0,0,0,0,1,4,dreamhost.com;proxy.dreamhost.com,2,0.020000
3,corporatespending.com,0.0,NS1.P23.DYNECT.NET,NS2.P23.DYNECT.NET,NS3.P23.DYNECT.NET,NS4.P23.DYNECT.NET,0,"godaddy.com, llc",2020-11-08,us,...,dynect,0,1,0,1,1,1,godaddy.com,1,0.000000
4,eraliving.com,0.0,NS65.WORLDNIC.COM,NS66.WORLDNIC.COM,,,0,"network solutions, llc",2018-03-07,us,...,worldnic,0,0,0,0,1,3,eraliving.com;web.com,2,0.010000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5633,ewakyc72.top,1.0,a.dnspod.com,b.dnspod.com,A.DNSPOD.COM,B.DNSPOD.COM,0,"nicenic international group co., limited",2021-08-04,ru,...,dnspod,0,1,0,0,1,1,nicenic.net,1,0.186667
5634,ewazqx71.top,1.0,a.dnspod.com,b.dnspod.com,A.DNSPOD.COM,B.DNSPOD.COM,0,"nicenic international group co., limited",2021-08-04,ru,...,dnspod,0,1,0,0,1,1,nicenic.net,1,0.570000
5635,rebornx.duckdns.org,1.0,NS1.DUCKDNS.ORG,NS2.DUCKDNS.ORG,NS3.DUCKDNS.ORG,NS4.DUCKDNS.ORG,0,gandi sas,2021-11-26,fr,...,duckdns,0,0,0,0,1,2,contact.gandi.net;support.gandi.net,2,0.000000
5636,mail.dipiluminacion.com,0.0,NS1.MYHOSTINGPACK.COM,NS2.MYHOSTINGPACK.COM,ns1.myhostingpack.com,ns2.myhostingpack.com,0,akky online solutions s.a. de c.v.,2021-07-26,mx,...,myhostingpack,0,0,0,0,1,3,akky.mx,1,0.000000


In [309]:
# Display the feature frame's columns (the target column was dropped earlier).
data_df.columns

Index(['domain', 'redacted', 'name_servers_1', 'name_servers_2',
       'name_servers_3', 'name_servers_4', 'registrant_contact_name',
       'registrar', 'updated_date', 'country', 'dnssec', 'emails', 'org',
       'state', 'whois_server', 'address', 'city', 'emails_1', 'emails_2',
       'emails_3', 'expiration_date', 'name', 'zipcode', 'creation_date',
       'name_servers_5', 'name_servers_6', 'name_servers_7', 'entropy',
       'dns_rec_a_cc', 'dns_rec_a_org', 'dns_rec_mx_cc', 'dns_rec_mx_org',
       'community', 'min_entropy', 'max_entropy', 'avg_entropy',
       'domain_count', 'malicious_ratio', 'days_between_creations',
       'days_since_creation', 'days_between_updates', 'days_since_update',
       'days_until_expiration', 'has_multiple_domain_names',
       'multiple_domain_names_match', 'number_name_servers',
       'num_different_ns_domains', 'main_name_server_domain',
       'serverDeleteProhibited', 'clientDeleteProhibited',
       'serverRenewProhibited', 'clientRenew

In [310]:
# Remove the columns that are not fed to the model: the name-server and e-mail
# columns returned by the project helpers, the domain and date columns together
# with days_since_creation, and the aggregate entropy / domain_count /
# malicious_ratio columns.
data_df = (
    data_df
    .drop(columns=get_ns_cols(7))
    .drop(columns=get_email_cols(3))
    .drop(columns=['domain', 'updated_date', 'expiration_date', 'creation_date', 'days_since_creation'])
    .drop(columns=['min_entropy', 'max_entropy', 'avg_entropy', 'domain_count', 'malicious_ratio'])
)

In [311]:
# Fit ONE LabelEncoder on the values of all three country-code columns so a
# given code maps to the same integer in every column, then encode each column.
country_cols = ['country', 'dns_rec_a_cc', 'dns_rec_mx_cc']
countries_list = [code for source_col in country_cols for code in data_df[source_col].tolist()]
country_encoder = LabelEncoder().fit(countries_list)
for col in country_cols:
    encoded_codes = country_encoder.transform(data_df[col])
    data_df[col] = encoded_codes

In [312]:
# Label-encode each remaining categorical column with its own encoder.
# The fitted encoders are kept in `encoder_dict`, keyed by column name.
cols_requiring_encoding = [
    'registrar', 'dnssec', 'org', 'state', 'whois_server', 'address', 'city', 'name',
    'zipcode', 'dns_rec_a_org', 'dns_rec_mx_org', 'main_name_server_domain', 'email_domains',
]
# Fit first (each column only depends on its own values) ...
encoder_dict = {
    column: LabelEncoder().fit(data_df[column])
    for column in cols_requiring_encoding
}
# ... then replace every column by its integer codes.
for col, enc in encoder_dict.items():
    data_df[col] = enc.transform(data_df[col])


In [313]:
# Replace every remaining missing value with the sentinel -1.
data_df = data_df.fillna(-1)

In [314]:
# Hold out a test split. 0.25 is scikit-learn's default test fraction when
# neither test_size nor train_size is given; it is spelled out here for
# readability. The fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data_df,
    labels,
    test_size=0.25,
    random_state=0,
)

In [315]:
# Display the training features after encoding.
X_train

Unnamed: 0,redacted,registrant_contact_name,registrar,country,dnssec,org,state,whois_server,address,city,...,main_name_server_domain,serverDeleteProhibited,clientDeleteProhibited,serverRenewProhibited,clientRenewProhibited,clientTransferProhibited,num_emails,email_domains,num_email_domains,dga_probability
3784,0.0,0,255,52,8,720,84,131,256,216,...,619,0,0,0,0,1,2,591,2,0.006000
3196,0.0,0,349,112,0,681,413,218,191,43,...,832,0,0,0,0,0,0,174,1,0.000000
1569,0.0,0,255,52,8,720,84,131,256,216,...,503,0,0,0,0,1,2,591,2,0.000000
4729,0.0,0,399,102,8,849,42,141,368,250,...,284,0,0,0,0,1,2,567,1,0.540000
3424,0.0,0,207,112,0,681,413,218,191,43,...,396,0,0,0,0,0,0,174,1,0.011929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4931,0.0,0,347,11,8,725,323,173,384,176,...,614,0,0,0,0,1,2,453,2,1.000000
3264,1.0,0,173,102,8,326,32,101,224,258,...,802,0,1,0,1,1,1,232,1,0.010000
1653,1.0,0,301,86,8,724,283,158,347,209,...,391,0,0,0,0,0,1,368,1,0.257143
2607,0.0,0,29,112,8,681,413,8,191,43,...,658,0,0,0,0,0,1,357,1,0.050000


In [316]:
# Configure the random forest (entropy criterion, depth capped at 5, balanced
# class weights, fixed seed), then fit it on the training split.
clf = RandomForestClassifier(
    criterion='entropy',
    random_state=0,
    max_depth=5,
    min_samples_split=3,
    min_samples_leaf=2,
    class_weight='balanced',
)
# fit() returns the estimator itself; re-assigning keeps the cell output empty.
clf = clf.fit(X_train, y_train)

In [317]:
# Score the fitted classifier on the held-out split (for scikit-learn
# classifiers, `score` reports mean accuracy).
clf.score(X_test, y_test)

0.9163120567375886

In [318]:
# Output directory and file names for the artifacts written by the dump() cells:
# the classifier, the per-column encoder dict and the shared country encoder.
model_loc = "../models/"
model_name = "rfc.joblib"
encoder_name = "enc_dict.joblib"
country_encoder_name = "country_encoder.joblib"

In [319]:
# Persist the fitted classifier.
dump(clf, f"{model_loc}{model_name}")

['../models/rfc.joblib']

In [320]:
# Persist the per-column label encoders.
dump(encoder_dict, f"{model_loc}{encoder_name}")

['../models/enc_dict.joblib']

In [321]:
# Persist the shared country-code encoder.
dump(country_encoder, f"{model_loc}{country_encoder_name}")

['../models/country_encoder.joblib']

In [322]:
# Predict a class for every row of the held-out split.
preds = clf.predict(X_test)

In [323]:
# Build an inspection frame: a copy of the test features plus the true label,
# the predicted label and a 0/1 flag that is 1 where the two agree.
pred_df = X_test.assign(
    malicious=y_test,
    prediction=preds,
    correct=lambda frame: np.where(frame['malicious'] == frame['prediction'], 1, 0),
)

In [324]:
# Number of test rows whose prediction matches the true label.
pred_df['correct'].sum()

1292

In [325]:
# (rows, columns) of the inspection frame; the row count is the test-set size.
pred_df.shape

(1410, 39)

In [326]:
# Number of truly malicious test rows that were predicted correctly.
pred_df.loc[pred_df['malicious'] == 1, 'correct'].sum()

471

In [327]:
# (rows, columns) of the truly malicious part of the test set.
pred_df.loc[pred_df['malicious'] == 1].shape

(515, 39)

In [328]:
# Display the inspection frame (features, true label, prediction, correct flag).
pred_df

Unnamed: 0,redacted,registrant_contact_name,registrar,country,dnssec,org,state,whois_server,address,city,...,serverRenewProhibited,clientRenewProhibited,clientTransferProhibited,num_emails,email_domains,num_email_domains,dga_probability,malicious,prediction,correct
1505,1.0,0,192,73,8,333,413,178,347,209,...,0,0,1,1,468,1,0.030000,0,0,1
134,0.0,0,173,102,8,105,32,101,191,43,...,0,1,1,1,232,1,0.000000,0,0,1
4253,1.0,0,264,102,8,796,42,136,347,209,...,0,0,1,1,336,1,0.990000,1,1,1
2846,0.0,0,447,112,8,681,413,218,191,43,...,0,0,0,1,214,1,0.666667,0,0,1
2531,0.0,0,173,102,8,297,338,101,191,43,...,0,1,1,1,232,1,0.010000,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1581,0.0,0,173,102,8,681,82,101,191,43,...,0,1,1,1,232,1,0.016000,0,0,1
3970,1.0,0,287,86,8,99,284,149,347,209,...,0,0,1,1,355,1,0.020000,1,1,1
1921,0.0,0,318,102,8,717,258,171,11,33,...,0,0,1,2,433,2,0.000000,0,0,1
658,0.0,0,225,112,0,681,413,218,191,43,...,0,0,0,0,174,1,0.000000,0,0,1


In [329]:
# Print each input feature next to its importance in the fitted forest,
# in the order the features were seen during fit.
for feature_name, importance in zip(clf.feature_names_in_, clf.feature_importances_):
    print(feature_name, importance)

redacted 0.019022665536681437
registrant_contact_name 6.86389040167945e-06
registrar 0.09428450505372549
country 0.01837365008871329
dnssec 0.0019174413476922528
org 0.02064475639880002
state 0.01875610907725221
whois_server 0.06967028101403085
address 0.008974227672438115
city 0.02677915363271166
name 0.010777493502183386
zipcode 0.016398048456127343
entropy 0.012754342325869097
dns_rec_a_cc 0.015592931778474046
dns_rec_a_org 0.06681356331180652
dns_rec_mx_cc 0.06161034764952866
dns_rec_mx_org 0.18104110499892484
community 0.013715833803555321
days_between_creations 0.0022428126817823023
days_between_updates 0.004726594789685111
days_since_update 0.03845408924145966
days_until_expiration 0.03720411570062436
has_multiple_domain_names 0.0010472399982071699
multiple_domain_names_match 0.0007621381410937143
number_name_servers 0.004955746202311755
num_different_ns_domains 0.0015241303037719291
main_name_server_domain 0.012442542117381628
serverDeleteProhibited 1.5855073978325663e-18
clien