# Set Up

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import re

from scipy.stats import skew, chi2_contingency
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.impute import SimpleImputer

pd.set_option('display.max_columns', 15)

In [2]:
# Read the main file
main_df = pd.read_csv('dga_data_small.csv')
main_df.head()

Unnamed: 0,isDGA,domain,host,subclass
0,dga,tyopcrkqgxcfm,tyopcrkqgxcfm.co.uk,cryptolocker
1,dga,72j5rn1l9mzleo6203v1ogenfl,72j5rn1l9mzleo6203v1ogenfl.org,gameoverdga
2,dga,thenrest,thenrest.net,nivdort
3,dga,15ihbm71utcnfa8dk1mmgoobl9,15ihbm71utcnfa8dk1mmgoobl9.org,gameoverdga
4,dga,x1d6ou7e7kofk60ayhq74x7e,x1d6ou7e7kofk60ayhq74x7e.net,gameoverdga


In [3]:
main_df['tld'] = main_df['host'].str.split('.', n=1).str[1]

main_df['cctld'] = main_df['tld'].str.split('.', n=1).str[1].fillna('None') # Code country top level domain

main_df

main_df = main_df.rename(columns = {'domain': 'subdomain'})

In [4]:
def is_ascii_domain(subdomain):
    ascii_pattern = re.compile(r'^[a-zA-Z0-9.-]+$')
    return int(bool(ascii_pattern.match(subdomain)))
    
main_df['ascii'] = main_df['subdomain'].apply(is_ascii_domain)

# Domain length
main_df['subdomain_len'] = main_df['subdomain'].str.len()
main_df['host_len'] = main_df['host'].str.len()

In [5]:
main_df

Unnamed: 0,isDGA,subdomain,host,subclass,tld,cctld,ascii,subdomain_len,host_len
0,dga,tyopcrkqgxcfm,tyopcrkqgxcfm.co.uk,cryptolocker,co.uk,uk,1,13,19
1,dga,72j5rn1l9mzleo6203v1ogenfl,72j5rn1l9mzleo6203v1ogenfl.org,gameoverdga,org,,1,26,30
2,dga,thenrest,thenrest.net,nivdort,net,,1,8,12
3,dga,15ihbm71utcnfa8dk1mmgoobl9,15ihbm71utcnfa8dk1mmgoobl9.org,gameoverdga,org,,1,26,30
4,dga,x1d6ou7e7kofk60ayhq74x7e,x1d6ou7e7kofk60ayhq74x7e.net,gameoverdga,net,,1,24,28
...,...,...,...,...,...,...,...,...,...
1995,legit,88ha,88ha.com,alexa,com,,1,4,8
1996,legit,grooby,grooby.com,alexa,com,,1,6,10
1997,legit,51zzl,51zzl.com,alexa,com,,1,5,9
1998,legit,index-education,index-education.com,legit,com,,1,15,19


In [6]:
# Vowel count
main_df['subdomain_vowel_count'] = main_df['subdomain'].str.lower().str.count(r'[aeoiu]')

# Consonant count
main_df['subdomain_consonant_count'] = main_df['subdomain_len'] - main_df['subdomain_vowel_count']

# Has Numeric - boolean to int type
main_df['has_num'] = main_df['subdomain'].str.contains(r'\d').astype(int)

In [7]:
def legitToNumber(isdga):
    if isdga == 'legit':
        return 1
    else: return 0

main_df['isDGA_N'] = main_df['isDGA'].apply(legitToNumber)

main_df['digitCount'] = main_df['subdomain'].str.count(r'\d')

main_df.head()

Unnamed: 0,isDGA,subdomain,host,subclass,tld,cctld,ascii,subdomain_len,host_len,subdomain_vowel_count,subdomain_consonant_count,has_num,isDGA_N,digitCount
0,dga,tyopcrkqgxcfm,tyopcrkqgxcfm.co.uk,cryptolocker,co.uk,uk,1,13,19,1,12,0,0,0
1,dga,72j5rn1l9mzleo6203v1ogenfl,72j5rn1l9mzleo6203v1ogenfl.org,gameoverdga,org,,1,26,30,4,22,1,0,10
2,dga,thenrest,thenrest.net,nivdort,net,,1,8,12,2,6,0,0,0
3,dga,15ihbm71utcnfa8dk1mmgoobl9,15ihbm71utcnfa8dk1mmgoobl9.org,gameoverdga,org,,1,26,30,5,21,1,0,7
4,dga,x1d6ou7e7kofk60ayhq74x7e,x1d6ou7e7kofk60ayhq74x7e.net,gameoverdga,net,,1,24,28,6,18,1,0,9


In [8]:
OH_encoder = OneHotEncoder(sparse_output=False)

In [9]:
encoded = OH_encoder.fit_transform(np.array(main_df['subclass']).reshape(-1, 1))
encoded_features = pd.DataFrame(encoded, columns=OH_encoder.get_feature_names_out(['subclass']))

main_df = main_df.join(encoded_features)

main_df.head()

Unnamed: 0,isDGA,subdomain,host,subclass,tld,cctld,ascii,...,subclass_cryptolocker,subclass_gameoverdga,subclass_goz,subclass_legit,subclass_necurs,subclass_newgoz,subclass_nivdort
0,dga,tyopcrkqgxcfm,tyopcrkqgxcfm.co.uk,cryptolocker,co.uk,uk,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,dga,72j5rn1l9mzleo6203v1ogenfl,72j5rn1l9mzleo6203v1ogenfl.org,gameoverdga,org,,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,dga,thenrest,thenrest.net,nivdort,net,,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,dga,15ihbm71utcnfa8dk1mmgoobl9,15ihbm71utcnfa8dk1mmgoobl9.org,gameoverdga,org,,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,dga,x1d6ou7e7kofk60ayhq74x7e,x1d6ou7e7kofk60ayhq74x7e.net,gameoverdga,net,,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [10]:
domain_cross_check = pd.crosstab(main_df['subclass'],main_df['isDGA_N'])
domain_cross_check

isDGA_N,0,1
subclass,Unnamed: 1_level_1,Unnamed: 2_level_1
alexa,0,501
bamital,18,0
cryptolocker,462,0
gameoverdga,105,0
goz,77,0
legit,0,499
necurs,111,0
newgoz,119,0
nivdort,108,0


In [11]:
# Check if subdomain starting with digit
main_df['startW/Digit'] = main_df['subdomain'].str.match(r'^\d').astype(int)

# EDA

In [12]:
main_df

Unnamed: 0,isDGA,subdomain,host,subclass,tld,cctld,ascii,...,subclass_gameoverdga,subclass_goz,subclass_legit,subclass_necurs,subclass_newgoz,subclass_nivdort,startW/Digit
0,dga,tyopcrkqgxcfm,tyopcrkqgxcfm.co.uk,cryptolocker,co.uk,uk,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0
1,dga,72j5rn1l9mzleo6203v1ogenfl,72j5rn1l9mzleo6203v1ogenfl.org,gameoverdga,org,,1,...,1.0,0.0,0.0,0.0,0.0,0.0,1
2,dga,thenrest,thenrest.net,nivdort,net,,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0
3,dga,15ihbm71utcnfa8dk1mmgoobl9,15ihbm71utcnfa8dk1mmgoobl9.org,gameoverdga,org,,1,...,1.0,0.0,0.0,0.0,0.0,0.0,1
4,dga,x1d6ou7e7kofk60ayhq74x7e,x1d6ou7e7kofk60ayhq74x7e.net,gameoverdga,net,,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,legit,88ha,88ha.com,alexa,com,,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1
1996,legit,grooby,grooby.com,alexa,com,,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0
1997,legit,51zzl,51zzl.com,alexa,com,,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1
1998,legit,index-education,index-education.com,legit,com,,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0


In [13]:
for col in main_df.columns:
    print(f'Unique values for {col}: {main_df[col].unique()}\n')

Unique values for isDGA: ['dga' 'legit']

Unique values for subdomain: ['tyopcrkqgxcfm' '72j5rn1l9mzleo6203v1ogenfl' 'thenrest' ... '51zzl'
 'index-education' 'fastpics']

Unique values for host: ['tyopcrkqgxcfm.co.uk' '72j5rn1l9mzleo6203v1ogenfl.org' 'thenrest.net' ...
 '51zzl.com' 'index-education.com' 'fastpics.us']

Unique values for subclass: ['cryptolocker' 'gameoverdga' 'nivdort' 'necurs' 'newgoz' 'goz' 'bamital'
 'alexa' 'legit']

Unique values for tld: ['co.uk' 'org' 'net' 'ru' 'nf' 'ir' 'biz' 'info' 'bit' 'com' 'bz' 'in'
 'tv' 'kz' 'cx' 'ga' 'ms' 'ki' 'jp' 'sh' 'pro' 'eu' 'la' 'tw' 'mn' 'to'
 'ug' 'xxx' 'us' 'sx' 'ac' 'de' 'im' 'cm' 'co' 'so' 'sc' 'mx' 'su' 'nu'
 'cc' 'com.br' 'io' 'it' 'com.tr' 'be' 'pl' 'gr' 'com.au' 'cl' 'tk'
 'co.id' 'fr' 'nl' 'ch' 'dk' 'hu' 'ua' 'lt' 'gov.tw' 'pe' 'lv' 'com.tw'
 'com.cn' 'ca' 'ba' 'ie' 'ro' 'co.kr' 'vn' 'co.jp' 'pt' 'cn' 'me' 'org.br'
 'at' 'gov.br' 'edu' 'tn' 'blog.br' 'presse.fr' 'net.cn' 'fi' 'am' 'az'
 'ph' 'blogspot.com' 'hr' 'hk' '

In [14]:
legit_df = main_df[(main_df['isDGA']=='legit')].reset_index().drop(columns='index')

legit_df

Unnamed: 0,isDGA,subdomain,host,subclass,tld,cctld,ascii,...,subclass_gameoverdga,subclass_goz,subclass_legit,subclass_necurs,subclass_newgoz,subclass_nivdort,startW/Digit
0,legit,teacherspayteachers,teacherspayteachers.com,alexa,com,,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0
1,legit,animespirit,animespirit.ru,alexa,ru,,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0
2,legit,pyramidcollection,pyramidcollection.com,legit,com,,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0
3,legit,callingcardconnect,callingcardconnect.com,legit,com,,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0
4,legit,undertonevideo,undertonevideo.com,legit,com,,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,legit,88ha,88ha.com,alexa,com,,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1
996,legit,grooby,grooby.com,alexa,com,,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0
997,legit,51zzl,51zzl.com,alexa,com,,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1
998,legit,index-education,index-education.com,legit,com,,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0


In [20]:
legit_df['tld'].unique()

array(['com', 'ru', 'com.br', 'io', 'ir', 'net', 'it', 'info', 'com.tr',
       'be', 'org', 'pl', 'gr', 'com.au', 'cl', 'cc', 'tk', 'co.id', 'de',
       'fr', 'nl', 'ch', 'dk', 'hu', 'ua', 'lt', 'jp', 'in', 'co',
       'gov.tw', 'kz', 'co.uk', 'pe', 'lv', 'com.tw', 'com.cn', 'ca',
       'ba', 'ie', 'tv', 'mx', 'xxx', 'ro', 'co.kr', 'vn', 'co.jp', 'pt',
       'cn', 'me', 'org.br', 'at', 'tw', 'eu', 'gov.br', 'edu', 'tn',
       'blog.br', 'presse.fr', 'net.cn', 'fi', 'am', 'az', 'ph', 'nu',
       'blogspot.com', 'hr', 'hk', 'is', 'mu', 'fm', 'us', 'edu.sa',
       'gov', 'no', 'co.il', 'gob.ar', 'com.my', 'com.mx', 'com.ar',
       'asia', 'do', 'cz', 'co.in', 'gouv.fr', 'jus.br', 'se', 'biz'],
      dtype=object)

In [15]:
dga_df = main_df[(main_df['isDGA']!='legit')].reset_index().drop(columns='index')

dga_df

Unnamed: 0,isDGA,subdomain,host,subclass,tld,cctld,ascii,...,subclass_gameoverdga,subclass_goz,subclass_legit,subclass_necurs,subclass_newgoz,subclass_nivdort,startW/Digit
0,dga,tyopcrkqgxcfm,tyopcrkqgxcfm.co.uk,cryptolocker,co.uk,uk,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0
1,dga,72j5rn1l9mzleo6203v1ogenfl,72j5rn1l9mzleo6203v1ogenfl.org,gameoverdga,org,,1,...,1.0,0.0,0.0,0.0,0.0,0.0,1
2,dga,thenrest,thenrest.net,nivdort,net,,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0
3,dga,15ihbm71utcnfa8dk1mmgoobl9,15ihbm71utcnfa8dk1mmgoobl9.org,gameoverdga,org,,1,...,1.0,0.0,0.0,0.0,0.0,0.0,1
4,dga,x1d6ou7e7kofk60ayhq74x7e,x1d6ou7e7kofk60ayhq74x7e.net,gameoverdga,net,,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,dga,jlljsxwrfkys,jlljsxwrfkys.ru,cryptolocker,ru,,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0
996,dga,maudmjvij,maudmjvij.xxx,necurs,xxx,,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0
997,dga,lllndsiljokku,lllndsiljokku.ru,cryptolocker,ru,,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0
998,dga,septemberfish,septemberfish.net,nivdort,net,,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0


In [16]:
legit_df['subdomain_len'].mean()

10.021

In [17]:
dga_df['subdomain_len'].mean()

16.998

# Sample

In [18]:
import nltk #Natural Language Toolkit
from nltk.corpus import words

nltk.download('words')

english_words = set(words.words())

def is_composed_of_english_words(subdomain):
    possible_words = []
    found_words = []
    
    for i in range(len(subdomain)-1):
        for j in range(i+1, len(subdomain)-1):
            possible_words.append(subdomain[i:j])

    for word in possible_words:
        if len(word) >= 2 and word in english_words:
            found_words.append(word)
    
    return found_words

sample = 'teacherspayteachers'
print(is_composed_of_english_words(sample))

['te', 'tea', 'teach', 'teache', 'teacher', 'ea', 'each', 'ach', 'ache', 'acher', 'che', 'he', 'her', 'hers', 'er', 'ers', 'spa', 'spay', 'pa', 'pay', 'ay', 'te', 'tea', 'teach', 'teache', 'ea', 'each', 'ach', 'ache', 'che', 'he']


[nltk_data] Downloading package words to /Users/ezishr/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [19]:

# Download the words corpus if you haven't already

# Get the set of valid English words
english_words = set(words.words())

# List of words to check
words_to_check = ['ach', 'ache', 'acher', 'che']

# Check each word for validity
validity = {word: (word in english_words) for word in words_to_check}
print(validity)

{'ach': True, 'ache': True, 'acher': True, 'che': True}
