# Set Up

In [31]:
import seaborn as sns
import numpy as np
import pandas as pd
import re

In [32]:
# Load the main dataset from the CSV file into the working frame
DATA_FILE = 'dga_data_small1.csv'
main_df = pd.read_csv(DATA_FILE)

In [33]:
# Quick check of the suffix regex on one sample host: the capture group
# holds everything that follows the first dot.
pattern = r'\.(.+)'
host = 'tyopcrkqgxcfm.co.uk'
match = re.compile(pattern).search(host)
print(match[1])

co.uk


In [34]:
# Extract the top level domain from host to a new column

def extract_tld(host):
    """Return the part of ``host`` that follows its first dot.

    "tld" stands for top level domain. For 'tyopcrkqgxcfm.co.uk' the
    result is 'co.uk'; applying the function again to 'co.uk' gives 'uk'.

    Parameters
    ----------
    host : str or None
        Host name (or a suffix previously returned by this function).
        Non-string values such as None or NaN are tolerated.

    Returns
    -------
    str or None
        The text after the first dot, or None when ``host`` is not a
        string or has no dot followed by at least one character.
    """
    # The result of this function (possibly None) is fed back into it to
    # derive a second column, and missing cells arrive as float NaN;
    # re.search would raise TypeError on either, so bail out early.
    if not isinstance(host, str):
        return None

    match = re.search(r'\.(.+)', host)
    return match.group(1) if match else None

# Suffix after the first dot of each host, stored as the 'tld' column
host_suffix = main_df['host'].apply(extract_tld)
main_df['tld'] = host_suffix

# Country code top level domain: the suffix of the suffix
# (e.g. 'co.uk' -> 'uk'; empty when 'tld' contains no dot)
main_df['cctld'] = host_suffix.apply(extract_tld)

# The column named 'domain' holds the label before the first dot
main_df = main_df.rename(columns={'domain': 'subdomain'})

In [36]:
def is_ascii_domain(subdomain):
    """Flag whether ``subdomain`` uses only ASCII domain characters.

    Parameters
    ----------
    subdomain : str
        Label to check. Non-string values (None, NaN) are treated as
        not matching.

    Returns
    -------
    int
        1 when the whole string consists of one or more ASCII letters,
        digits, dots or hyphens; 0 otherwise (including the empty string).
    """
    if not isinstance(subdomain, str):
        return 0
    # fullmatch instead of match + '$': '$' also matches just before a
    # trailing newline, which would let 'abc\n' through.
    return int(re.fullmatch(r'[a-zA-Z0-9.-]+', subdomain) is not None)
    
# 1/0 flag from the ASCII character check on each subdomain
main_df['ascii'] = main_df['subdomain'].apply(is_ascii_domain)

# Character lengths of the subdomain and of the full host
for len_source in ('subdomain', 'host'):
    main_df[f'{len_source}_len'] = main_df[len_source].str.len()

In [39]:
# Lower-case once so both letter counts ignore case
subdomain_lower = main_df['subdomain'].str.lower()

# Number of vowels in each subdomain
main_df['subdomain_vowel_count'] = subdomain_lower.str.count(r'[aeoiu]')

# Number of consonants in each subdomain
main_df['subdomain_consonant_count'] = subdomain_lower.str.count(r'[bcdfghjklmnpqrstvwxyz]')

# 1 when the subdomain contains at least one digit, else 0
digit_present = main_df['subdomain'].str.contains(r'\d')
main_df['has_num'] = digit_present.astype(int)

# Exploratory Data Analysis (EDA)

In [47]:
# Show the frame with the derived feature columns (the rendered table elides the middle rows)
main_df

Unnamed: 0,isDGA,subdomain,host,subclass,tld,cctld,ascii,subdomain_len,host_len,subdomain_vowel_count,subdomain_consonant_count,has_num
0,dga,tyopcrkqgxcfm,tyopcrkqgxcfm.co.uk,cryptolocker,co.uk,uk,1,13,19,1,12,0
1,dga,72j5rn1l9mzleo6203v1ogenfl,72j5rn1l9mzleo6203v1ogenfl.org,gameoverdga,org,,1,26,30,4,12,1
2,dga,thenrest,thenrest.net,nivdort,net,,1,8,12,2,6,0
3,dga,15ihbm71utcnfa8dk1mmgoobl9,15ihbm71utcnfa8dk1mmgoobl9.org,gameoverdga,org,,1,26,30,5,14,1
4,dga,x1d6ou7e7kofk60ayhq74x7e,x1d6ou7e7kofk60ayhq74x7e.net,gameoverdga,net,,1,24,28,6,9,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,legit,88ha,88ha.com,alexa,com,,1,4,8,1,1,1
1996,legit,grooby,grooby.com,alexa,com,,1,6,10,2,4,0
1997,legit,51zzl,51zzl.com,alexa,com,,1,5,9,0,3,1
1998,legit,index-education,index-education.com,legit,com,,1,15,19,7,7,0


In [48]:
# Print the distinct values found in every column, one column per paragraph
for col, column_values in main_df.items():
    print(f'Unique values for {col}: {column_values.unique()}\n')

Unique values for isDGA: ['dga' 'legit']

Unique values for subdomain: ['tyopcrkqgxcfm' '72j5rn1l9mzleo6203v1ogenfl' 'thenrest' ... '51zzl'
 'index-education' 'fastpics']

Unique values for host: ['tyopcrkqgxcfm.co.uk' '72j5rn1l9mzleo6203v1ogenfl.org' 'thenrest.net' ...
 '51zzl.com' 'index-education.com' 'fastpics.us']

Unique values for subclass: ['cryptolocker' 'gameoverdga' 'nivdort' 'necurs' 'newgoz' 'goz' 'bamital'
 'alexa' 'legit']

Unique values for tld: ['co.uk' 'org' 'net' 'ru' 'nf' 'ir' 'biz' 'info' 'bit' 'com' 'bz' 'in'
 'tv' 'kz' 'cx' 'ga' 'ms' 'ki' 'jp' 'sh' 'pro' 'eu' 'la' 'tw' 'mn' 'to'
 'ug' 'xxx' 'us' 'sx' 'ac' 'de' 'im' 'cm' 'co' 'so' 'sc' 'mx' 'su' 'nu'
 'cc' 'com.br' 'io' 'it' 'com.tr' 'be' 'pl' 'gr' 'com.au' 'cl' 'tk'
 'co.id' 'fr' 'nl' 'ch' 'dk' 'hu' 'ua' 'lt' 'gov.tw' 'pe' 'lv' 'com.tw'
 'com.cn' 'ca' 'ba' 'ie' 'ro' 'co.kr' 'vn' 'co.jp' 'pt' 'cn' 'me' 'org.br'
 'at' 'gov.br' 'edu' 'tn' 'blog.br' 'presse.fr' 'net.cn' 'fi' 'am' 'az'
 'ph' 'blogspot.com' 'hr' 'hk' '

In [54]:
# Rows labelled 'legit', renumbered from 0. drop=True discards the old
# index directly; resetting and then dropping an 'index' column fails
# when the index is named and removes real data when a column called
# 'index' already exists.
legit_df = main_df[main_df['isDGA'] == 'legit'].reset_index(drop=True)

In [51]:
# Every row not labelled 'legit', renumbered from 0. drop=True discards
# the old index directly; resetting and then dropping an 'index' column
# fails when the index is named and removes real data when a column
# called 'index' already exists.
dga_df = main_df[main_df['isDGA'] != 'legit'].reset_index(drop=True)

In [52]:
# Inspect the subset of rows whose isDGA label is not 'legit'
dga_df

Unnamed: 0,isDGA,subdomain,host,subclass,tld,cctld,ascii,subdomain_len,host_len,subdomain_vowel_count,subdomain_consonant_count,has_num
0,dga,tyopcrkqgxcfm,tyopcrkqgxcfm.co.uk,cryptolocker,co.uk,uk,1,13,19,1,12,0
1,dga,72j5rn1l9mzleo6203v1ogenfl,72j5rn1l9mzleo6203v1ogenfl.org,gameoverdga,org,,1,26,30,4,12,1
2,dga,thenrest,thenrest.net,nivdort,net,,1,8,12,2,6,0
3,dga,15ihbm71utcnfa8dk1mmgoobl9,15ihbm71utcnfa8dk1mmgoobl9.org,gameoverdga,org,,1,26,30,5,14,1
4,dga,x1d6ou7e7kofk60ayhq74x7e,x1d6ou7e7kofk60ayhq74x7e.net,gameoverdga,net,,1,24,28,6,9,1
...,...,...,...,...,...,...,...,...,...,...,...,...
995,dga,jlljsxwrfkys,jlljsxwrfkys.ru,cryptolocker,ru,,1,12,15,0,12,0
996,dga,maudmjvij,maudmjvij.xxx,necurs,xxx,,1,9,13,3,6,0
997,dga,lllndsiljokku,lllndsiljokku.ru,cryptolocker,ru,,1,13,16,3,10,0
998,dga,septemberfish,septemberfish.net,nivdort,net,,1,13,17,4,9,0


In [55]:
# Inspect the subset of rows whose isDGA label is 'legit'
legit_df

Unnamed: 0,isDGA,subdomain,host,subclass,tld,cctld,ascii,subdomain_len,host_len,subdomain_vowel_count,subdomain_consonant_count,has_num
0,legit,teacherspayteachers,teacherspayteachers.com,alexa,com,,1,19,23,7,12,0
1,legit,animespirit,animespirit.ru,alexa,ru,,1,11,14,5,6,0
2,legit,pyramidcollection,pyramidcollection.com,legit,com,,1,17,21,6,11,0
3,legit,callingcardconnect,callingcardconnect.com,legit,com,,1,18,22,5,13,0
4,legit,undertonevideo,undertonevideo.com,legit,com,,1,14,18,7,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...
995,legit,88ha,88ha.com,alexa,com,,1,4,8,1,1,1
996,legit,grooby,grooby.com,alexa,com,,1,6,10,2,4,0
997,legit,51zzl,51zzl.com,alexa,com,,1,5,9,0,3,1
998,legit,index-education,index-education.com,legit,com,,1,15,19,7,7,0


In [56]:
# Mean subdomain length over the 'legit' rows
legit_df['subdomain_len'].mean()

10.021

In [57]:
# Mean subdomain length over the non-'legit' (DGA) rows, for comparison with the legit mean
dga_df['subdomain_len'].mean()

16.998