# Explore here

In [8]:
# Load Libraries
import pandas as pd
import numpy as np
import regex as re
from collections import Counter
from pickle import dump
from pickle import load
import nltk

from sklearn.feature_extraction.text import CountVectorizer

# EDA

In [9]:
df = pd.read_csv('C:/Users/Jorge Payà/Desktop/4Geeks/Final Project/Code/DGA-Detection-project2/data/raw/dga_data_small.csv')
df.head()

Unnamed: 0,isDGA,domain,host,subclass
0,dga,tyopcrkqgxcfm,tyopcrkqgxcfm.co.uk,cryptolocker
1,dga,72j5rn1l9mzleo6203v1ogenfl,72j5rn1l9mzleo6203v1ogenfl.org,gameoverdga
2,dga,thenrest,thenrest.net,nivdort
3,dga,15ihbm71utcnfa8dk1mmgoobl9,15ihbm71utcnfa8dk1mmgoobl9.org,gameoverdga
4,dga,x1d6ou7e7kofk60ayhq74x7e,x1d6ou7e7kofk60ayhq74x7e.net,gameoverdga


In [10]:
# For our first model, we will use the feature named 'host' and the target variable 'isDGA'. In next experiments we will use other features such as subclass for clustering
df.drop(['host', 'subclass'], axis=1, inplace=True)
df.head()

Unnamed: 0,isDGA,domain
0,dga,tyopcrkqgxcfm
1,dga,72j5rn1l9mzleo6203v1ogenfl
2,dga,thenrest
3,dga,15ihbm71utcnfa8dk1mmgoobl9
4,dga,x1d6ou7e7kofk60ayhq74x7e


In [11]:
df['isDGA'] = df['isDGA'].apply(lambda x: 1 if x == 'dga' else 0)
df.head()

Unnamed: 0,isDGA,domain
0,1,tyopcrkqgxcfm
1,1,72j5rn1l9mzleo6203v1ogenfl
2,1,thenrest
3,1,15ihbm71utcnfa8dk1mmgoobl9
4,1,x1d6ou7e7kofk60ayhq74x7e


In [12]:
df['d_length'] = df['domain'].apply(lambda x: len(x))
df.head()

Unnamed: 0,isDGA,domain,d_length
0,1,tyopcrkqgxcfm,13
1,1,72j5rn1l9mzleo6203v1ogenfl,26
2,1,thenrest,8
3,1,15ihbm71utcnfa8dk1mmgoobl9,26
4,1,x1d6ou7e7kofk60ayhq74x7e,24


In [13]:
# function to get the entropy of a domain name
def entropy(domain):
    p, lns = Counter(domain), float(len(domain))
    return -sum( count/lns * np.log2(count/lns) for count in p.values())

df['entropy'] = df['domain'].apply(entropy)
df.head()

Unnamed: 0,isDGA,domain,d_length,entropy
0,1,tyopcrkqgxcfm,13,3.546594
1,1,72j5rn1l9mzleo6203v1ogenfl,26,4.132944
2,1,thenrest,8,2.5
3,1,15ihbm71utcnfa8dk1mmgoobl9,26,4.180833
4,1,x1d6ou7e7kofk60ayhq74x7e,24,3.834963


In [18]:
top_english_words = pd.read_csv('C:/Users/Jorge Payà/Desktop/4Geeks/Final Project/Code/DGA-Detection-project2/data/raw/google-10000-english.txt', header=None, names=['words'])
d = top_english_words
#dump(d, open('C:/Users/Jorge Payà/Desktop/4Geeks/Final Project/Code/DGA-Detection-project2/data/raw/top_english_words.pkl', 'wb'))

In [20]:
import pickle

with open('C:/Users/Jorge Payà/Desktop/4Geeks/Final Project/Code/DGA-Detection-project2/data/raw/top_english_words.pkl', 'rb') as f:
    d = pickle.load(f)

def ngrams(word, n):
    if not isinstance(word, list):
        word = [word]
    if not isinstance(n, list):
        n = [n]
    
    l_ngrams = [w[i:i+curr_n] for w in word for curr_n in n for i in range(0,len(w)-curr_n+1)]
    return l_ngrams

def ngram_feature(domain, d, n):   
    l_ngrams = ngrams(domain, n)
    count_sum = sum(d[ngram] for ngram in l_ngrams if d.get(ngram, 0))
    try:
        feature = count_sum/(len(domain)-n+1)
    except ZeroDivisionError:
        feature = 0
    return feature
    
def average_ngram_feature(l_ngram_feature):
    return sum(l_ngram_feature)/len(l_ngram_feature) if l_ngram_feature else 0

df['ngrams'] = df['domain'].apply(lambda x: average_ngram_feature([ngram_feature(x, d, i) for i in range(1, 4)]))
df.head()

Unnamed: 0,isDGA,domain,d_length,entropy,ngrams
0,1,tyopcrkqgxcfm,13,3.546594,0.0
1,1,72j5rn1l9mzleo6203v1ogenfl,26,4.132944,0.0
2,1,thenrest,8,2.5,0.0
3,1,15ihbm71utcnfa8dk1mmgoobl9,26,4.180833,0.0
4,1,x1d6ou7e7kofk60ayhq74x7e,24,3.834963,0.0
