In [2]:
import numpy as np
import pandas as pd 
from sklearn import preprocessing as skp 
from sklearn import decomposition as skd
import utils as ut

In [7]:
# For this example, we will use a modified set of data from UNSW Logs generated by Bro/Zeke. 
# For reference, we will just read the logs for a day and see what a raw dns log looks like 
raw_df = pd.read_json('unsw/day_logs/dns.log', lines=True)
print(f'The Columns are {raw_df.columns.values}')

# Let us also see what the entries look like. 
raw_df.head()

The Columns are ['ts' 'uid' 'id.orig_h' 'id.orig_p' 'id.resp_h' 'id.resp_p' 'proto'
 'trans_id' 'rtt' 'query' 'qclass' 'qclass_name' 'qtype' 'qtype_name'
 'rcode' 'rcode_name' 'AA' 'TC' 'RD' 'RA' 'Z' 'answers' 'TTLs' 'rejected']


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,trans_id,rtt,query,...,rcode,rcode_name,AA,TC,RD,RA,Z,answers,TTLs,rejected
0,1474726000.0,Cmq5BD4abGJE9M6Dn6,192.168.1.166,45136,192.168.1.1,53,udp,31287,0.284601,babyws.withings.net,...,0.0,NOERROR,False,False,True,True,0,[89.30.121.150],[600.0],False
1,1474726000.0,CqS2E81hhnbbAo7xH9,192.168.1.120,37616,192.168.1.1,53,udp,379,0.000366,sip.invoxia.com,...,0.0,NOERROR,False,False,True,True,0,[46.105.38.79],[602.0],False
2,1474726000.0,CLGxUt3vdwASMqHn66,192.168.1.196,4355,192.168.1.1,53,udp,159,0.004368,pool.ntp.org,...,0.0,NOERROR,False,False,True,True,0,"[27.124.125.250, 129.250.35.251, 202.60.94.15,...","[56.0, 56.0, 56.0, 56.0]",False
3,1474726000.0,C3R8kh48F3GhWB5y4c,192.168.1.249,43171,192.168.1.1,53,udp,16295,0.021244,www.samsungsmartcam.com,...,0.0,NOERROR,False,False,True,True,0,"[www.samsungsmartcam.com.edgekey.net, e6081.b....","[60.0, 4631.0, 20.0]",False
4,1474726000.0,C3R8kh48F3GhWB5y4c,192.168.1.249,43171,192.168.1.1,53,udp,16295,0.000324,www.samsungsmartcam.com,...,0.0,NOERROR,False,False,True,True,0,"[www.samsungsmartcam.com.edgekey.net, e6081.b....","[55.0, 4626.0, 15.0]",False


In [9]:
#For illustration of Open Set Problem, we have simplifies the data set 
# The simplified data set consits of only the source IP address, the time-stamp and the DNS name queries. 
# These are saved into a test training dataset and a training dataset. 
# The ground truth of which Source IP address maps belongs to which Vendor is stored in a CSV file. 

#Let us read all the three files. We will convert the ground_df into a directory with key of Source IP address
# and the value being the Vendor 

train_df = pd.read_csv('unsw/openset_dns_train.csv')
test_df = pd.read_csv('unsw/openset_dns_test.csv')
ground_df = pd.read_csv('unsw/ground_truth.csv')
ground_dict=ground_df[['SrcIPAddress','Vendor']].set_index('SrcIPAddress')['Vendor'].to_dict()

In [12]:
# Let us define some constants so we don't have to worry about mistyping 
SRC_IP='id.orig_h'
QUERY='query'

In [16]:
#Let us get all the IP addresses from the training set, and the test set, and print their Vendors 
def print_devices(df,truth_dict):
    keys = set(df[SRC_IP].to_list())
    for key in keys:
        print(f'{key} : {truth_dict[key]}')

def get_vendors(df, truth_dict):
    keys = set(df[SRC_IP].to_list())
    return([truth_dict[key] for key in keys])


In [14]:
# Print the set of devices in the training set
print_devices(train_df, ground_dict)

192.168.1.241 : Netatmo
192.168.1.227 : TP-Link
192.168.1.249 : Samsung
192.168.1.166 : Withings
192.168.1.143 : TP-Link
192.168.1.240 : Amazon


In [15]:
# Print the set of devices in the test set
print_devices(test_df, ground_dict)

192.168.1.238 : Withings
192.168.1.168 : Google
192.168.1.120 : Triby
192.168.1.227 : TP-Link
192.168.1.112 : Netatmo
192.168.1.166 : Withings
192.168.1.177 : PixStar


In [19]:
# See which devices are in the test set but not in the data set 
train_vendors = get_vendors(train_df, ground_dict)
test_vendors = get_vendors(test_df, ground_dict)
print(f'Vendors in training data set {train_vendors}')
print(f'Vendors in testing data set {train_vendors}')
print(f'Vendors in test data not in training data set: {[x for x in test_vendors if x not in train_vendors ]}')
print(f'Vendors in common training data set: {[x for x in test_vendors if x  in train_vendors ]}')

Vendors in training data set ['Netatmo', 'TP-Link', 'Samsung', 'Withings', 'TP-Link', 'Amazon']
Vendors in testing data set ['Netatmo', 'TP-Link', 'Samsung', 'Withings', 'TP-Link', 'Amazon']
Vendors in test data not in training data set: ['Google', 'Triby', 'PixStar']
Vendors in common training data set: ['Withings', 'TP-Link', 'Netatmo', 'Withings']


In [21]:
# Now we will create a pipeline to train on DNS names, 
# This pipeline uses TFIDF Vectorizer over a document of DNS names to predict the vendors 

#The get_XY converts all DNS queries made by the device into a document containing the names queries

def get_XY(df, ground_truth):
    doc_df = df.groupby(['id.orig_h'])['query'].apply(list).reset_index(name='X') 
    queries = doc_df['X'].to_list()
    X = [','.join(x) for x in queries]
    keys = doc_df['id.orig_h'].to_list()
    Y = [ground_truth[x] for x in keys]
    return X,Y

from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.tree as skt 
from sklearn.pipeline import Pipeline

def create_pipeline(classifier): 
    myfunctions = [('PreProcess', TfidfVectorizer()), ('Classifier', classifier )]
    pipeline = Pipeline(myfunctions)
    return pipeline

pipeline = create_pipeline(skt.DecisionTreeClassifier())

In [22]:
# Now train the pipeline on testing data set and see how it performs: 
train_X,train_Y=get_XY(train_df, ground_dict)
pipeline.fit(train_X,train_Y)

test_X, test_Y = get_XY(test_df, ground_dict) 
pred = pipeline.predict(test_X)
print(pred)
print(test_Y)

['Withings' 'Withings' 'Withings' 'Withings' 'Withings' 'TP-Link'
 'Withings']
['Netatmo', 'Triby', 'Withings', 'Google', 'PixStar', 'TP-Link', 'Withings']


In [None]:
# As you can see, the prediction for Vendors in test data not in training data set: ['Google', 'Triby', 'PixStar']
# is not very good. 