# IoT Project

In [253]:
import numpy as np
import pandas as pd
from collections import defaultdict
from IPy import IP as IPy
import pprint
from sklearn import preprocessing
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import warnings
import csv

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

from dask.distributed import Client, progress
import dask.dataframe as dd
# Connect to the Dask scheduler at this address; the dataframe work in
# preprocessDF (dd.from_pandas / .compute()) is run through Dask.
client = Client('127.0.0.1:8786')
# Bare expression so the notebook renders the client summary as cell output.
client

0,1
Client  Scheduler: tcp://127.0.0.1:8786  Dashboard: http://127.0.0.1:8787,Cluster  Workers: 4  Cores: 16  Memory: 68.72 GB


#### Internet IP address <-> Organization mapping

In [196]:
def _ip_to_float(address):
    """Return the decimal value of an IP address string as a float."""
    return float(IPy(address).strDec())


# IP-range -> autonomous-system table: one row per [start, end] address range.
ASN_df = pd.read_csv("./ip2asn-combined.tsv", sep='\t', header=None)
ASN_df.columns = ['start', 'end', 'asn', 'country', 'organization']

# Keep only ranges that belong to an organization ("Not routed" rows are dropped).
is_routed = ASN_df['organization'] != 'Not routed'
ASN_df = ASN_df[is_routed]

# Numeric copies of both range bounds so that the organization search in
# preprocessDF can use plain <= / >= comparisons.
# NOTE(review): values are stored as float — confirm the precision is sufficient
# if the table contains IPv6 ranges.
for bound in ('start', 'end'):
    ASN_df[bound + '.dec'] = ASN_df[bound].apply(_ip_to_float)

## Device labeling  

Creates the device labels that are used later to build the supervised training and test sets.

In [198]:
# Fields tshark exports for every packet, grouped by protocol layer.
# (For the ethernet layer, eth.src_resolved was tried as well; the raw
# eth.src / eth.dst addresses plus eth.dst_resolved are exported instead.)
_tshark_fields = [
    # baseline frame metadata
    "frame.number", "frame.time", "frame.len", "frame.protocols",
    # ethernet layer
    "eth.src", "eth.dst", "eth.dst_resolved",
    # IP / TCP / UDP layers
    "ip.src", "ip.dst", "tcp.srcport", "tcp.dstport", "udp.srcport", "udp.dstport",
    # DNS / mDNS layer
    "dns.qry.name", "dns.resp.name", "dns.cname", "dns.a",
    # HTTP layer
    "http.request.method", "http.request.uri", "http.user_agent", "http.host",
    # SSL certificate layer
    "x509sat.printableString", "x509sat.uTF8String",
]

# Output as tab-separated fields with a header row, one "-e <field>" per column.
tshark_cmd = " ".join(
    ["-T fields", "-E header=y"] + ["-e " + field for field in _tshark_fields]
)

In [199]:
#!tshark -tud -N m -r ./packet_capture.pcap {tshark_cmd} > daghan.csv
#!tshark -tud -N m -r ./IoT_Trafﬁc_UNSW_Sydney/train_large_2.pcap {tshark_cmd} > packets_train_large_2.csv

## Load the packet data

In [308]:
# Tab-separated packet export to analyse.
# Other exports used during development:
#   ./packets_train_large_2.csv
#   ./daghan.csv
packets_path = "./packets_train_1M.csv"
df = pd.read_csv(packets_path, sep='\t')

## Load known devices labels

In [309]:
# OUI table: preprocessDF joins its 'Assignment' column against the first three
# MAC octets to obtain the 'Organization Name' of each device.
OUI_df = pd.read_csv('oui.csv')

# Labelled device inventory (alternative inventory: 'known_devices.csv').
known_devices = pd.read_csv('sydney_devices.csv')

# Strip surrounding whitespace FIRST: both the OUI prefix derived below and the
# later join on 'MAC address' need the clean value. (Deriving the OUI before
# stripping would bake a leading space into the prefix.)
known_devices['MAC address'] = known_devices['MAC address'].apply(lambda x: x.strip())

# First three octets, upper-cased and without ':' separators.
known_devices['OUI'] = known_devices['MAC address'].apply(lambda x: ''.join(x.upper().split(':')[0:3]))

# Single label column "<Manufacturer> <Device Type>" used as the class name.
known_devices['Manufacturer Device Type'] = ([' '.join(row) for row in
                        zip(known_devices["Manufacturer"], known_devices["Device Type"])])

In [310]:
# NOTE(review): this cell repeats the packet load from the "Load the packet data"
# section verbatim; running it re-reads the same file into `df`.
#df = pd.read_csv("./packets_train_large_2.csv", sep='\t')
df = pd.read_csv("./packets_train_1M.csv", sep='\t')
#df = pd.read_csv("./daghan.csv", sep='\t')
#df = df.append(df2)

In [311]:
def determineZone(ip):
    """
    Classify an IP address field by address type.

    `ip` may hold several comma-separated addresses; only the first one
    is classified. The result is whatever IPy's ``iptype()`` reports for
    it (the rest of this notebook compares it against 'PRIVATE' and
    'PUBLIC'). Missing values are handed back unchanged.
    """
    if pd.isna(ip):
        return ip

    first_address = ip.split(',')[0]
    return IPy(first_address).iptype()

In [312]:
# Columns concatenated into the free-text part of DNS/mDNS and HTTP sentences.
_DNS_FIELDS = ('dns.qry.name', 'dns.resp.name', 'dns.cname', 'dns.a')
_HTTP_FIELDS = ('http.request.method', 'http.request.uri', 'http.user_agent', 'http.host')


def _transport_endpoints(line, transport):
    """Return (src, srcport, dst, dstport) text for a 'udp' or 'tcp' packet.

    Private endpoints are described as "device: <org> identifier: <tail>",
    every other source and PUBLIC destinations as "service: <org>".
    Raises ValueError when a port column is missing (NaN).
    """
    if line['zone.src'] == 'PRIVATE':
        src = " ".join(['device:', line['org.src'], 'identifier:', str(line['tail.src'])])
    else:
        src = " ".join(['service:', line['org.src']])
    srcport = 'port: ' + str(int(line[transport + '.srcport']))

    if pd.isna(line['org.dst']):
        dst = ""
    elif line['zone.dst'] == 'PRIVATE':
        dst = " ".join(['device:', line['org.dst'], 'identifier:', str(line['tail.dst'])])
    elif line['zone.dst'] == 'PUBLIC':
        # the destination service is named after the destination's organization
        dst = " ".join(['service:', line['org.dst']])
    else:
        dst = ''
    dstport = 'port: ' + str(int(line[transport + '.dstport']))

    return src, srcport, dst, dstport


def _payload_text(line, fields):
    """Concatenate the non-missing `fields` of `line`.

    Every field except the last one is followed by a single space, so the
    result may end in a space when the last field is missing.
    """
    *leading, last = fields
    text = "".join(line[name] + " " for name in leading if pd.notna(line[name]))
    if pd.notna(line[last]):
        text += line[last]
    return text


def protoLang(line):
    """Turn one pre-processed packet row into a lower-case "sentence".

    Parameters
    ----------
    line : mapping (e.g. a DataFrame row)
        Must provide 'protocol', 'frame.len', 'frame.protocols', the
        'org.*', 'tail.*' and 'zone.*' columns built by preprocessDF, the
        UDP/TCP port columns, and the DNS / HTTP / x509 columns.

    Returns
    -------
    str or None
        The sentence describing the packet; "" for ARP packets (they are
        deliberately ignored); the lower-cased 'frame.protocols' value when
        no base layer is recognised; None when a ValueError is raised while
        building the sentence (e.g. a missing port number).
    """
    proto = line['protocol']
    size = "size: " + str(line['frame.len'])
    protocols = line['frame.protocols']

    try:

        ########################################
        ### Extract base line protocol       ###
        ########################################

        # Exactly ONE of the branches below may run. (If the transport checks
        # were independent `if`s, a UDP packet would fall through to the
        # generic IP branch and lose its labelled src/dst description.)
        # TCP is tested first so that a frame listing both transports is
        # described by its TCP ports.

        # TCP based?
        if protocols.find('tcp') > 0:
            src, srcport, dst, dstport = _transport_endpoints(line, 'tcp')

        # UDP based?
        elif protocols.find('udp') > 0:
            src, srcport, dst, dstport = _transport_endpoints(line, 'udp')

        ## ICMPv6
        ## TODO: There is more information to be extracted here
        elif proto == 'icmpv6':
            src = " ".join([line['org.src'], line['tail.src']])
            return " ".join(['device', src, proto, size]).lower()

        # IP (L3) that is neither of those
        elif protocols.find('ip') > 0:
            if line['zone.src'] == 'PRIVATE':
                src = " ".join(['device', line['org.src'], line['tail.src']])
            else:
                src = " ".join(['service', line['org.src']])

            if pd.isna(line['org.dst']):
                dst = ""
            elif line['zone.dst'] == 'PRIVATE':
                dst = " ".join(['device', line['org.dst'], line['tail.dst']])
            elif line['zone.dst'] == 'PUBLIC':
                dst = " ".join(['service', line['org.dst']])
            else:
                dst = ''

        # L2
        elif protocols.find('ethertype') > 0:
            src = " ".join(['device', line['org.src'], line['tail.src']])
            dst = " ".join(['device', line['org.dst'], line['tail.dst']]) if pd.notna(line['org.dst']) else ''

        # this should very rarely happen, if at all!
        else:
            print("undetected protocol(1): {}".format(line))
            return protocols.lower()

        ########################################
        ### Extract higher layer protocol    ###
        ### and additional meta data         ###
        ########################################

        # x509ce (certificate exchange)
        if protocols.find('x509') > 0:
            proto = "protocol: " + proto
            x509ce_message = ''
            if pd.notna(line['x509sat.printableString']):
                x509ce_message += line['x509sat.printableString']
            if pd.notna(line['x509sat.uTF8String']):
                x509ce_message += line['x509sat.uTF8String']
            return " ".join([src, srcport, proto, size, dst, dstport, x509ce_message]).lower()

        ## ssl
        elif proto == 'ssl':
            proto = "protocol: " + proto
            return " ".join([src, srcport, proto, size, dst, dstport]).lower()

        ## tcp:data
        elif protocols.find('tcp:data') > 0:
            proto = "protocol: tcp:data"
            return " ".join([src, srcport, proto, size, dst, dstport]).lower()

        ## tcp:stun
        ## TODO: stun.att.software "stun.att.realm": "belkin.org",
        elif protocols.find('tcp:stun') > 0:
            proto = 'protocol: tcp:stun'
            return " ".join([src, srcport, proto, size, dst, dstport]).lower()

        ## udp:data
        ## TODO: Detect broadcasts
        elif protocols.find('udp:data') > 0:
            proto = 'protocol: udp:data'
            return " ".join([src, srcport, proto, size, dst, dstport]).lower()

        ## udp:nbns (netbios)
        ## TODO: nbns.name": "MACBOOKAIR-7040<00> (Workstation\/Redirector)",
        elif protocols.find('udp:nbns') > 0:
            proto = 'protocol: udp:nbns'
            return " ".join([src, srcport, proto, size]).lower()

        ## udp:bootp (dhcp)
        ## TODO: "bootp.option.hostname": "amazon-c4475da2a"
        ## TODO: "bootp.type": "2" (1 is request, 2 is reply)
        ## if it is a reply, add the dst IP address too
        elif protocols.find('udp:bootp') > 0:
            proto = 'protocol: udp:bootp'
            return " ".join([src, srcport, proto, size]).lower()

        ## udp:gquic
        ## TODO: "gquic.tag.sni": "0.docs.google.com"
        ## TODO: "gquic.tag.uaid": "Chrome\/65.0.3325.181 Intel Mac OS X 10_13_3"
        ## TODO: "gquic.tag": "CHLO" (client hello)
        elif protocols.find('udp:quic') > 0:
            proto = 'protocol: udp:gquic'
            return " ".join([src, srcport, proto, size, dst, dstport]).lower()

        ## ssdp (simple service discovery protocol)
        ## TODO: http.server: "Linux UPnP\/1.0 Sonos\/41.3-50131 (ZPS12)
        ## TODO: http.unknown_header: "HOUSEHOLD.SMARTSPEAKER.AUDIO: Sonos_hOcMvZ0JBvDVZz7BXZc5ILQAT5.Cd7MOjIUy3HWHWEXItIZ\\r\\n",
        ## TODO: http.request.full_uri: "http:\/\/239.255.255.250:1900*",
        elif proto == 'ssdp':
            proto = "protocol: " + proto
            return " ".join([src, srcport, proto, size, dst, dstport]).lower()

        ## db-lsp-disc:json (Dropbox Lan sync Discovery Protocol)
        ## TODO: Detect and use Broadcast
        ## TODO: eth.addr_resolved": "Broadcast"
        elif protocols.find('db-lsp-disc:json') > 0:
            proto = 'protocol: db-lsp-disc'
            return " ".join([src, srcport, proto, size, dstport]).lower()

        ## ntp (network time protocol)
        elif proto == 'ntp':
            proto = "protocol: " + proto
            return " ".join([src, srcport, proto, size, dst, dstport]).lower()

        ## STP (spanning tree protocol)
        elif proto == 'stp':
            proto = "protocol: " + proto
            return " ".join([src, proto, size]).lower()

        ## mdns
        elif proto == 'mdns':
            proto = "protocol: " + proto
            mdns_string = _payload_text(line, _DNS_FIELDS)
            return " ".join([src, srcport, proto, size, mdns_string]).lower()

        ## igmp
        ## TODO: look into IGMP
        elif proto == 'igmp':
            proto = "protocol: " + proto
            return " ".join([src, proto, dst]).lower()

        ## dns
        elif proto == 'dns':
            proto = "protocol: " + proto
            dns_string = _payload_text(line, _DNS_FIELDS)
            return " ".join([src, srcport, proto, size, dst, dstport, dns_string]).lower()

        ## http
        elif proto == 'http':
            proto = "protocol: " + proto
            payload = _payload_text(line, _HTTP_FIELDS)
            return " ".join([src, srcport, proto, size, dst, dstport, payload]).lower()

        ## http:data
        ## TODO: add http.file_data content
        elif protocols.find('http:data') > 0:
            proto = 'protocol: http:data'
            payload = _payload_text(line, _HTTP_FIELDS)
            return " ".join([src, srcport, proto, size, dst, dstport, payload]).lower()

        ## http:media
        ## TODO: add  http.content_type , http.content_length
        elif protocols.find('http:media') > 0:
            proto = 'protocol: http:media'
            return " ".join([src, srcport, proto, size, dst, dstport]).lower()

        ## icmp:data
        ## TODO: Consider adding icmp.type to the frame / protocol language
        ## TODO: Detect broadcast
        elif protocols.find('icmp:data') > 0:
            proto = 'protocol: icmp:data'
            return " ".join([src, proto, size, dst]).lower()

        ## arp: deliberately produces an empty sentence (filtered out later)
        elif proto == 'arp':
            return ""

        ## ethertype:data
        elif protocols.find('ethertype:data') > 0:
            proto = 'protocol: ethertype:data'
            return " ".join([src, proto, size]).lower()

        # undetected protocols
        else:
            proto = "protocol: " + proto
            # is it based on UDP or TCP? (then the port texts are available)
            if protocols.find('udp') > 0 or protocols.find('tcp') > 0:
                return " ".join([src, srcport, proto, size, dst, dstport]).lower()
            elif protocols.find('ip') > 0:
                return " ".join([src, proto, size, dst]).lower()
            elif protocols.find('ethertype') > 0:
                return " ".join([src, proto, size, dst]).lower()
            else:
                print("undetected protocol(2): {}".format(str(line)))
                return " ".join([src, proto, size, dst]).lower()

    except ValueError:
        # Raised e.g. by int() on a missing (NaN) port number: report the
        # offending row and yield no sentence for it.
        print('Exception!!')
        print(line)
        return None

In [313]:
def preprocessDF(df):
    """Turn raw packet rows into per-device "sentences".

    Parameters
    ----------
    df : pandas.DataFrame
        Packet export with (at least) the columns 'eth.src', 'eth.dst',
        'eth.dst_resolved', 'ip.src', 'ip.dst', 'frame.protocols' and the
        columns read by protoLang. NOTE: the four 'oui.*' / 'tail.*' columns
        are added to this frame in place before it is converted to Dask.

    Returns
    -------
    dask DataFrame
        Two columns, 'eth.src' (MAC address of the local device the packet is
        attributed to) and 'sentence' (text produced by protoLang). Rows whose
        sentence is the empty string are left out.

    Relies on the notebook-level names OUI_df, ASN_df, determineZone and
    protoLang.
    """
    # Split every MAC address into vendor prefix (first 3 octets) and tail.
    df['oui.src'] = df['eth.src'].apply(lambda x: ''.join(x.upper().split(':')[0:3]))
    df['tail.src'] = df['eth.src'].apply(lambda x: ''.join(x.upper().split(':')[3:6]))
    df['oui.dst'] = df['eth.dst'].apply(lambda x: ''.join(x.upper().split(':')[0:3]))
    df['tail.dst'] = df['eth.dst'].apply(lambda x: ''.join(x.upper().split(':')[3:6]))

    # create dask frame
    df = dd.from_pandas(df, npartitions=8)

    # add OUI (vendor name) for source and destination
    oui_lookup = OUI_df[['Assignment', 'Organization Name']]
    df = df.merge(oui_lookup, left_on='oui.src', right_on='Assignment',
                  how='left').drop(['Assignment'], axis=1)
    df = df.rename(columns={'Organization Name': 'org.src'})

    df = df.merge(oui_lookup, left_on='oui.dst', right_on='Assignment',
                  how='left').drop(['Assignment'], axis=1)
    df = df.rename(columns={'Organization Name': 'org.dst'})

    # let's get rid of white spaces (and of '.' / ',') in the vendor names
    df['org.src'] = df['org.src'].apply(lambda x: x.replace('.','').replace(',','').replace(" ", "-") if pd.notna(x) else x, meta = str)
    df['org.dst'] = df['org.dst'].apply(lambda x: x.replace('.','').replace(',','').replace(" ", "-") if pd.notna(x) else x, meta = str)

    # clean up org and tail for dst columns: when the destination vendor is
    # unknown, fall back to the resolved destination name as identifier
    df['org.dst'] = df['org.dst'].fillna('')
    df['tail.dst'] = df[['org.dst','tail.dst','eth.dst_resolved']].apply(
        lambda row: row['tail.dst'] if (row['org.dst'] != '') else row['eth.dst_resolved'],
        axis = 1, meta = str)

    # adding private (local) and public (internet) traffic zones
    df['zone.src'] = df['ip.src'].apply(determineZone, meta = str)
    df['zone.dst'] = df['ip.dst'].apply(determineZone, meta = str)

    # clean up IP addresses that have multiple source or dest values
    df['ip.src'] = df['ip.src'].apply(lambda x: x.split(',')[0] if pd.notna(x) else x, meta = str)
    df['ip.dst'] = df['ip.dst'].apply(lambda x: x.split(',')[0] if pd.notna(x) else x, meta = str)

    # the communication protocol column (last element of the protocol stack)
    df['protocol'] = df['frame.protocols'].apply(lambda x: x.split(':')[-1], meta = str)

    # create the set of public IP addresses that we need to resolve:
    # materialise only the four relevant columns once and filter them with
    # pandas instead of looping over every row of the Dask frame in Python
    addresses = df[['zone.src', 'ip.src', 'zone.dst', 'ip.dst']].compute()
    public_IPs = set(addresses.loc[addresses['zone.src'] == "PUBLIC", 'ip.src'])
    public_IPs |= set(addresses.loc[addresses['zone.dst'] == "PUBLIC", 'ip.dst'])

    # let's look up these public IP addresses in our db and keep the first
    # word of the owning organization as the service name
    public_IPs_orgs = {}
    for ip in public_IPs:
        try:
            ip_dec = float(IPy(ip).strDec())
            matches = ASN_df[(ASN_df['start.dec'] <= ip_dec) & (ASN_df['end.dec'] >= ip_dec)]
            public_IPs_orgs[ip] = matches['organization'].values[0].split(' ')[0]
        except Exception:
            # unparsable address, no matching range or unusable organization
            public_IPs_orgs[ip] = 'UNLISTED'

    # let's replace org data for public IPs based on the ASN information
    df['org.src'] = df[['zone.src','ip.src','org.src']].apply(lambda x: ( public_IPs_orgs[x['ip.src']] if x['zone.src'] == "PUBLIC" else x['org.src']), axis=1, meta = str)
    df['org.dst'] = df[['zone.dst','ip.dst','org.dst']].apply(lambda x: ( public_IPs_orgs[x['ip.dst']] if x['zone.dst'] == "PUBLIC" else x['org.dst']), axis=1, meta = str)

    # let's create a sentence for each packet
    df['sentence'] = df.apply(protoLang, axis=1, meta = str)

    # Aggregating these sentences for each device is non-trivial.
    # More can be found here:
    # https://docs.google.com/spreadsheets/d/1UCClhmFm9nZc6VUr5XV_rMtQ90C3vgFb4nJ6DG2iF_g

    # Or just read on!

    # Source   Destination
    # Private  Private   => L3: Local device to local device (this one is actually very tricky, refer to the link above)
    # Private  Public    => L3: Local device to Internet
    # Public   Private   => L3: Internet to local device, register for the eth.dst
    # Public   Public    => Shouldn't be possible, ignore
    # NaN      whatever  => If the source device has no public or private zone (therefore NaN), that means it doesn't have
    #                       an IP address, which means it is almost certainly layer 2 (ethernet traffic)
    #                       L2: Local device to local device, register for eth.src

    # 3 aggregations
    # 1- source = Private (local to local and local to Internet, register for eth.src)
    # 2- source = Public  (Internet to local, register for the eth.dst)
    # 3- source = NA  (L2, the local device can be the receiver or sender, register for eth.src)

    has_sentence = df['sentence'] != ''
    df_agg1 = df[(df['zone.src'] == 'PRIVATE') & has_sentence][['eth.src','sentence']]
    df_agg2 = df[(df['zone.src'] == 'PUBLIC')  & has_sentence][['eth.dst','sentence']]
    # inbound traffic is attributed to the receiving (local) device
    df_agg2.columns = ['eth.src','sentence']
    df_agg3 = df[(df['zone.src'] != 'PRIVATE') & (df['zone.src'] != 'PUBLIC') & has_sentence][['eth.src','sentence']]

    final_agg = df_agg1.append(df_agg2)
    return final_agg.append(df_agg3)




In [None]:
# Build the (eth.src, sentence) frame from the loaded packets; preprocessDF
# returns a Dask DataFrame, so later cells call .compute() to materialise it.
final_agg_df = preprocessDF(df)

In [None]:
# Materialise the Dask frame and save the sentences as an intermediate CSV.
final_agg_df.compute().to_csv('step2.csv')

In [None]:
# Sanity check: number of sentence rows, then a preview of the first rows.
print(len(final_agg_df))
final_agg_df.head()

In [None]:
# Attach the ground-truth label to every sentence. The inner join keeps only
# packets whose MAC address appears in the known-device inventory; the join
# key itself is dropped afterwards.
device_labels = known_devices[['MAC address', 'Manufacturer Device Type']]
final_agg_df = final_agg_df.merge(
    device_labels,
    left_on='eth.src',
    right_on='MAC address',
    how='inner',
).drop(['MAC address'], axis=1)

In [None]:
# Row count and preview after the join with the known devices.
print(len(final_agg_df))
final_agg_df.head()

In [None]:
# Materialise the labelled frame and save it as CSV.
final_agg_df.compute().to_csv("syndey_test_full2.csv")

In [None]:
# Number of sentences per device class.
final_agg_df['Manufacturer Device Type'].compute().value_counts()

In [None]:
# Materialise only the two columns needed for training as a pandas frame:
# the class name and the packet sentence.
total_df = final_agg_df[['Manufacturer Device Type', 'sentence']].compute()
#total_df.columns = ['label', 'sentence']

In [None]:
# Encode every 'Manufacturer Device Type' string as an integer class id;
# `le` is kept so that ids can be mapped back with le.inverse_transform.
le = preprocessing.LabelEncoder()
total_df['label'] = le.fit_transform(total_df['Manufacturer Device Type'])
total_df = total_df.drop(['Manufacturer Device Type'], axis=1)

# fastText is invoked below with `-label __label__`, so it only recognises
# tokens carrying that prefix as labels. The later cells also expect prefixed
# string labels (comparison with '__label__2', stripping of '__label__',
# string concatenation of the label column).
total_df['label'] = total_df['label'].apply(lambda x: '__label__' + str(x))

In [None]:
# Show the rows of one class.
# NOTE(review): this matches only when 'label' holds fastText-style strings
# such as '__label__2'; with plain integer ids the result is empty.
total_df[total_df['label'] == '__label__2']

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split. The seed is fixed so that the exported train/test files, and the
# accuracy measured on them, are reproducible across runs.
train, test = train_test_split(total_df, test_size=0.2, random_state=42)

In [None]:
# Write both splits as space-separated text without header or index, one
# example per line, for the fastText command-line tool.
for split_frame, out_path in ((train, 'fast_train7.txt'), (test, 'fast_test7.txt')):
    split_frame.to_csv(out_path, header=None, index=None, sep=' ')

In [None]:
# Shell: print the first training example to check the file format.
!head -n 1 fast_train7.txt

In [None]:
# Shell: train a supervised fastText classifier on the training file; tokens
# starting with `__label__` are treated as labels, output files use the
# prefix `model7`.
!../fastText/fasttext supervised -input fast_train7.txt -output model7 -label  __label__

In [None]:
# Shell: evaluate the trained model on the held-out test file.
!../fastText/fasttext test model7.bin fast_test7.txt

In [None]:
# Shell: print the first test example to check the file format.
!head -n 1 fast_test7.txt

In [None]:
# Read the exported test file back; without a header the columns are named 0, 1, ...
label_df = pd.read_csv('fast_test7.txt', header = None, sep=' ')

In [None]:
# Preview the re-loaded test file.
label_df.head()

In [None]:
# NOTE(review): defaultdict is already imported in the first cell and is not
# used in this cell.
from collections import defaultdict
#label_df['check'] = test[0]
# Copy the labels of the in-memory test split next to the re-loaded file rows
# (positional assignment via .values, so both must be in the same row order).
label_df['check'] = test['label'].values

In [None]:
# Number of examples in the test split.
len(test)

In [None]:
# Remove the fastText '__label__' prefix wherever it occurs (regex replace on
# the whole frame), leaving only the class id text.
label_df = label_df.replace({'__label__':''}, regex=True)

In [None]:
# Preview after the prefix has been stripped.
label_df.head()

In [None]:
# 'conf' is the string "<check> <column 0>".
# NOTE(review): presumably meant as a "true predicted" label pair for a
# confusion count — confirm that column 0 holds a label at this point.
label_df['conf'] = label_df['check'] + ' ' + label_df[0]

In [None]:
# Preview the combined 'conf' strings.
label_df['conf'].head()

In [None]:
# Frequency of every distinct 'conf' string.
label_df['conf'].value_counts()

In [306]:
# First rows whose 'conf' string is exactly '26 26'.
label_df[label_df['conf'] == '26 26'].head()

Unnamed: 0,0,1,check,conf


In [165]:
# NOTE(review): reads column 1 of `test`; the `test` frame produced by
# train_test_split above has the named columns 'sentence' and 'label', so this
# cell appears to come from an earlier notebook state — confirm before re-running.
test[1].iloc[167]

'Belkin-International-Inc 832811 49153 tcp 66 Belkin-International-Inc 79F489 3256'

In [166]:
# NOTE(review): reads column 1 of `test`, which the `test` frame produced by
# train_test_split above does not have (its columns are 'sentence' and 'label').
test[1].iloc[520]

'Belkin-International-Inc 832811 49153 tcp 66 Belkin-International-Inc 79F489 4941'

In [168]:
# NOTE(review): reads column 1 of `test`, which the `test` frame produced by
# train_test_split above does not have (its columns are 'sentence' and 'label').
test[1].iloc[1]

'Belkin-International-Inc 832811 4800 tcp 74 Belkin-International-Inc 79F489 49153'

In [170]:
# NOTE(review): reads column 1 of `test`, which the `test` frame produced by
# train_test_split above does not have (its columns are 'sentence' and 'label').
test[1].iloc[23]

'Belkin-International-Inc 79F489 49153 tcp 66 Belkin-International-Inc 832811 3555'