# IoT Project

In [75]:
import numpy as np
import pandas as pd
from collections import defaultdict
from scapy.all import *
from IPy import IP as IPy
import pprint
from sklearn import preprocessing
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt

from dask.distributed import Client, progress
import dask.dataframe as dd
#import dask.bag as db
client = Client('127.0.0.1:8786')
client

0,1
Client  Scheduler: tcp://127.0.0.1:8786  Dashboard: http://127.0.0.1:8787,Cluster  Workers: 4  Cores: 16  Memory: 68.72 GB


#### Internet IP address <-> Organization mapping

In [13]:
ASN_df = pd.read_csv("./ip2asn-combined.tsv", sep='\t', header=None)
ASN_df.columns = ['start','end','asn','country','organization']

# remove all the "Not routed" rows
ASN_df = ASN_df[ASN_df['organization'] != 'Not routed']

# add numerical representation for the start and end IP range for faster org search later
ASN_df['start.dec'] = ASN_df['start'].apply(lambda x: float(IPy(x).strDec()))
ASN_df['end.dec'] = ASN_df['end'].apply(lambda x: float(IPy(x).strDec()))

## Device labeling  
The packet traces we are using come from my home network. I know what these devices are. I am going to import the device information for these devices and merge it with the network traffic dataframe we are using.

In [14]:
OUI_df = pd.read_csv('oui.csv')
known_devices = pd.read_csv('sydney_devices.csv')
known_devices['OUI'] = known_devices['MAC address'].apply(lambda x: ''.join(x.upper().split(':')[0:3]))
known_devices['Manufacturer Device Type'] = ([' '.join(row) for row in 
                        zip(known_devices["Manufacturer"],known_devices["Device Type"])])
known_devices['MAC address'] = known_devices['MAC address'].apply(lambda x: x.strip())

In [15]:
# baseline 
tshark_cmd = "-T fields -E header=y -e frame.number -e frame.time -e frame.len -e frame.protocols"

# ethernet layer
#tshark_cmd += " -e eth.src_resolved -e eth.dst_resolved"
tshark_cmd += " -e eth.src -e eth.dst -e eth.dst_resolved"

#add IP/TCP/UDP/ICMP layers
tshark_cmd += " -e ip.src -e ip.dst -e tcp.srcport -e tcp.dstport -e udp.srcport -e udp.dstport"

# add DSN / mDNS layer
tshark_cmd += " -e dns.qry.name -e dns.resp.name -e dns.cname -e dns.a"

# add HTTP layer
tshark_cmd += " -e http.request.method -e http.request.uri -e http.user_agent -e http.host"

# add SSL certificate layer
tshark_cmd += " -e x509sat.printableString -e x509sat.uTF8String"

In [16]:
#!tshark -tud -N m -r ./packet_capture.pcap {tshark_cmd} > daghan.csv
#!tshark -tud -N m -r ./IoT_Trafﬁc_UNSW_Sydney/train_large_2.pcap {tshark_cmd} > packets_train_large_2.csv

In [17]:
def determineZone(ip):
    """
    This function determines if the IP 
    address is internal or public 
    according to RFC1918
    """    
    if pd.notna(ip):
        return IPy(ip.split(',')[0]).iptype()
    else:
        return ip

In [108]:
def protoLang(line):
    proto = line['protocol'] 
    size = "size: " + str(line['frame.len'])
    message = ""
    
    try:
        
        ########################################
        ### Extract base line protocol       ###
        ########################################
        
        #UDP based?
        if line['frame.protocols'].find('udp') > 0:     
            src =  " ".join(['device:',line['org.src'], 'identifier:', str(line['tail.src'])]) if line['zone.src'] == 'PRIVATE' \
                else " ".join(['service:',line['org.src']])
            srcport = 'port: ' + str(int(line['udp.srcport']))
            dst =  "" if pd.isna(line['org.dst']) \
                    else (" ".join(['device:', line['org.dst'],'identifier:', str(line['tail.dst'])]) if line['zone.dst'] == 'PRIVATE' \
                            else (" ".join(['service:',line['org.src']]) if line['zone.dst'] == 'PUBLIC' else ''))
            dstport = 'port: ' + str(int(line['udp.dstport']))
        
        #UDP based?
        if line['frame.protocols'].find('tcp') > 0:     
            src =  " ".join(['device:',line['org.src'], 'identifier:', str(line['tail.src'])]) if line['zone.src'] == 'PRIVATE' \
                else " ".join(['service:',line['org.src']])
            srcport = 'port: ' + str(int(line['tcp.srcport']))
            dst =  "" if pd.isna(line['org.dst']) \
                    else (" ".join(['device:', line['org.dst'],'identifier:', str(line['tail.dst'])]) if line['zone.dst'] == 'PRIVATE' \
                            else (" ".join(['service:',line['org.src']]) if line['zone.dst'] == 'PUBLIC' else ''))
            dstport = 'port: ' + str(int(line['tcp.dstport']))
        
        ## ICMPv6
        ## TODO: There is more information to be extracted here
        elif proto == 'icmpv6':
            src =  " ".join([line['org.src'],line['tail.src']]) 
            return " ".join(['device',src,proto,size]).lower()
        
        
        #IP (L3) that is neither of those
        elif line['frame.protocols'].find('ip') > 0:
            src =  " ".join(['device', line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else " ".join(['service', line['org.src']])
            dst =  "" if pd.isna(line['org.dst']) \
                    else (" ".join(['device', line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                            else (" ".join(['service', line['org.src']]) if line['zone.dst'] == 'PUBLIC' else ''))
        
        #L2
        elif line['frame.protocols'].find('ethertype') > 0:
            src =  " ".join(['device',line['org.src'],line['tail.src']]) 
            dst =  " ".join(['device', line['org.dst'],line['tail.dst']]) if pd.notna(line['org.dst']) else ''
        
        #this should very rarely happen, if at all!
        else:
            print("undetected protocol(1): {}".format(line))
            return line['frame.protocols'].lower()
        
        
        
        ########################################
        ### Extract higher layer protocol    ###
        ### and additional meta data         ###
        ########################################        
         # x509ce (certificate exchange)
        if line['frame.protocols'].find('x509') > 0:
            proto = "protocol: " + proto
            x509ce_message = ''
            if pd.notna(line['x509sat.printableString']):
                x509ce_message += line['x509sat.printableString']
            if pd.notna(line['x509sat.uTF8String']):
                x509ce_message += line['x509sat.uTF8String']
            return " ".join([src,srcport,proto,size,dst,dstport,x509ce_message]).lower()
        
        ## ssl 
        elif proto == 'ssl':
            proto = "protocol: " + proto
            return " ".join([src,srcport,proto,size,dst,dstport]).lower()
                
        
        ## tcp:data
        elif line['frame.protocols'].find('tcp:data') > 0:
            proto = "protocol: tcp:data"
            return " ".join([src,srcport,proto,size,dst,dstport]).lower()
        
        ## tcp:stun
        ## TODO: stun.att.software "stun.att.realm": "belkin.org",
        elif line['frame.protocols'].find('tcp:stun') > 0:
            proto = 'protocol: tcp:stun'
            return " ".join([src,srcport,proto,size,dst,dstport]).lower()
        
        ## upd:data
        ## TODO: Detect broadcasts
        elif line['frame.protocols'].find('udp:data') > 0:
            proto = 'protocol: udp:data'
            return " ".join([src,srcport,proto,size,dst,dstport]).lower()
        
        ## udp:nbns (netbios)
        ## TODO: nbns.name": "MACBOOKAIR-7040<00> (Workstation\/Redirector)",
        elif line['frame.protocols'].find('udp:nbns') > 0:
            proto = 'protocol: udp:nbns'
            return " ".join([src,srcport,proto,size]).lower()
        
        ## upd:bootp (dhcp)
        ## TODO: "bootp.option.hostname": "amazon-c4475da2a"
        ## TODO: "bootp.type": "2" (1 is request, 2 is reply)
        ## if it is a reply, add the dst IP address too
        elif line['frame.protocols'].find('udp:bootp') > 0:
            proto = 'protocol: udp:bootp'
            return " ".join([src,srcport,proto,size]).lower()
        
        ## udP:gquic
        ## TODO: "gquic.tag.sni": "0.docs.google.com"
        ## TODO: "gquic.tag.uaid": "Chrome\/65.0.3325.181 Intel Mac OS X 10_13_3" 
        ## TODO: "gquic.tag": "CHLO" (client hello)
        elif line['frame.protocols'].find('udp:quic') > 0:
            proto = 'protocol: udp:gquic'
            return " ".join([src,srcport,proto,size,dst,dstport]).lower()
        
        ## ssdp (simple service discovery protocol)
        ## TODO: http.server: "Linux UPnP\/1.0 Sonos\/41.3-50131 (ZPS12)
        ## TODO: http.unknown_header: "HOUSEHOLD.SMARTSPEAKER.AUDIO: Sonos_hOcMvZ0JBvDVZz7BXZc5ILQAT5.Cd7MOjIUy3HWHWEXItIZ\\r\\n",
        ## TODO: http.request.full_uri: "http:\/\/239.255.255.250:1900*",
        elif proto == 'ssdp':
            proto = "protocol: " + proto
            return " ".join([src,srcport,proto,size,dst,dstport]).lower()
        
        ## db-lsp-disc:json (Dropbox Lan sync Discovery Protocol)
        ## TODO: Detect and use Broadcast
        ## TODO: eth.addr_resolved": "Broadcast"
        elif line['frame.protocols'].find('db-lsp-disc:json') > 0:
            proto = 'protocol: db-lsp-disc'
            return " ".join([src,srcport,proto,size,dstport]).lower()
        
        ## ntp (network time protocol)
        elif proto == 'ntp':
            proto = "protocol: " + proto
            return " ".join([src,srcport,proto,size,dst,dstport]).lower()
        
        ## STP (spanning tree protocol)
        elif proto == 'stp':
            proto = "protocol: " + proto
            return " ".join([src,proto,size]).lower()
        
        ## mdns
        elif proto == 'mdns':
            proto = "protocol: " + proto
            mdns_string =   (line['dns.qry.name'] + " " if pd.notna(line['dns.qry.name']) else '') + \
                            (line['dns.resp.name'] + " " if pd.notna(line['dns.resp.name']) else '') +\
                            (line['dns.cname'] + " " if pd.notna(line['dns.cname']) else '') + \
                            (line['dns.a'] if pd.notna(line['dns.a']) else '')    
            return " ".join([src,srcport,proto,size,mdns_string]).lower()
        
        ## igmp
        ##TODO: look into IGMP
        elif proto == 'igmp':
            proto = "protocol: " + proto
            return " ".join([src,proto,dst]).lower()
        
        ## dns
        elif proto == 'dns':
            proto = "protocol: " + proto
            dns_string =   (line['dns.qry.name'] + " " if pd.notna(line['dns.qry.name']) else '') + \
                            (line['dns.resp.name'] + " " if pd.notna(line['dns.resp.name']) else '') +\
                            (line['dns.cname'] + " " if pd.notna(line['dns.cname']) else '') + \
                            (line['dns.a'] if pd.notna(line['dns.a']) else '')
            return " ".join([src,srcport,proto,size,dst,dstport,dns_string]).lower()

        ## http 
        elif proto == 'http':
            proto = "protocol: " + proto
            payload = (line['http.request.method'] + " " if pd.notna(line['http.request.method']) else '') + \
                    (line['http.request.uri'] + " " if pd.notna(line['http.request.uri']) else '') + \
                    (line['http.user_agent'] + " " if pd.notna(line['http.user_agent']) else '') + \
                    (line['http.host'] if pd.notna(line['http.host']) else '')
            return " ".join([src,srcport,proto,size,dst,dstport,payload]).lower()
        
        ## http:data
        ## TODO: add http.file_data content
        elif line['frame.protocols'].find('http:data') > 0:
            proto = 'protocol: http:data'
            payload = (line['http.request.method'] + " " if pd.notna(line['http.request.method']) else '') + \
                    (line['http.request.uri'] + " " if pd.notna(line['http.request.uri']) else '') + \
                    (line['http.user_agent'] + " " if pd.notna(line['http.user_agent']) else '') + \
                    (line['http.host'] if pd.notna(line['http.host']) else '')
            return " ".join([src,srcport,proto,size,dst,dstport,payload]).lower()
        
        ## http:media
        ## TODO: add  http.content_type , http.content_length
        elif line['frame.protocols'].find('http:media') > 0:
            proto = 'protocol: http:media'
            return " ".join([src,srcport,proto,size,dst,dstport]).lower()
        
        
        ## icmp:data
        ## TODO: Consider adding icmp.type to the frame / protocol language
        ## TODO: Detect broadcast
        elif line['frame.protocols'].find('icmp:data') > 0:
            proto = 'protocol: icmp:data'
            return " ".join([src,proto,size,dst]).lower()
        
        elif proto == 'arp':
            #return " ".join([src,proto,size,dst])
            return "".lower()
        
        ## ethertype:data
        elif line['frame.protocols'].find('ethertype:data') > 0:
            proto = 'protocol: ethertype:data'
            return " ".join([src,proto,size]).lower()
        
        # undetected protocols
        else:
            #is it based on UDP or TCP?
            proto = "protocol: " + proto
            if line['frame.protocols'].find('udp') +\
                line['frame.protocols'].find('tcp') > 0:
                return " ".join([src,srcport,proto,size,dst,dstport]).lower()
            elif line['frame.protocols'].find('ip')> 0:
                return " ".join([src,proto,size,dst]).lower()
            elif line['frame.protocols'].find('ethertype')> 0:
                return " ".join([src,proto,size,dst]).lower()
            else:
                print("undetected protocol(2): {}".format(str(line)))
                return " ".join([src,proto,size,dst]).lower()
            
    except ValueError:
        print('Exception!!')
        print(line)

In [116]:
def preprocessDF(df):
    df['oui.src'] = df['eth.src'].apply(lambda x: ''.join(x.upper().split(':')[0:3]))
    df['tail.src'] = df['eth.src'].apply(lambda x: ''.join(x.upper().split(':')[3:6]))
    df['oui.dst'] = df['eth.dst'].apply(lambda x: ''.join(x.upper().split(':')[0:3]))
    df['tail.dst'] = df['eth.dst'].apply(lambda x: ''.join(x.upper().split(':')[3:6]))
    
    #create dask frame
    df = dd.from_pandas(df, npartitions=8)
    
    #add OUI
    df = df.merge(OUI_df[list(['Assignment', 'Organization Name'])], 
             left_on = 'oui.src',  right_on= 'Assignment', how = 'left').drop(['Assignment'], axis = 1)
    df = df.rename(columns={'Organization Name':'org.src'})

    df = df.merge(OUI_df[list(['Assignment', 'Organization Name'])], 
             left_on = 'oui.dst',  right_on= 'Assignment', how = 'left').drop(['Assignment'], axis = 1)
    df = df.rename(columns={'Organization Name':'org.dst'})
    
    # let's get rid of white spaces
    df['org.src'] = df['org.src'].apply(lambda x: x.replace('.','').replace(',','').replace(" ", "-") if pd.notna(x) else x, meta = str)
    df['org.dst'] = df['org.dst'].apply(lambda x: x.replace('.','').replace(',','').replace(" ", "-") if pd.notna(x) else x, meta = str)
    
    # clean up org and tail for dst columns
    df['org.dst'] = df['org.dst'].fillna('')
    df['tail.dst'] = df[['org.dst','tail.dst','eth.dst_resolved']].apply(lambda row: row['tail.dst'] if (row['org.dst'] != '') \
                                                                     else row['eth.dst_resolved'], axis = 1, meta = str)
    # adding private (local) and public (internet) traffic zones
    df['zone.src'] = df['ip.src'].apply(lambda x: determineZone(x), meta = str)
    df['zone.dst'] = df['ip.dst'].apply(lambda x: determineZone(x), meta = str)
    
    # clean up IP addresses that have multiple source or dest values
    df['ip.src'] = df['ip.src'].apply(lambda x: x.split(',')[0] if pd.notna(x) else x, meta = str)
    df['ip.dst'] = df['ip.dst'].apply(lambda x: x.split(',')[0] if pd.notna(x) else x, meta = str)
    
    # the communication protocol column
    df['protocol'] = df['frame.protocols'].apply(lambda x: x.split(':')[-1], meta = str)
    
    # create a list of public IP addresses that we need to resolve
    public_IPs = set()
    public_IPs_orgs = {}
    for index,row in df.iterrows():
        if (row['zone.src'] == "PUBLIC"):
            public_IPs.add(row['ip.src'])
        if (row['zone.dst'] == "PUBLIC"):
            public_IPs.add(row['ip.dst'])
    
    # let's look up these public IP addresses in our db and replace with the service name
    for ip in public_IPs:
        try:
            public_IPs_orgs[ip] = ASN_df[(ASN_df['start.dec'] <= float(IPy(ip).strDec())) & 
                                    (ASN_df['end.dec'] >= float(IPy(ip).strDec()))]['organization']\
                                    .values[0].split(' ')[0]
        except:
            #print(ip)
            public_IPs_orgs[ip] = 'UNLISTED'

    # let's replace org data for public IPs based on the ASN information
    df['org.src'] = df[['zone.src','ip.src','org.src']].apply(lambda x: ( public_IPs_orgs[x['ip.src']] if x['zone.src'] == "PUBLIC" else x['org.src']), axis=1, meta = str)
    df['org.dst'] = df[['zone.dst','ip.dst','org.dst']].apply(lambda x: ( public_IPs_orgs[x['ip.dst']] if x['zone.dst'] == "PUBLIC" else x['org.dst']), axis=1, meta = str)

    # let's create a sentence for each packet
    df['sentence'] = df.apply(lambda line: protoLang(line), axis=1, meta = str)
    
    # Aggregating these sentences for each device is non-trivial. 
    # read on!
    
    # Source   Destination
    # Private  Private   => L3: local to local 
    # Private  Public    => L3: local to Internet
    # Public   Private   => L3: Internet to Local
    # Public   Public    => Shouldn't be possible
    # NaN      whatever  => If the source device has no public or private zone (therefore NaN), that means it doesn't have 
    #                       an IP address, which means it is almost certainly layer 2 (ethernet traffic)

    # 3 aggregation
    # 1- source = Private (local to local and local to Internet, so to get local devices traffic aggregate by src)
    # 2- source = Public  (Internet to local, so to get local devices traffic aggregate by dst)
    # 3- source = NA  (L2, the local device can be the receiver or sender)

    # L3 rraffic from the local network to either the local network or internet
    # So aggregate the source local network device communication
    df_agg1 = df[df['zone.src'] == 'PRIVATE']
    df_agg1 = df_agg1.groupby('eth.src')['sentence'].apply(lambda x: "".join(x + ("" if len(x) == 3 else ". ")), meta = str).compute()
    df_agg1 = pd.Series.to_frame(df_agg1).reset_index()
    
    # L3 traffic from the public network to the local network (we can't observe Internet to Internet traffic )
    # So aggregate the destination local network device communication
    df_agg2 = df[df['zone.src'] == 'PUBLIC']
    df_agg2 = df_agg2.groupby('eth.dst')['sentence'].apply(lambda x: "".join(x + ("" if len(x) == 3 else ". ")), meta = str).compute()
    df_agg2 = pd.Series.to_frame(df_agg2).reset_index()
    
    # L2 traffic from the local network to the local network
    # A given IoT device can be a source or destination
    # so we have to aggregate it in both direction
    df_agg3 = df[~df['zone.src'].isin(['PUBLIC','PRIVATE'])]

    # aggregate where the IoT device is the source
    df_agg3_1 = df_agg3.groupby('eth.src')['sentence'].apply(lambda x: "".join(x + ("" if len(x) == 3 else ". ")), meta = str).compute()
    df_agg3_1 = pd.Series.to_frame(df_agg3_1).reset_index()

    # aggregate where the IoT device is the destination
    df_agg3_2 = df_agg3.groupby('eth.dst')['sentence'].apply(lambda x: "".join(x + ("" if len(x) == 3 else ". ")), meta = str).compute()
    df_agg3_2 = pd.Series.to_frame(df_agg3_2).reset_index()

    final_agg = df_agg1.merge(df_agg2, left_on='eth.src', right_on='eth.dst', how = 'left').drop('eth.dst', axis = 1)
    final_agg = final_agg.merge(df_agg3_1, left_on='eth.src', right_on='eth.src', how = 'outer')
    final_agg = final_agg.merge(df_agg3_2, left_on='eth.src', right_on='eth.dst', how = 'left').drop('eth.dst', axis = 1)
    final_agg.columns = ["eth", "sentence.L3.src", "sentence.L3.dst", "sentence.L2.src", "sentence.L2.dst"]

    final_agg['sentence'] = final_agg.apply(lambda row: \
                            row['sentence.L3.src'] if pd.notna(row['sentence.L3.src']) else "" + \
                            row['sentence.L3.dst'] if pd.notna(row['sentence.L3.dst']) else "" + \
                            row['sentence.L2.src'] if pd.notna(row['sentence.L2.src']) else "" + \
                            row['sentence.L2.dst'] if pd.notna(row['sentence.L2.dst']) else "", axis=1)

    final_agg = final_agg.drop(['sentence.L3.src','sentence.L3.dst','sentence.L2.src','sentence.L2.dst'], axis = 1)
    return final_agg

In [117]:
sydney_1M = pd.read_csv("./packets_train_1M.csv", sep='\t')
sydney_1M_preprocessed = preprocessDF(sydney_1M)

In [118]:
sydney_1M_preprocessed.iloc[0,1]

'device withings 1b6f96 port: 68 protocol: udp:bootp size: 342. device withings 1b6f96 port: 68 protocol: udp:bootp size: 342. device withings 1b6f96 port: 68 protocol: udp:bootp size: 342. device withings 1b6f96 port: 49153 protocol: dns size: 80 device tp-link-technologies-coltd 5133ea port: 53 scalews.withings.net . device: withings identifier: 1b6f96 port: 49153 protocol: tcp size: 58 service: withings port: 80. device: withings identifier: 1b6f96 port: 49153 protocol: tcp size: 54 service: withings port: 80. device: withings identifier: 1b6f96 port: 49153 protocol: urlencoded-form size: 237 service: withings port: 80. device: withings identifier: 1b6f96 port: 49153 protocol: tcp size: 54 service: withings port: 80. device: withings identifier: 1b6f96 port: 49153 protocol: urlencoded-form size: 393 service: withings port: 80. device: withings identifier: 1b6f96 port: 49153 protocol: tcp size: 54 service: withings port: 80. device: withings identifier: 1b6f96 port: 49153 protocol: t

In [119]:
known_devices = pd.read_csv('sydney_devices.csv')
known_devices['OUI'] = known_devices['MAC address'].apply(lambda x: ''.join(x.upper().split(':')[0:3]))
known_devices['Manufacturer Device Type'] = ([' '.join(row) for row in 
                        zip(known_devices["Manufacturer"],known_devices["Device Type"])])
known_devices['Manufacturer Device Type'] = known_devices['Manufacturer Device Type'].apply(lambda x: x.strip())
known_devices['MAC address'] = known_devices['MAC address'].apply(lambda x: x.strip())

In [120]:
sydney_1M_preprocessed = sydney_1M_preprocessed.merge(known_devices[['MAC address','Manufacturer Device Type']], 
                left_on='eth', right_on='MAC address', how = 'left').drop('MAC address', axis = 1)
sydney_1M_preprocessed

Unnamed: 0,eth,sentence,Manufacturer Device Type
0,00:24:e4:1b:6f:96,device withings 1b6f96 port: 68 protocol: udp:...,Withings Scale
1,08:21:ef:3b:fc:e3,device: samsung-electronics-coltd identifier: ...,Samsung Tablet
2,18:b4:30:25:be:e4,device nest-labs-inc 25bee4 port: 68 protocol:...,Nest Smoke Alarm
3,00:24:e4:11:18:a8,device: withings identifier: 1118a8 port: 3780...,Withings Baby Monitor
4,18:b7:9e:02:20:44,device: invoxia identifier: 022044 port: 40234...,Triby Speaker
5,44:65:0d:56:cc:d3,device amazon-technologies-inc 56ccd3 port: 60...,Amazon Echo
6,74:2f:68:81:69:42,device azurewave-technology-inc 816942 port: 6...,Azure Laptop
7,f4:f2:6d:93:51:f1,device: tp-link-technologies-coltd identifier:...,TP-Link Camera
8,14:cc:20:51:33:ea,device tp-link-technologies-coltd 5133ea port:...,TP-Link Router
9,30:8c:fb:2f:e4:b2,device: dropcam identifier: 2fe4b2 port: 46330...,Dropcam Camera


In [121]:
#sydney_1M_preprocessed.to_csv("sydney_1M_preprocessed.csv", header=None, index=None, sep=' ')
sydney_1M_preprocessed.columns = ['eth','sentence','label']
# prune out empty sentences
sydney_1M_preprocessed = sydney_1M_preprocessed.loc[sydney_1M_preprocessed['sentence'] != '']
# write out to a csv file for the next round
sydney_1M_preprocessed.to_csv("sydney_1M_preprocessed.csv",index=None)

In [164]:
import csv
sydney_1M_embeddings = sydney_1M_preprocessed.sentence.to_frame()
sydney_1M_embeddings.to_csv('sydney_1M_embeddings.txt', 
                            header=None, index=None, sep=' ', quoting = csv.QUOTE_NONE, escapechar=' ')

In [165]:
#! head -n 1 sydney_1M_embeddings.txt

# Preparing test data

In [122]:
daghan_df = pd.read_csv("./daghan.csv", sep='\t')
daghan_preprocessed = preprocessDF(daghan_df)
daghan_preprocessed.head()

Unnamed: 0,eth,sentence
0,00:18:0a:12:2f:2d,device: cisco-meraki identifier: 122f2d port: ...
1,b8:e9:37:8c:da:3c,device: sonos-inc identifier: 8cda3c port: 413...
2,28:f0:76:31:d3:58,device: apple-inc identifier: 31d358 port: 654...
3,00:18:0a:7d:01:ce,device cisco-meraki 7d01ce protocol: icmp:data...
4,40:cb:c0:bc:36:7e,device apple-inc bc367e port: 5353 protocol: m...


In [123]:
known_devices = pd.read_csv('known_devices.csv')
known_devices['OUI'] = known_devices['MAC address'].apply(lambda x: ''.join(x.upper().split(':')[0:3]))
known_devices['Manufacturer Device Type'] = ([' '.join(row) for row in 
                        zip(known_devices["Manufacturer"],known_devices["Device Type"])])
known_devices['Manufacturer Device Type'] = known_devices['Manufacturer Device Type'].apply(lambda x: x.strip())
known_devices['MAC address'] = known_devices['MAC address'].apply(lambda x: x.strip())

In [124]:
daghan_preprocessed = daghan_preprocessed.merge(known_devices[['MAC address','Manufacturer Device Type']], 
                left_on='eth', right_on='MAC address', how = 'left').dropna().drop('MAC address', axis = 1)

In [125]:
daghan_preprocessed.columns = ['eth','sentence','label']
daghan_preprocessed = daghan_preprocessed.loc[daghan_preprocessed['sentence'] != '']
daghan_preprocessed.to_csv("daghan_preprocessed.csv", index=None)

In [126]:
daghan_preprocessed

Unnamed: 0,eth,sentence,label
0,00:18:0a:12:2f:2d,device: cisco-meraki identifier: 122f2d port: ...,Meraki Phone
1,b8:e9:37:8c:da:3c,device: sonos-inc identifier: 8cda3c port: 413...,Sonos Speaker
2,28:f0:76:31:d3:58,device: apple-inc identifier: 31d358 port: 654...,Apple Computer
4,40:cb:c0:bc:36:7e,device apple-inc bc367e port: 5353 protocol: m...,Apple TV
5,44:65:0d:90:60:3a,device amazon-technologies-inc 90603a port: 55...,Amazon Echo
6,78:28:ca:32:4b:28,device sonos-inc 324b28 port: 43082 protocol: ...,Sonos Speaker
8,38:c9:86:40:7c:a6,device apple-inc 407ca6 protocol: icmp:data si...,Apple Computer
10,84:38:35:5a:70:40,device: apple-inc identifier: 5a7040 port: 595...,Apple Computer
12,5c:aa:fd:4c:87:a0,device: sonos-inc identifier: 4c87a0 port: 446...,Sonos Speaker
13,78:28:ca:03:80:0c,device: sonos-inc identifier: 03800c port: 570...,Sonos Speaker
