# IoT Project

### We are going to load up 2 databases  
#### Organization Unique Identifier (OUI)

An OUI is an identifier that is unique to each organization (device manufacturer).  
The first 6 hexadecimal digits (i.e. the first 3 bytes) of a device's MAC address are its OUI.

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from scapy.all import *
from IPy import IP as IPy
import pprint


# Load the OUI registry (one row per assigned MAC prefix).
# 'Assignment' is used as a join key against OUI strings derived from MAC addresses,
# and some assignments consist of digits only (e.g. 843835), so force it to be read
# as text instead of relying on pandas' type inference.
OUI_df = pd.read_csv('oui.csv', dtype={'Assignment': str})
OUI_df.head()

Unnamed: 0,Registry,Assignment,Organization Name,Organization Address
0,MA-L,E043DB,"Shenzhen ViewAt Technology Co.,Ltd.","9A,Microprofit,6th Gaoxin South Road, High-Tec..."
1,MA-L,2405F5,Integrated Device Technology (Malaysia) Sdn. Bhd.,"Phase 3, Bayan Lepas FIZ Bayan Lepas Penang MY..."
2,MA-L,3CD92B,Hewlett Packard,11445 Compaq Center Drive Houston US 77070
3,MA-L,9C8E99,Hewlett Packard,11445 Compaq Center Drive Houston US 77070
4,MA-L,B499BA,Hewlett Packard,11445 Compaq Center Drive Houston US 77070


#### Internet IP address <-> Organization mapping

### Note:   
you need to download this file (large) to your local directory!  
https://iptoasn.com/data/ip2asn-combined.tsv.gz   
and unzip it (so you have a local copy of ip2asn-combined.tsv)

In [2]:
# The ip2asn dump is tab-separated and has no header row, so the column names
# are supplied directly while reading.
ASN_df = pd.read_csv(
    "./ip2asn-combined.tsv",
    sep='\t',
    header=None,
    names=['start', 'end', 'asn', 'country', 'organization'],
)
ASN_df.head(100)

Unnamed: 0,start,end,asn,country,organization
0,0.0.0.1,0.255.255.255,0,,Not routed
1,1.0.0.0,9.255.255.255,0,,Not routed
2,10.0.0.0,10.255.255.255,0,,Not routed
3,11.0.0.0,100.63.255.255,0,,Not routed
4,100.64.0.0,100.127.255.255,0,,Not routed
5,100.128.0.0,126.255.255.255,0,,Not routed
6,127.0.0.0,127.255.255.255,0,,Not routed
7,128.0.0.0,169.253.255.255,0,,Not routed
8,169.254.0.0,169.254.255.255,0,,Not routed
9,169.255.0.0,172.15.255.255,0,,Not routed


In [3]:
# Remove all the "Not routed" rows: they carry no organization to attribute traffic to.
# .copy() turns the filtered selection into an independent frame, so the columns that
# are added to ASN_df afterwards do not trigger pandas' SettingWithCopyWarning.
ASN_df = ASN_df[ASN_df['organization'] != 'Not routed'].copy()

In [4]:
# Store both range boundaries as numbers too ('start.dec' / 'end.dec'), so the
# organization search later on can use plain numeric comparisons.
for bound in ('start', 'end'):
    ASN_df[bound + '.dec'] = ASN_df[bound].map(lambda addr: float(IPy(addr).strDec()))

In [5]:
# Preview the routed ranges together with their numeric start.dec / end.dec bounds.
ASN_df.head()

Unnamed: 0,start,end,asn,country,organization,start.dec,end.dec
29,1.0.4.0,1.0.4.255,56203,AU,GTELECOM-AUSTRALIA Gtelecom-AUSTRALIA,16778240.0,16778495.0
30,1.0.5.0,1.0.5.255,56203,AU,GTELECOM-AUSTRALIA Gtelecom-AUSTRALIA,16778496.0,16778751.0
31,1.0.6.0,1.0.6.255,56203,AU,GTELECOM-AUSTRALIA Gtelecom-AUSTRALIA,16778752.0,16779007.0
32,1.0.7.0,1.0.7.255,56203,AU,GTELECOM-AUSTRALIA Gtelecom-AUSTRALIA,16779008.0,16779263.0
34,1.0.16.0,1.0.16.255,2519,Unknown,AS2519,16781312.0,16781567.0


## Device labeling  
The packet traces we are using come from my home network. I know what these devices are. I am going to import the device information for these devices and merge it with the network traffic dataframe we are using.

In [53]:
# Load the hand-labelled inventory of devices.
known_devices = pd.read_csv('known_devices.csv')

# OUI = first three octets of the MAC address, upper-cased, colons removed.
known_devices['OUI'] = known_devices['MAC address'].map(
    lambda mac: ''.join(mac.upper().split(':')[:3]))

# One combined label per device: "<Manufacturer> <Device Type>".
known_devices['Manufacturer Device Type'] = [
    maker + ' ' + kind
    for maker, kind in zip(known_devices["Manufacturer"], known_devices["Device Type"])
]
known_devices


Unnamed: 0,Description,Last seen,Usage,OS,IPv4 address,Policy,MAC address,Manufacturer,Device Type,OUI,Manufacturer Device Type
0,Daghan deskphone,6/6/18 11:29,962619.1,Meraki,192.168.1.236,normal,00:18:0a:12:2f:2d,Meraki,Phone,00180A,Meraki Phone
1,Nest,6/6/18 11:30,83495.0,Raspberry Pi,192.168.1.235,normal,18:b4:30:0d:87:18,Nest Labs,Thermostat,18B430,Nest Labs Thermostat
2,iPhone,5/19/18 18:11,4825600.0,Apple iPhone,192.168.1.9,Privacy please,18:f6:43:57:fc:8f,Apple,iPhone,18F643,Apple iPhone
3,Daghan-Home,6/6/18 11:30,31267720.0,Mac OS X 10.13,192.168.1.233,normal,28:f0:76:31:d3:58,Apple,Computer,28F076,Apple Computer
4,DaghanspleWatch,6/5/18 5:43,267294.0,iOS,192.168.1.7,normal,30:63:6b:65:e4:04,Apple,Watch,30636B,Apple Watch
5,daghan-macbook-8,6/6/18 11:21,26054380.0,Mac OS X 10.13,192.168.1.21,Privacy please,34:36:3b:7f:3f:a0,Apple,Computer,34363B,Apple Computer
6,Daghan-Home,6/6/18 11:29,215554.9,Mac OS X,192.168.1.224,normal,38:c9:86:40:7c:a6,Apple,Computer,38C986,Apple Computer
7,Apple-TV-3,6/3/18 19:35,1.0,iOS,192.168.1.247,normal,40:cb:c0:bc:36:7c,Apple,TV,40CBC0,Apple TV
8,Apple-TV-3,6/6/18 11:29,643408.9,iOS,192.168.1.249,normal,40:cb:c0:bc:36:7e,Apple,TV,40CBC0,Apple TV
9,44:65:0d:66:72:60,6/6/18 1:22,357305.0,Generic Linux,192.168.1.19,normal,44:65:0d:66:72:60,Amazon Technologies,Echo,44650D,Amazon Technologies Echo


In [6]:
# Fields exported by tshark, grouped by layer. Every entry becomes one "-e <field>"
# option and therefore one column of the exported table.
tshark_fields = [
    # baseline (frame / ethernet)
    "frame.number", "frame.time", "eth.src", "eth.dst", "frame.protocols",
    # IP / TCP / UDP layers
    "ip.src", "ip.dst", "tcp.srcport", "tcp.dstport", "udp.srcport", "udp.dstport",
    # DNS / mDNS layer
    "dns.qry.name", "dns.resp.name", "dns.cname", "dns.a",
    # HTTP layer
    "http.request.method", "http.request.uri", "http.user_agent", "http.host",
    # SSL certificate layer
    "x509sat.printableString", "x509sat.uTF8String",
]

# "-T fields -E header=y" followed by one "-e" option per field, space-separated.
tshark_cmd = "-T fields -E header=y " + " ".join("-e " + field for field in tshark_fields)


In [7]:
# Run tshark on ./packet_capture.pcap with the field options assembled in tshark_cmd
# (interpolated via {tshark_cmd}) and redirect its output into packets.csv.
!tshark -tud -n -r ./packet_capture.pcap {tshark_cmd} > packets.csv

In [8]:
#!head -n 5 packets.csv 
#!tail -n 5 packets.csv

In [9]:
# packets.csv (written by the tshark cell) is read as tab-separated text despite its
# .csv name; the result is one row per captured frame.
df = pd.read_csv("./packets.csv", sep='\t')

In [10]:
# Preview the raw per-frame fields exported by tshark.
df.head()

Unnamed: 0,frame.number,frame.time,eth.src,eth.dst,frame.protocols,ip.src,ip.dst,tcp.srcport,tcp.dstport,udp.srcport,...,dns.qry.name,dns.resp.name,dns.cname,dns.a,http.request.method,http.request.uri,http.user_agent,http.host,x509sat.printableString,x509sat.uTF8String
0,1,"Apr 27, 2018 17:15:18.958000000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,,,,,,,,
1,2,"Apr 27, 2018 17:15:18.958052000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,,,,,,,,
2,3,"Apr 27, 2018 17:15:18.958053000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,,,,,,,,
3,4,"Apr 27, 2018 17:15:18.969772000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,151.101.1.254,192.168.1.14,443.0,59554.0,,...,,,,,,,,,,
4,5,"Apr 27, 2018 17:15:18.971427000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,216.58.195.243,192.168.1.14,443.0,59611.0,,...,,,,,,,,,,


In [13]:
# Split every MAC address into its OUI (first three octets) and its tail (last three
# octets), both upper-cased and without colons, for the source and destination side.
for side in ('src', 'dst'):
    octets = df['eth.' + side].map(lambda mac: mac.upper().split(':'))
    df['oui.' + side] = octets.map(lambda parts: ''.join(parts[0:3]))
    df['tail.' + side] = octets.map(lambda parts: ''.join(parts[3:6]))

In [14]:
# Confirm that the oui.src / tail.src / oui.dst / tail.dst columns were added.
df.head()

Unnamed: 0,frame.number,frame.time,eth.src,eth.dst,frame.protocols,ip.src,ip.dst,tcp.srcport,tcp.dstport,udp.srcport,...,http.request.method,http.request.uri,http.user_agent,http.host,x509sat.printableString,x509sat.uTF8String,oui.src,tail.src,oui.dst,tail.dst
0,1,"Apr 27, 2018 17:15:18.958000000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,,,,,843835,5A7040,E0553D,024121
1,2,"Apr 27, 2018 17:15:18.958052000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,,,,,843835,5A7040,E0553D,024121
2,3,"Apr 27, 2018 17:15:18.958053000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,,,,,843835,5A7040,E0553D,024121
3,4,"Apr 27, 2018 17:15:18.969772000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,151.101.1.254,192.168.1.14,443.0,59554.0,,...,,,,,,,E0553D,024121,843835,5A7040
4,5,"Apr 27, 2018 17:15:18.971427000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,216.58.195.243,192.168.1.14,443.0,59611.0,,...,,,,,,,E0553D,024121,843835,5A7040


In [15]:
# Attach the registered organization name for the source and destination OUI.
#
# The lookup is reduced to one row per 'Assignment' first: in a left merge every
# additional registry row sharing the same OUI would duplicate the packet rows that
# match it, silently inflating the capture.
oui_lookup = (OUI_df[['Assignment', 'Organization Name']]
              .drop_duplicates(subset='Assignment'))

for side in ('src', 'dst'):
    # how='left' keeps frames whose OUI is not in the registry (their org stays NaN).
    df = (df.merge(oui_lookup, left_on='oui.' + side, right_on='Assignment', how='left')
            .drop(columns=['Assignment'])
            .rename(columns={'Organization Name': 'org.' + side}))


In [16]:
# Turn organization names into single tokens: drop '.' and ',' and replace every
# space with '-'. Missing names (NaN) are left untouched.
org_cleanup = str.maketrans({'.': None, ',': None, ' ': '-'})
for col in ('org.src', 'org.dst'):
    df[col] = df[col].map(lambda name: name.translate(org_cleanup) if pd.notna(name) else name)

In [17]:
# Check the org.src / org.dst columns after the OUI merge and the name clean-up.
df.head()

Unnamed: 0,frame.number,frame.time,eth.src,eth.dst,frame.protocols,ip.src,ip.dst,tcp.srcport,tcp.dstport,udp.srcport,...,http.user_agent,http.host,x509sat.printableString,x509sat.uTF8String,oui.src,tail.src,oui.dst,tail.dst,org.src,org.dst
0,1,"Apr 27, 2018 17:15:18.958000000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,,,843835,5A7040,E0553D,024121,Apple-Inc,Cisco-Meraki
1,2,"Apr 27, 2018 17:15:18.958052000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,,,843835,5A7040,E0553D,024121,Apple-Inc,Cisco-Meraki
2,3,"Apr 27, 2018 17:15:18.958053000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,,,843835,5A7040,E0553D,024121,Apple-Inc,Cisco-Meraki
3,4,"Apr 27, 2018 17:15:18.969772000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,151.101.1.254,192.168.1.14,443.0,59554.0,,...,,,,,E0553D,024121,843835,5A7040,Cisco-Meraki,Apple-Inc
4,5,"Apr 27, 2018 17:15:18.971427000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,216.58.195.243,192.168.1.14,443.0,59611.0,,...,,,,,E0553D,024121,843835,5A7040,Cisco-Meraki,Apple-Inc


In [18]:
def determineZone(ip):
    """Classify an IP address by the address type IPy reports for it.

    The result is whatever ``IPy(...).iptype()`` returns (the notebook output shows
    values such as 'PRIVATE', 'PUBLIC' and 'RESERVED').

    ip -- an address string, a comma-separated list of address strings, or a
          missing value (NaN/None).

    Missing values are returned unchanged. When the whole string is rejected by IPy
    with a ValueError, only the part before the first comma is classified; a
    ValueError raised for that part propagates to the caller.
    """
    if pd.isna(ip):
        return ip
    try:
        return IPy(ip).iptype()
    except ValueError:
        # Multi-valued field: fall back to the first listed address.
        first_address = ip.split(',')[0]
        return IPy(first_address).iptype()

In [19]:
# Classify the source and destination address of every frame.
df['zone.src'] = df['ip.src'].map(determineZone)
df['zone.dst'] = df['ip.dst'].map(determineZone)

In [20]:
# Check the newly added zone.src / zone.dst columns.
df.head()

Unnamed: 0,frame.number,frame.time,eth.src,eth.dst,frame.protocols,ip.src,ip.dst,tcp.srcport,tcp.dstport,udp.srcport,...,x509sat.printableString,x509sat.uTF8String,oui.src,tail.src,oui.dst,tail.dst,org.src,org.dst,zone.src,zone.dst
0,1,"Apr 27, 2018 17:15:18.958000000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,843835,5A7040,E0553D,024121,Apple-Inc,Cisco-Meraki,PRIVATE,PUBLIC
1,2,"Apr 27, 2018 17:15:18.958052000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,843835,5A7040,E0553D,024121,Apple-Inc,Cisco-Meraki,PRIVATE,PUBLIC
2,3,"Apr 27, 2018 17:15:18.958053000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,843835,5A7040,E0553D,024121,Apple-Inc,Cisco-Meraki,PRIVATE,PUBLIC
3,4,"Apr 27, 2018 17:15:18.969772000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,151.101.1.254,192.168.1.14,443.0,59554.0,,...,,,E0553D,024121,843835,5A7040,Cisco-Meraki,Apple-Inc,PUBLIC,PRIVATE
4,5,"Apr 27, 2018 17:15:18.971427000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,216.58.195.243,192.168.1.14,443.0,59611.0,,...,,,E0553D,024121,843835,5A7040,Cisco-Meraki,Apple-Inc,PUBLIC,PRIVATE


In [21]:
# Some frames list several comma-separated source or destination addresses; keep
# only the first one. Missing addresses (NaN) stay as they are.
for col in ('ip.src', 'ip.dst'):
    df[col] = df[col].map(lambda addrs: addrs if pd.isna(addrs) else addrs.split(',')[0])

In [22]:
# The protocol of a frame is the last element of its colon-separated protocol stack.
# 'frame.protocols' itself is kept as a column (it is not dropped here).
df['protocol'] = df['frame.protocols'].map(lambda stack: stack.rsplit(':', 1)[-1])
df.head()

Unnamed: 0,frame.number,frame.time,eth.src,eth.dst,frame.protocols,ip.src,ip.dst,tcp.srcport,tcp.dstport,udp.srcport,...,x509sat.uTF8String,oui.src,tail.src,oui.dst,tail.dst,org.src,org.dst,zone.src,zone.dst,protocol
0,1,"Apr 27, 2018 17:15:18.958000000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,843835,5A7040,E0553D,024121,Apple-Inc,Cisco-Meraki,PRIVATE,PUBLIC,ssl
1,2,"Apr 27, 2018 17:15:18.958052000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,843835,5A7040,E0553D,024121,Apple-Inc,Cisco-Meraki,PRIVATE,PUBLIC,ssl
2,3,"Apr 27, 2018 17:15:18.958053000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,843835,5A7040,E0553D,024121,Apple-Inc,Cisco-Meraki,PRIVATE,PUBLIC,ssl
3,4,"Apr 27, 2018 17:15:18.969772000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,151.101.1.254,192.168.1.14,443.0,59554.0,,...,,E0553D,024121,843835,5A7040,Cisco-Meraki,Apple-Inc,PUBLIC,PRIVATE,tcp
4,5,"Apr 27, 2018 17:15:18.971427000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,216.58.195.243,192.168.1.14,443.0,59611.0,,...,,E0553D,024121,843835,5A7040,Cisco-Meraki,Apple-Inc,PUBLIC,PRIVATE,tcp


In [23]:
# Collection of all the public IPs we see; a set, so each address is stored only once.
public_IPs = set()

In [24]:
# Add every address that was classified as PUBLIC on either side of a frame.
# Boolean-mask selection collects them in two column-wise passes instead of
# walking the frame row by row with iterrows(); the resulting set is the same.
public_IPs.update(df.loc[df['zone.src'] == "PUBLIC", 'ip.src'])
public_IPs.update(df.loc[df['zone.dst'] == "PUBLIC", 'ip.dst'])

In [25]:
# Match every public IP to the Internet-wide organization (ASN) it belongs to:
# find the ASN range whose numeric bounds enclose the address and keep the first
# word of that range's organization string.
#
# The "Not routed" ranges were removed from ASN_df, so a public address may fall
# outside every remaining range; such addresses are labelled 'Unknown' instead of
# failing with an IndexError on an empty selection.
public_IPs_orgs = {}
for ip in public_IPs:
    ip_dec = float(IPy(ip).strDec())  # converted once, used for both bounds
    matches = ASN_df.loc[(ASN_df['start.dec'] <= ip_dec) & (ASN_df['end.dec'] >= ip_dec),
                         'organization']
    public_IPs_orgs[ip] = matches.iloc[0].split(' ')[0] if not matches.empty else 'Unknown'

#public_IPs_orgs   

In [26]:
# For public addresses the organization comes from the ASN lookup
# (public_IPs_orgs); every other frame keeps the organization it already has.
df['org.src'] = [
    public_IPs_orgs[ip] if zone == "PUBLIC" else org
    for zone, ip, org in zip(df['zone.src'], df['ip.src'], df['org.src'])
]
df['org.dst'] = [
    public_IPs_orgs[ip] if zone == "PUBLIC" else org
    for zone, ip, org in zip(df['zone.dst'], df['ip.dst'], df['org.dst'])
]

In [27]:
# List the distinct values of the 'protocol' column found in this capture.
df['protocol'].unique()

array(['ssl', 'tcp', 'data', 'arp', 'icmpv6', 'mdns', 'stp', 'dns',
       'http', 'data-text-lines', 'ntp', 'media', 'stun', 'nbns', 'bootp',
       'gquic', 'igmp', 'ssdp', 'x509ce', 'json'], dtype=object)

In [93]:
# protocols we see
# 'gquic', 'igmp', 'ssdp', 'x509ce', 'json'

# Optional text fields appended to DNS/mDNS and HTTP sentences, in output order.
_DNS_FIELDS = ('dns.qry.name', 'dns.resp.name', 'dns.cname', 'dns.a')
_HTTP_FIELDS = ('http.request.method', 'http.request.uri', 'http.user_agent', 'http.host')


def _inStack(stack, token):
    """Return True when `token` occurs in the protocol stack string.

    A match at position 0 is not counted, because the test is `find(...) > 0`.
    """
    return stack.find(token) > 0


def _deviceLabel(line, side):
    """Return "<org> <MAC tail>" for one side ('src' or 'dst'), whatever its zone."""
    return " ".join([line['org.' + side], line['tail.' + side]])


def _endpointLabel(line, side):
    """Return "<org> <MAC tail>" for a PRIVATE-zone side, otherwise the org alone."""
    if line['zone.' + side] == 'PRIVATE':
        return _deviceLabel(line, side)
    return line['org.' + side]


def _portText(line, transport, side):
    """Return the '<transport>.<side>port' value as integer text.

    int() raises ValueError for a NaN port; protoLang reports that case.
    """
    return str(int(line[transport + '.' + side + 'port']))


def _flowFields(line, transport, proto):
    """Return [src, srcport, proto, dst, dstport] for a regular TCP/UDP frame."""
    src = _endpointLabel(line, 'src')
    srcport = _portText(line, transport, 'src')
    dst = _endpointLabel(line, 'dst')
    dstport = _portText(line, transport, 'dst')
    return [src, srcport, proto, dst, dstport]


def _presentText(line, fields):
    """Concatenate the non-null `fields` of `line`.

    Every field except the last one is followed by a single space when present.
    """
    *leading, last = fields
    parts = [line[name] + " " for name in leading if pd.notna(line[name])]
    if pd.notna(line[last]):
        parts.append(line[last])
    return "".join(parts)


def protoLang(line):
    """Translate one packet row into a space-separated "sentence".

    line -- a mapping indexed by column name (a DataFrame row when used with
            df.apply(..., axis=1)). It must provide 'protocol' and
            'frame.protocols' plus the org/tail/zone/port/payload columns of the
            branch that matches.

    Returns the sentence string. Returns None after printing a message when no
    branch matches the frame, or when a ValueError is raised while building the
    sentence (e.g. a NaN port passed to int()). Other exceptions, such as the
    TypeError from joining a NaN organization, are not caught.

    Branches are tested in a fixed order; the first match wins.
    """
    proto = line['protocol']
    try:
        stack = line['frame.protocols']

        # x509ce (certificate exchange)
        if _inStack(stack, 'x509'):
            fields = _flowFields(line, 'tcp', proto)
            x509ce_message = ''
            for column in ('x509sat.printableString', 'x509sat.uTF8String'):
                if pd.notna(line[column]):
                    x509ce_message += line[column]
            return " ".join(fields + [x509ce_message])

        ## ssl
        elif proto == 'ssl':
            return " ".join(_flowFields(line, 'tcp', proto))

        ## tcp:data
        elif _inStack(stack, 'tcp:data'):
            return " ".join(_flowFields(line, 'tcp', 'tcp:data'))

        ## tcp:stun
        ## TODO: stun.att.software "stun.att.realm": "belkin.org",
        elif _inStack(stack, 'tcp:stun'):
            return " ".join(_flowFields(line, 'tcp', 'tcp:stun'))

        ## tcp
        elif proto == 'tcp':
            return " ".join(_flowFields(line, 'tcp', proto))

        ## udp:data
        ## TODO: Detect broadcasts
        elif _inStack(stack, 'udp:data'):
            src = _endpointLabel(line, 'src')
            srcport = _portText(line, 'udp', 'src')
            if line['zone.dst'] == 'PRIVATE':
                # Private destination without a known organization: leave it blank.
                dst = _deviceLabel(line, 'dst') if pd.notna(line['org.dst']) else ''
            elif line['zone.dst'] == 'PUBLIC':
                dst = line['org.dst']
            else:
                dst = ''
            dstport = _portText(line, 'udp', 'dst')
            return " ".join([src, srcport, 'udp:data', dst, dstport])

        ## udp:nbns (netbios) -- destination is not part of the sentence
        ## TODO: nbns.name": "MACBOOKAIR-7040<00> (Workstation\/Redirector)",
        elif _inStack(stack, 'udp:nbns'):
            src = _endpointLabel(line, 'src')
            srcport = _portText(line, 'udp', 'src')
            return " ".join([src, srcport, 'udp:nbns'])

        ## udp:bootp (dhcp) -- destination is not part of the sentence
        ## TODO: "bootp.option.hostname": "amazon-c4475da2a"
        ## TODO: "bootp.type": "2" (1 is request, 2 is reply)
        ## if it is a reply, add the dst IP address too
        elif _inStack(stack, 'udp:bootp'):
            src = _endpointLabel(line, 'src')
            srcport = _portText(line, 'udp', 'src')
            return " ".join([src, srcport, 'udp:bootp'])

        ## udp:gquic
        ## TODO: "gquic.tag.sni": "0.docs.google.com"
        ## TODO: "gquic.tag.uaid": "Chrome\/65.0.3325.181 Intel Mac OS X 10_13_3"
        ## TODO: "gquic.tag": "CHLO" (client hello)
        elif _inStack(stack, 'udp:gquic'):
            return " ".join(_flowFields(line, 'udp', 'udp:gquic'))

        ## ssdp (simple service discovery protocol)
        ## TODO: http.server: "Linux UPnP\/1.0 Sonos\/41.3-50131 (ZPS12)
        ## TODO: http.unknown_header: "HOUSEHOLD.SMARTSPEAKER.AUDIO: Sonos_hOcMvZ0JBvDVZz7BXZc5ILQAT5.Cd7MOjIUy3HWHWEXItIZ\\r\\n",
        ## TODO: http.request.full_uri: "http:\/\/239.255.255.250:1900*",
        elif proto == 'ssdp':
            src = _endpointLabel(line, 'src')
            srcport = _portText(line, 'udp', 'src')
            if line['zone.dst'] == 'PRIVATE':
                # str.join takes ONE iterable; passing the two strings as separate
                # arguments raised TypeError for every private-destination frame.
                dst = _deviceLabel(line, 'dst')
            elif line['zone.dst'] == 'RESERVED':
                # Reserved destination: use the destination MAC as the label.
                dst = line['eth.dst']
            else:
                dst = line['org.dst']
            dstport = _portText(line, 'udp', 'dst')
            return " ".join([src, srcport, proto, dst, dstport])

        ## db-lsp-disc:json (Dropbox Lan sync Discovery Protocol)
        ## TODO: Detect and use Broadcast
        ## TODO: eth.addr_resolved": "Broadcast"
        elif _inStack(stack, 'db-lsp-disc:json'):
            src = _endpointLabel(line, 'src')
            srcport = _portText(line, 'udp', 'src')
            dstport = _portText(line, 'udp', 'dst')
            return " ".join([src, srcport, 'db-lsp-disc', dstport])

        ## udp / ntp (network time protocol)
        elif proto == 'udp' or proto == 'ntp':
            return " ".join(_flowFields(line, 'udp', proto))

        ## ARP
        elif proto == 'arp':
            src = _deviceLabel(line, 'src')
            dst = _deviceLabel(line, 'dst') if pd.notna(line['org.dst']) else ''
            return " ".join([src, proto, dst])

        ## ICMPv6 / STP (spanning tree protocol): source device and protocol only
        ## TODO: There is more information to be extracted from ICMPv6
        elif proto == 'icmpv6' or proto == 'stp':
            return " ".join([_deviceLabel(line, 'src'), proto])

        ## mdns
        elif proto == 'mdns':
            src = _deviceLabel(line, 'src')
            srcport = _portText(line, 'udp', 'src')
            return " ".join([src, srcport, proto, _presentText(line, _DNS_FIELDS)])

        ## igmp
        elif proto == 'igmp':
            return " ".join([_deviceLabel(line, 'src'), proto, line['eth.dst']])

        ## dns
        elif proto == 'dns':
            fields = _flowFields(line, 'udp', proto)
            return " ".join(fields + [_presentText(line, _DNS_FIELDS)])

        ## http
        elif proto == 'http':
            fields = _flowFields(line, 'tcp', proto)
            return " ".join(fields + [_presentText(line, _HTTP_FIELDS)])

        ## http:data
        ## TODO: add http.file_data content
        elif _inStack(stack, 'http:data'):
            fields = _flowFields(line, 'tcp', 'http:data')
            return " ".join(fields + [_presentText(line, _HTTP_FIELDS)])

        ## http:media
        ## TODO: add  http.content_type , http.content_length
        elif _inStack(stack, 'http:media'):
            return " ".join(_flowFields(line, 'tcp', 'http:media'))

        ## icmp:data
        ## TODO: Consider adding icmp.type to the frame / protocol language
        ## TODO: Detect broadcast
        elif _inStack(stack, 'icmp:data'):
            src = _endpointLabel(line, 'src')
            if line['zone.dst'] == 'PRIVATE':
                dst = _deviceLabel(line, 'dst') if pd.notna(line['org.dst']) else ""
            else:
                dst = line['org.dst']
            return " ".join([src, 'icmp:data', dst])

        ## ethertype:data
        elif _inStack(stack, 'ethertype:data'):
            return " ".join([_deviceLabel(line, 'src'), 'ethertype:data'])

        # anything else
        else:
            print("undetected protocol: {}".format(proto))
    except ValueError:
        print('Exception!!')
        print(line)
    return None
        
    

In [94]:
#df.iloc[0,:]

In [95]:
# Build one "sentence" per frame by calling protoLang on every row (axis=1).
# Rows for which protoLang returns nothing end up with a missing value here.
df['sentence'] = df.apply(protoLang, axis=1)
#df.loc[5391,:]

In [96]:
# Inspect the frames whose detected protocol is SSDP.
df.loc[df['protocol'].eq('ssdp')]

Unnamed: 0,frame.number,frame.time,eth.src,eth.dst,frame.protocols,ip.src,ip.dst,tcp.srcport,tcp.dstport,udp.srcport,...,oui.src,tail.src,oui.dst,tail.dst,org.src,org.dst,zone.src,zone.dst,protocol,sentence
3377,3378,"Apr 27, 2018 17:15:32.971135000 PDT",78:28:ca:32:4b:28,01:00:5e:7f:ff:fa,eth:ethertype:vlan:ethertype:ip:udp:ssdp,192.168.1.225,239.255.255.250,,,43082.0,...,7828CA,324B28,01005E,7FFFFA,Sonos-Inc,,PRIVATE,RESERVED,ssdp,Sonos-Inc 324B28 43082 ssdp 01:00:5e:7f:ff:fa ...
3378,3379,"Apr 27, 2018 17:15:32.971555000 PDT",78:28:ca:32:4b:28,ff:ff:ff:ff:ff:ff,eth:ethertype:vlan:ethertype:ip:udp:ssdp,192.168.1.225,255.255.255.255,,,43082.0,...,7828CA,324B28,FFFFFF,FFFFFF,Sonos-Inc,,PRIVATE,RESERVED,ssdp,Sonos-Inc 324B28 43082 ssdp ff:ff:ff:ff:ff:ff ...
3399,3400,"Apr 27, 2018 17:15:33.221432000 PDT",78:28:ca:32:4b:28,01:00:5e:7f:ff:fa,eth:ethertype:vlan:ethertype:ip:udp:ssdp,192.168.1.225,239.255.255.250,,,43082.0,...,7828CA,324B28,01005E,7FFFFA,Sonos-Inc,,PRIVATE,RESERVED,ssdp,Sonos-Inc 324B28 43082 ssdp 01:00:5e:7f:ff:fa ...
3400,3401,"Apr 27, 2018 17:15:33.221768000 PDT",78:28:ca:32:4b:28,ff:ff:ff:ff:ff:ff,eth:ethertype:vlan:ethertype:ip:udp:ssdp,192.168.1.225,255.255.255.255,,,43082.0,...,7828CA,324B28,FFFFFF,FFFFFF,Sonos-Inc,,PRIVATE,RESERVED,ssdp,Sonos-Inc 324B28 43082 ssdp ff:ff:ff:ff:ff:ff ...
3436,3437,"Apr 27, 2018 17:15:33.471876000 PDT",78:28:ca:32:4b:28,01:00:5e:7f:ff:fa,eth:ethertype:vlan:ethertype:ip:udp:ssdp,192.168.1.225,239.255.255.250,,,43082.0,...,7828CA,324B28,01005E,7FFFFA,Sonos-Inc,,PRIVATE,RESERVED,ssdp,Sonos-Inc 324B28 43082 ssdp 01:00:5e:7f:ff:fa ...
3437,3438,"Apr 27, 2018 17:15:33.472259000 PDT",78:28:ca:32:4b:28,ff:ff:ff:ff:ff:ff,eth:ethertype:vlan:ethertype:ip:udp:ssdp,192.168.1.225,255.255.255.255,,,43082.0,...,7828CA,324B28,FFFFFF,FFFFFF,Sonos-Inc,,PRIVATE,RESERVED,ssdp,Sonos-Inc 324B28 43082 ssdp ff:ff:ff:ff:ff:ff ...
3457,3458,"Apr 27, 2018 17:15:33.722665000 PDT",78:28:ca:32:4b:28,01:00:5e:7f:ff:fa,eth:ethertype:vlan:ethertype:ip:udp:ssdp,192.168.1.225,239.255.255.250,,,43082.0,...,7828CA,324B28,01005E,7FFFFA,Sonos-Inc,,PRIVATE,RESERVED,ssdp,Sonos-Inc 324B28 43082 ssdp 01:00:5e:7f:ff:fa ...
3468,3469,"Apr 27, 2018 17:15:33.975031000 PDT",78:28:ca:32:4b:28,01:00:5e:7f:ff:fa,eth:ethertype:vlan:ethertype:ip:udp:ssdp,192.168.1.225,239.255.255.250,,,43082.0,...,7828CA,324B28,01005E,7FFFFA,Sonos-Inc,,PRIVATE,RESERVED,ssdp,Sonos-Inc 324B28 43082 ssdp 01:00:5e:7f:ff:fa ...
3482,3483,"Apr 27, 2018 17:15:34.223393000 PDT",78:28:ca:32:4b:28,01:00:5e:7f:ff:fa,eth:ethertype:vlan:ethertype:ip:udp:ssdp,192.168.1.225,239.255.255.250,,,43082.0,...,7828CA,324B28,01005E,7FFFFA,Sonos-Inc,,PRIVATE,RESERVED,ssdp,Sonos-Inc 324B28 43082 ssdp 01:00:5e:7f:ff:fa ...
3488,3489,"Apr 27, 2018 17:15:34.473780000 PDT",78:28:ca:32:4b:28,01:00:5e:7f:ff:fa,eth:ethertype:vlan:ethertype:ip:udp:ssdp,192.168.1.225,239.255.255.250,,,43082.0,...,7828CA,324B28,01005E,7FFFFA,Sonos-Inc,,PRIVATE,RESERVED,ssdp,Sonos-Inc 324B28 43082 ssdp 01:00:5e:7f:ff:fa ...


In [32]:
# Flatten every sentence into one list of whitespace-separated tokens.
# Rows without a sentence (None/NaN) are skipped: calling .split() on them
# would raise AttributeError and abort the whole cell.
vocabulary = [
    token
    for sentence in df['sentence'].dropna().tolist()
    for token in sentence.split()
]
print(len(vocabulary))

34868


In [33]:
# Deduplicate the token list to get the distinct words of the protocol language,
# then print how many there are followed by the full set.
unique_words = set(vocabulary)
print(len(unique_words))
print(unique_words)

374
{'1900', '52712', '59553', '59560', 'CA', '49406', '59631', '324B28', '59636', 'icmpv6', 'HL-L2380DW', '59545', 'COMCAST-7922', '34858', '67', 'pagead.l.doubleclick.net', 'Network,(c)', '59556', 'audio-fac.spotify.com', '44689', '18964', '172.217.6.78', 'tcp:data', 's3-directional-w.amazonaws.com,s3-1-w.amazonaws.com', 'www.example.net', 'ntp-g7g.amazon.com,ntp-g7g.amazon.com', '11046', '59498', '58270', '59637', '59549', '52518', '59565', '34.225.16.57,52.87.128.190,54.85.228.100,54.173.62.245,54.209.14.90,54.236.91.5,54.84.159.64,54.164.74.173', '59627', '49079', 'clients.l.google.com', '80', '41662', '61502', '46201', 'TV._raop._tcp.local,Apple', '46872', '35608', 'dns', 'Symantec', 'Root', '59586', '59532', 'Class', '54.239.27.11', 'OPENDNS', 'use', 'G4,SymantecPKI-1-534Washington,Seattle,Amazon.com,', 'x509ce', 'SHA2', '36313', '3', 'Inc.,dp-gw-na.amazon.com', 'f.d.4.b.1.3.e.f.f.f.c.9.c.7.6.b.0.0.0.0.0.0.0.0.0.0.0.0.0.8.e.f.ip6.arpa,linux-6.local,220.1.168.192.in-addr.arpa', '

In [177]:
def _concat_with_eos(sentences):
    """Join one group's sentences into a single string, each followed by " <EOS> "."""
    return "".join(sentences + " <EOS> ")


# One concatenated document per MAC address: once for the frames it sent
# (eth.src) and once for the frames addressed to it (eth.dst).
src_agg_df = df.groupby('eth.src')['sentence'].agg(_concat_with_eos).reset_index()
dst_agg_df = df.groupby('eth.dst')['sentence'].agg(_concat_with_eos).reset_index()

In [178]:
# Outer-join the per-source and per-destination documents on the MAC address so
# that a device seen only as sender or only as receiver is still kept.
# The suffixes name the two overlapping 'sentence' columns directly, giving the
# columns: eth.src, sentence.src, eth.dst, sentence.dst.
final_agg_df = src_agg_df.merge(
    dst_agg_df,
    left_on='eth.src',
    right_on='eth.dst',
    how='outer',
    suffixes=('.src', '.dst'),
)
#TODO: Filter reserved MAC ranges

In [179]:
# Coalesce the two MAC columns left by the outer merge: take eth.src where it
# is present, otherwise fall back to eth.dst. Vectorized fillna replaces the
# row-wise apply, which is slower and does not return a Series on an empty frame.
final_agg_df['eth'] = final_agg_df['eth.src'].fillna(final_agg_df['eth.dst'])

#TODO: Redo merging sentences so the sequence of the language is preserved
# Concatenate the source-side and destination-side documents. Each present part
# gets a trailing space; a missing part (NaN) contributes an empty string.
final_agg_df['sentence'] = (
    (final_agg_df['sentence.src'] + " ").fillna("")
    + (final_agg_df['sentence.dst'] + " ").fillna("")
)

In [180]:
# Keep only the coalesced columns; the per-side merge columns are no longer needed.
final_agg_df = final_agg_df.drop(
    columns=['eth.src', 'sentence.src', 'eth.dst', 'sentence.dst']
)

In [181]:
# Debug aid: look up a single device by MAC address. The 'eth.src' column has
# been dropped by this point, so the lookup uses the coalesced 'eth' column.
#final_agg_df[final_agg_df['eth'] == '5c:aa:fd:4c:92:86']
final_agg_df

Unnamed: 0,eth,sentence
0,00:18:0a:12:2f:2d,Cisco-Meraki 122F2D arp Cisco-Meraki 024121 <E...
1,00:18:0a:7d:01:ce,Cisco-Meraki 7D01CE icmp:data GOOGLE <EOS> Cis...
2,00:18:0a:7d:01:cf,Cisco-Meraki 7D01CF stp <EOS> Cisco-Meraki 7D0...
3,18:b4:30:0d:87:18,Nest-Labs-Inc 0D8718 arp Cisco-Meraki 024121 <...
4,18:f6:43:57:fc:8f,Apple-Inc 57FC8F arp Cisco-Meraki 024121 <EOS>...
5,28:f0:76:31:d3:58,Apple-Inc 31D358 65474 tcp GOOGLE 443 <EOS> Ap...
6,38:c9:86:40:7c:a6,Apple-Inc 407CA6 arp Cisco-Meraki 024121 <EOS>...
7,40:cb:c0:bc:36:7e,Apple-Inc BC367E icmpv6 <EOS> Apple-Inc BC367E...
8,44:65:0d:90:60:3a,Amazon-Technologies-Inc 90603A 55151 udp:data ...
9,5c:aa:fd:4c:87:a0,Sonos-Inc 4C87A0 arp Cisco-Meraki 024121 <EOS>...


In [182]:
# Attach the known device type to each aggregated document. The inner join
# keeps only MAC addresses that appear in known_devices; both key columns are
# dropped afterwards and the remaining two are renamed.
final_agg_df = (
    final_agg_df
    .merge(
        known_devices[['MAC address', 'Manufacturer Device Type']],
        left_on='eth',
        right_on='MAC address',
        how='inner',
    )
    .drop(['eth', 'MAC address'], axis=1)
)
final_agg_df.columns = ['sentence', 'type']

In [183]:
# Preview the labelled frame (sentence + device type).
final_agg_df.head()

Unnamed: 0,sentence,type
0,Cisco-Meraki 122F2D arp Cisco-Meraki 024121 <E...,Meraki Phone
1,Nest-Labs-Inc 0D8718 arp Cisco-Meraki 024121 <...,Nest Labs Thermostat
2,Apple-Inc 57FC8F arp Cisco-Meraki 024121 <EOS>...,Apple iPhone
3,Apple-Inc 31D358 65474 tcp GOOGLE 443 <EOS> Ap...,Apple Computer
4,Apple-Inc 407CA6 arp Cisco-Meraki 024121 <EOS>...,Apple Computer


In [199]:
# Show the complete aggregated sentence for the row with index label 3.
final_agg_df.loc[3,'sentence']

'Apple-Inc 31D358 65474 tcp GOOGLE 443 <EOS> Apple-Inc 31D358 61502 tcp:data GOOGLE 4070 <EOS> Apple-Inc 31D358 52167 udp:data GOOGLE 443 <EOS> Apple-Inc 31D358 61502 tcp GOOGLE 4070 <EOS> Apple-Inc 31D358 62476 tcp GOOGLE 443 <EOS> Apple-Inc 31D358 61502 tcp:data GOOGLE 4070 <EOS> Apple-Inc 31D358 arp  <EOS> Apple-Inc 31D358 61502 tcp GOOGLE 4070 <EOS> Apple-Inc 31D358 61544 tcp GOOGLE 5228 <EOS> Apple-Inc 31D358 arp  <EOS> Apple-Inc 31D358 52167 udp:data GOOGLE 443 <EOS> Apple-Inc 31D358 65461 tcp GOOGLE 443 <EOS> Apple-Inc 31D358 arp  <EOS> Apple-Inc 31D358 65474 tcp GOOGLE 443 <EOS> Apple-Inc 31D358 49386 tcp GOOGLE 443 <EOS> Apple-Inc 31D358 arp  <EOS> Apple-Inc 31D358 62476 tcp GOOGLE 443 <EOS> Apple-Inc 31D358 arp  <EOS> Apple-Inc 31D358 65105 tcp GOOGLE 443 <EOS> Apple-Inc 31D358 65461 tcp GOOGLE 443 <EOS> Apple-Inc 31D358 57659 udp:data GOOGLE 443 <EOS> Apple-Inc 31D358 57659 udp:data GOOGLE 443 <EOS> Apple-Inc 31D358 57659 udp:data GOOGLE 443 <EOS> Apple-Inc 31D358 65474 ssl 

In [184]:
from sklearn import preprocessing
# Fit a label encoder on the device-type strings so each distinct type can be
# mapped to an integer class id.
le = preprocessing.LabelEncoder()
le.fit(final_agg_df['type'])


LabelEncoder()

In [185]:
# Device types learned by the encoder; a type's position is its class id.
le.classes_

array(['Amazon Technologies Echo', 'Apple Computer', 'Apple TV',
       'Apple iPhone', 'Belkin Wallplug', 'Meraki Phone',
       'Nest Labs Thermostat', 'Nintendo Switch', 'Sonos Speaker'],
      dtype=object)

In [186]:
# Replace the textual device type with its integer class id from the fitted
# encoder. The earlier stand-alone le.transform(...) call discarded its result,
# so it only repeated this work and has been removed.
# NOTE: this cell drops 'type', so re-running it without rebuilding
# final_agg_df fails with a KeyError.
final_agg_df['label'] = le.transform(final_agg_df['type'])
final_agg_df = final_agg_df.drop(['type'], axis=1)
final_agg_df.head()

Unnamed: 0,sentence,label
0,Cisco-Meraki 122F2D arp Cisco-Meraki 024121 <E...,5
1,Nest-Labs-Inc 0D8718 arp Cisco-Meraki 024121 <...,6
2,Apple-Inc 57FC8F arp Cisco-Meraki 024121 <EOS>...,3
3,Apple-Inc 31D358 65474 tcp GOOGLE 443 <EOS> Ap...,1
4,Apple-Inc 407CA6 arp Cisco-Meraki 024121 <EOS>...,1


In [193]:
# Number of distinct device types; used as n_classes for the classifier.
len(le.classes_)

9

# Are all devices there?

In [176]:
# NOTE(review): .head() shows only the first five rows, so it cannot confirm
# that every device is present - compare the row count with known_devices instead.
final_agg_df.head()

Unnamed: 0,sentence,labels
0,Cisco-Meraki 122F2D arp Cisco-Meraki 024121 <E...,5
1,Nest-Labs-Inc 0D8718 arp Cisco-Meraki 024121 <...,6
2,Apple-Inc 57FC8F arp Cisco-Meraki 024121 <EOS>...,3
3,Apple-Inc 31D358 65474 tcp GOOGLE 443 <EOS> Ap...,1
4,Apple-Inc 407CA6 arp Cisco-Meraki 024121 <EOS>...,1


## Let the training start

In [187]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt

# Input function for training: shuffled, and repeated indefinitely
# (num_epochs=None), so the number of training steps bounds the run.
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=final_agg_df,
    y=final_agg_df["label"],
    num_epochs=None,
    shuffle=True,
)

# Input function for scoring the training data itself, in its original order.
predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=final_agg_df,
    y=final_agg_df["label"],
    shuffle=False,
)

In [188]:
# NOTE(review): this "test" input is built from the same frame as the training
# input, so it is not held-out data.
predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=final_agg_df,
    y=final_agg_df["label"],
    shuffle=False,
)

In [189]:
# Feature column that embeds the 'sentence' text with a pre-trained TF-Hub module.
# NOTE(review): the module name suggests an English-language model; most tokens
# here are ports, MAC tails and org names, which are presumably out of its
# vocabulary - confirm this embedding is appropriate.
embedded_text_feature_column = hub.text_embedding_column(
    key="sentence", 
    module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")

In [194]:
# Feed-forward classifier on top of the sentence embedding: two hidden layers
# (500 and 100 units), one output class per device type known to the label
# encoder, trained with Adagrad at a learning rate of 0.003.
estimator = tf.estimator.DNNClassifier(
    hidden_units=[500, 100],
    feature_columns=[embedded_text_feature_column],
    n_classes=len(le.classes_),
    optimizer=tf.train.AdagradOptimizer(learning_rate=0.003))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/2r/9kw2cykj317_f5bzwgspyfbr0000gn/T/tmprhiedh9g', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x128949748>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [195]:
# Train for 1,000 steps. The training input function repeats the data
# indefinitely (num_epochs=None), so `steps` is what bounds training here.
# NOTE(review): final_agg_df holds roughly one row per known device, so each
# step resamples the same few rows rather than streaming a large corpus.
estimator.train(input_fn=train_input_fn, steps=1000);

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Initialize variable dnn/input_from_feature_columns/input_layer/sentence_hub_module_embedding/module/embeddings/part_0:0 from checkpoint b'/var/folders/2r/9kw2cykj317_f5bzwgspyfbr0000gn/T/tfhub_modules/32f2b2259e1cc8ca58c876921748361283e73997/variables/variables' with embeddings
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/2r/9kw2cykj317_f5bzwgspyfbr0000gn/T/tmprhiedh9g/model.ckpt.
INFO:tensorflow:loss = 317.35834, step = 1
INFO:tensorflow:global_step/sec: 15.8828
INFO:tensorflow:loss = 5.729344, step = 101 (6.297 sec)
INFO:tensorflow:global_step/sec: 15.7079
INFO:tensorflow:loss = 2.2258756, step = 201 (6.366 sec)
INFO:tensorflow:global_step/sec: 15.4928
INFO:tensorflow:loss = 1.0092354, step = 301 (6.455 sec)
INFO:tensorfl

In [196]:
# Score the trained model on the data it was trained on. This is training
# accuracy only, not a measure of generalisation.
train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)

print("Training set accuracy: {accuracy}".format(accuracy=train_eval_result["accuracy"]))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Initialize variable dnn/input_from_feature_columns/input_layer/sentence_hub_module_embedding/module/embeddings/part_0:0 from checkpoint b'/var/folders/2r/9kw2cykj317_f5bzwgspyfbr0000gn/T/tfhub_modules/32f2b2259e1cc8ca58c876921748361283e73997/variables/variables' with embeddings
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-06-30-14:23:38
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/2r/9kw2cykj317_f5bzwgspyfbr0000gn/T/tmprhiedh9g/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-06-30-14:23:40
INFO:tensorflow:Saving dict for global step 1000: accuracy = 1.0, average_loss = 0.0015652621, global_step = 1000, loss = 0.028174717
Training set accuracy: 1.0
