# IoT Project

In [7]:
import numpy as np
import pandas as pd
from collections import defaultdict
from scapy.all import *
from IPy import IP as IPy
import pprint
from sklearn import preprocessing
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters


#### Internet IP address <-> Organization mapping

In [8]:
ASN_df = pd.read_csv("./ip2asn-combined.tsv", sep='\t', header=None)
ASN_df.columns = ['start','end','asn','country','organization']

# remove all the "Not routed" rows
ASN_df = ASN_df[ASN_df['organization'] != 'Not routed']

# add numerical representation for the start and end IP range for faster org search later
ASN_df['start.dec'] = ASN_df['start'].apply(lambda x: float(IPy(x).strDec()))
ASN_df['end.dec'] = ASN_df['end'].apply(lambda x: float(IPy(x).strDec()))

## Device labeling  
The packet traces we are using come from my home network. I know what these devices are. I am going to import the device information for these devices and merge it with the network traffic dataframe we are using.

In [9]:
OUI_df = pd.read_csv('oui.csv')
known_devices = pd.read_csv('known_devices.csv')
known_devices['OUI'] = known_devices['MAC address'].apply(lambda x: ''.join(x.upper().split(':')[0:3]))
known_devices['Manufacturer Device Type'] = ([' '.join(row) for row in 
                        zip(known_devices["Manufacturer"],known_devices["Device Type"])])

In [27]:
# baseline
tshark_cmd = "-T fields -E header=y -e frame.number -e frame.time -e eth.src -e eth.dst -e frame.protocols"

#add IP/TCP/UDP/ICMP layers
tshark_cmd += " -e ip.src -e ip.dst -e tcp.srcport -e tcp.dstport -e udp.srcport -e udp.dstport"

# add DSN / mDNS layer
tshark_cmd += " -e dns.qry.name -e dns.resp.name -e dns.cname -e dns.a"

# add HTTP layer
tshark_cmd += " -e http.request.method -e http.request.uri -e http.user_agent -e http.host"

# add SSL certificate layer
tshark_cmd += " -e x509sat.printableString -e x509sat.uTF8String"

!tshark -tud -n -r ./packet_capture_2.pcap {tshark_cmd} > packets.csv

In [28]:
def determineZone(ip):
    """
    This function determines if the IP 
    address is internal or public 
    according to RFC1918
    """
    
    if pd.notna(ip):
        try:
            return IPy(ip).iptype()
        except ValueError:
            return IPy(ip.split(',')[0]).iptype()
    return ip


#extract OUI from eth.src
df = pd.read_csv("./packets.csv", sep='\t')

df['oui.src'] = df['eth.src'].apply(lambda x: ''.join(x.upper().split(':')[0:3]))
df['tail.src'] = df['eth.src'].apply(lambda x: ''.join(x.upper().split(':')[3:6]))
df['oui.dst'] = df['eth.dst'].apply(lambda x: ''.join(x.upper().split(':')[0:3]))
df['tail.dst'] = df['eth.dst'].apply(lambda x: ''.join(x.upper().split(':')[3:6]))

df = df.merge(OUI_df[list(['Assignment', 'Organization Name'])], 
             left_on = 'oui.src',  right_on= 'Assignment', how = 'left').drop(['Assignment'], axis = 1)
df.rename(columns={'Organization Name':'org.src'}, inplace=True)

df = df.merge(OUI_df[list(['Assignment', 'Organization Name'])], 
             left_on = 'oui.dst',  right_on= 'Assignment', how = 'left').drop(['Assignment'], axis = 1)
df.rename(columns={'Organization Name':'org.dst'}, inplace=True)

# let's get rid of white spaces
df['org.src'] = df['org.src'].apply(lambda x: x.replace('.','').replace(',','').replace(" ", "-") if pd.notna(x) else x)
df['org.dst'] = df['org.dst'].apply(lambda x: x.replace('.','').replace(',','').replace(" ", "-") if pd.notna(x) else x)

df['zone.src'] = df['ip.src'].apply(lambda x: determineZone(x))
df['zone.dst'] = df['ip.dst'].apply(lambda x: determineZone(x))

# clean up IP addresses that have multiple source or dest values
df['ip.src'] = df['ip.src'].apply(lambda x: x.split(',')[0] if pd.notna(x) else x)
df['ip.dst'] = df['ip.dst'].apply(lambda x: x.split(',')[0] if pd.notna(x) else x)

df['protocol'] = df['frame.protocols'].apply(lambda x: x.split(':')[-1])



  interactivity=interactivity, compiler=compiler, result=result)


In [29]:
# let's create a list of all the public IPs we see
public_IPs = set()
public_IPs_orgs = {}

for index,row in df.iterrows():
    if (row['zone.src'] == "PUBLIC"):
        public_IPs.add(row['ip.src'])
    if (row['zone.dst'] == "PUBLIC"):
        public_IPs.add(row['ip.dst'])
        
# this is a crazy hack... it is best I explain it live
# but all it does is to match a public IP to the Internet wide Organization (ASN) it belongs to

for ip in public_IPs:
    public_IPs_orgs[ip] = ASN_df[(ASN_df['start.dec'] <= float(IPy(ip).strDec())) & 
                                   (ASN_df['end.dec'] >= float(IPy(ip).strDec()))]['organization']\
                        .values[0].split(' ')[0]

In [31]:
# replace org data for public IPs based on the ASN information
df['org.src'] = df[['zone.src','ip.src','org.src']].apply(lambda x: ( public_IPs_orgs[x['ip.src']] if x['zone.src'] == "PUBLIC" else x['org.src']), axis=1)
df['org.dst'] = df[['zone.dst','ip.dst','org.dst']].apply(lambda x: ( public_IPs_orgs[x['ip.dst']] if x['zone.dst'] == "PUBLIC" else x['org.dst']), axis=1)

In [36]:
df['frame.protocols'].value_counts()

eth:ethertype:vlan:ethertype:ip:tcp                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    72634
eth:ethertype:vlan:ethertype:ip:tcp:ssl                                                                                                                                                                                                                                                                                                                                                                    

In [41]:
# protocols we see
# 'gquic', 'igmp', 'ssdp', 'x509ce', 'json'

def protoLang(line):
    proto = line['protocol']
    try:
        
         # x509ce (certificate exchange)
        if line['frame.protocols'].find('x509') > 0:
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['tcp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['tcp.dstport']))
            x509ce_message = ''
            if pd.notna(line['x509sat.printableString']):
                x509ce_message += line['x509sat.printableString']
            if pd.notna(line['x509sat.uTF8String']):
                x509ce_message += line['x509sat.uTF8String']
            return " ".join([src,srcport,proto,dst,dstport,x509ce_message])
        
        ## ssl 
        elif proto == 'ssl':
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['tcp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['tcp.dstport']))
            return " ".join([src,srcport,proto,dst,dstport])
                
        
        ## tcp:data
        elif line['frame.protocols'].find('tcp:data') > 0:
            proto = 'tcp:data'
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['tcp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['tcp.dstport']))
            return " ".join([src,srcport,proto,dst,dstport])
        
        ## tcp:stun
        ## TODO: stun.att.software "stun.att.realm": "belkin.org",
        elif line['frame.protocols'].find('tcp:stun') > 0:
            proto = 'tcp:stun'
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['tcp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['tcp.dstport']))
            return " ".join([src,srcport,proto,dst,dstport])
        
        ## tcp
        elif proto == 'tcp':
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['tcp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['tcp.dstport']))
            return " ".join([src,srcport,proto,dst,dstport])
        
        ## upd:data
        ## TODO: Detect broadcasts
        elif line['frame.protocols'].find('udp:data') > 0:
            proto = 'udp:data'
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['udp.srcport']))
            if line['zone.dst'] == 'PRIVATE':
                if pd.notna(line['org.dst']):
                    dst =  " ".join([line['org.dst'],line['tail.dst']])
                else:
                    dst = ''
            elif line['zone.dst'] == 'PUBLIC':
                dst = line['org.dst']
            else:
                dst = ''
            dstport = str(int(line['udp.dstport']))
            return " ".join([src,srcport,proto,dst,dstport])
        
        ## upd:nbns (netbios)
        ## TODO: nbns.name": "MACBOOKAIR-7040<00> (Workstation\/Redirector)",
        elif line['frame.protocols'].find('udp:nbns') > 0:
            proto = 'udp:nbns'
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['udp.srcport']))
            #dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    #else line['org.dst']
            #dstport = str(int(line['udp.dstport']))
            return " ".join([src,srcport,proto])
        
        ## upd:bootp (dhcp)
        ## TODO: "bootp.option.hostname": "amazon-c4475da2a"
        ## TODO: "bootp.type": "2" (1 is request, 2 is reply)
        ## if it is a reply, add the dst IP address too
        elif line['frame.protocols'].find('udp:bootp') > 0:
            proto = 'udp:bootp'
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['udp.srcport']))
            #dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    #else line['org.dst']
            #dstport = str(int(line['udp.dstport']))
            return " ".join([src,srcport,proto])
        
        ## udP:gquic
        ## TODO: "gquic.tag.sni": "0.docs.google.com"
        ## TODO: "gquic.tag.uaid": "Chrome\/65.0.3325.181 Intel Mac OS X 10_13_3" 
        ## TODO: "gquic.tag": "CHLO" (client hello)
        elif line['frame.protocols'].find('udp:quic') > 0:
            proto = 'udp:gquic'
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['udp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['udp.dstport']))
            return " ".join([src,srcport,proto,dst,dstport])
        
        ## ssdp (simple service discovery protocol)
        ## TODO: http.server: "Linux UPnP\/1.0 Sonos\/41.3-50131 (ZPS12)
        ## TODO: http.unknown_header: "HOUSEHOLD.SMARTSPEAKER.AUDIO: Sonos_hOcMvZ0JBvDVZz7BXZc5ILQAT5.Cd7MOjIUy3HWHWEXItIZ\\r\\n",
        ## TODO: http.request.full_uri: "http:\/\/239.255.255.250:1900*",
        elif proto == 'ssdp':
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['udp.srcport']))
            if (line['zone.dst'] == 'PRIVATE'):
                dst = " ".join(line['org.dst'],line['tail.dst'])
            elif (line['zone.dst'] == 'RESERVED'):
                dst = line['eth.dst']
            else:
                dst = line['org.dst']
            dstport = str(int(line['udp.dstport']))
            return " ".join([src,srcport,proto,dst,dstport])
        
        ## db-lsp-disc:json (Dropbox Lan sync Discovery Protocol)
        ## TODO: Detect and use Broadcast
        ## TODO: eth.addr_resolved": "Broadcast"
        elif line['frame.protocols'].find('db-lsp-disc:json') > 0:
            proto = 'db-lsp-disc'
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['udp.srcport']))
            dstport = str(int(line['udp.dstport']))
            return " ".join([src,srcport,proto,dstport])
        
        ## udp
        elif proto == 'udp':
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['udp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['udp.dstport']))
            return " ".join([src,srcport,proto,dst,dstport])
        
        ## ntp (network time protocol)
        elif proto == 'ntp':
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['udp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['udp.dstport']))
            return " ".join([src,srcport,proto,dst,dstport])
        
        
        ## ARP
        elif proto == 'arp':
            src =  " ".join([line['org.src'],line['tail.src']]) 
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if pd.notna(line['org.dst']) else '' 
            return " ".join([src,proto,dst])
            
        ## ICMPv6
        ## TODO: There is more information to be extracted here
        elif proto == 'icmpv6':
            src =  " ".join([line['org.src'],line['tail.src']]) 
            return " ".join([src,proto])
        
        ## STP (spanning tree protocol)
        elif proto == 'stp':
            src =  " ".join([line['org.src'],line['tail.src']]) 
            return " ".join([src,proto])
        
        ## mdns
        elif proto == 'mdns':
            src =  " ".join([line['org.src'],line['tail.src']]) 
            srcport = str(int(line['udp.srcport']))
            dst = line['eth.dst']
            mdns_string =   (line['dns.qry.name'] + " " if pd.notna(line['dns.qry.name']) else '') + \
                            (line['dns.resp.name'] + " " if pd.notna(line['dns.resp.name']) else '') +\
                            (line['dns.cname'] + " " if pd.notna(line['dns.cname']) else '') + \
                            (line['dns.a'] if pd.notna(line['dns.a']) else '')    
            return " ".join([src,srcport,proto,mdns_string])
        
        ## igmp
        elif proto == 'igmp':
            src =  " ".join([line['org.src'],line['tail.src']]) 
            dst = line['eth.dst']
            return " ".join([src,proto,dst])
        
        ## dns
        elif proto == 'dns':
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['udp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['udp.dstport']))
            dns_string =   (line['dns.qry.name'] + " " if pd.notna(line['dns.qry.name']) else '') + \
                            (line['dns.resp.name'] + " " if pd.notna(line['dns.resp.name']) else '') +\
                            (line['dns.cname'] + " " if pd.notna(line['dns.cname']) else '') + \
                            (line['dns.a'] if pd.notna(line['dns.a']) else '')
            #print(dns_string)
            return " ".join([src,srcport,proto,dst,dstport,dns_string])

        ## http 
        elif proto == 'http':
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['tcp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['tcp.dstport']))
            payload = (line['http.request.method'] + " " if pd.notna(line['http.request.method']) else '') + \
                    (line['http.request.uri'] + " " if pd.notna(line['http.request.uri']) else '') + \
                    (line['http.user_agent'] + " " if pd.notna(line['http.user_agent']) else '') + \
                    (line['http.host'] if pd.notna(line['http.host']) else '')
            #print(payload)
            return " ".join([src,srcport,proto,dst,dstport,payload])
        
        ## http:data
        ## TODO: add http.file_data content
        elif line['frame.protocols'].find('http:data') > 0:
            proto = 'http:data'
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['tcp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['tcp.dstport']))
            payload = (line['http.request.method'] + " " if pd.notna(line['http.request.method']) else '') + \
                    (line['http.request.uri'] + " " if pd.notna(line['http.request.uri']) else '') + \
                    (line['http.user_agent'] + " " if pd.notna(line['http.user_agent']) else '') + \
                    (line['http.host'] if pd.notna(line['http.host']) else '')
            #print(payload)
            return " ".join([src,srcport,proto,dst,dstport,payload])
        
        ## http:media
        ## TODO: add  http.content_type , http.content_length
        elif line['frame.protocols'].find('http:media') > 0:
            proto = 'http:media'
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['tcp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['tcp.dstport']))
            return " ".join([src,srcport,proto,dst,dstport])
        
        
        ## icmp:data
        ## TODO: Consider adding icmp.type to the frame / protocol language
        ## TODO: Detect broadcast
        elif line['frame.protocols'].find('icmp:data') > 0:
            proto = 'icmp:data'
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            if line['zone.dst'] == 'PRIVATE':
                if pd.notna(line['org.dst']):
                    dst =  " ".join([line['org.dst'],line['tail.dst']])
                else:
                    dst = ""
            else:
                dst = line['org.dst']
            return " ".join([src,proto,dst])
        
        ## ethertype:data
        elif line['frame.protocols'].find('ethertype:data') > 0:
            proto = 'ethertype:data'
            src =  " ".join([line['org.src'],line['tail.src']]) 
            return " ".join([src,proto])
        
        # anything else
        else:
            print(line['frame.protocols'])
            print("undetected protocol: {}".format(proto))
    except ValueError:
        print('Exception!!')
        print(line)
        
    

In [42]:
df['sentence'] = df.apply(protoLang, axis=1)

eth:ethertype:vlan:ethertype:ip:udp:sigcomp
undetected protocol: sigcomp
eth:ethertype:vlan:ethertype:ip:udp:sigcomp
undetected protocol: sigcomp
eth:ethertype:vlan:ethertype:ip:udp:sigcomp
undetected protocol: sigcomp
eth:ethertype:vlan:ethertype:ip:udp:sigcomp
undetected protocol: sigcomp
eth:ethertype:vlan:ethertype:ip:udp:sigcomp
undetected protocol: sigcomp
eth:ethertype:vlan:ethertype:ip:udp:sigcomp
undetected protocol: sigcomp
eth:ethertype:vlan:ethertype:ip:udp:sigcomp
undetected protocol: sigcomp
eth:ethertype:vlan:ethertype:ip:udp:sigcomp
undetected protocol: sigcomp
eth:ethertype:vlan:ethertype:ip:udp:sigcomp
undetected protocol: sigcomp
eth:ethertype:vlan:ethertype:ip:udp:sigcomp
undetected protocol: sigcomp
eth:ethertype:vlan:ethertype:ip:udp:sigcomp
undetected protocol: sigcomp
eth:ethertype:vlan:ethertype:ip:udp:sigcomp
undetected protocol: sigcomp
eth:ethertype:vlan:ethertype:ip:udp:sigcomp
undetected protocol: sigcomp
eth:ethertype:vlan:ethertype:ip:udp:sigcomp
undetec

eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protoco

eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protoco

eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protoco

eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protoco

undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:st

eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protoco

eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protoco

eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protoco

eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protoco

eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protoco

eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protoco

eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protoco

eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protoco

eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protoco

eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protoco

eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protoco

eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protoco

eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protoco

eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protoco

eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protoco

eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protocol: data
eth:ethertype:vlan:ethertype:ip:udp:stun:data
undetected protoco

In [34]:
vocabulary = []
for line in df['sentence'].tolist():
    vocabulary += line.split()
unique_words = set(vocabulary)

AttributeError: 'NoneType' object has no attribute 'split'

In [18]:
src_agg_df = df.groupby('eth.src')['sentence'].agg(lambda x: "".join(x + " <EOS> ")).reset_index()
dst_agg_df = df.groupby('eth.dst')['sentence'].agg(lambda x: "".join(x + " <EOS> ")).reset_index()

final_agg_df = src_agg_df.merge(dst_agg_df, left_on='eth.src', right_on='eth.dst', how = 'outer')
final_agg_df.columns = ["eth.src", "sentence.src", "eth.dst", "sentence.dst"]
#TODO: Filter reserved MAC ranges

final_agg_df['eth'] = final_agg_df[['eth.src','eth.dst']]\
                        .apply(lambda row: row['eth.src'] if pd.notna(row['eth.src']) else row['eth.dst'], axis=1)

#TODO: Redo merging sentences so the sequence of the language is preserved
final_agg_df['sentence'] = final_agg_df[['sentence.src', 'sentence.dst']]\
        .apply(lambda row: (row['sentence.src'] + " " if pd.notna(row['sentence.src']) else "") +\
                           (row['sentence.dst'] + " " if pd.notna(row['sentence.dst']) else ""), axis=1)

final_agg_df = final_agg_df.drop(['eth.src','sentence.src','eth.dst','sentence.dst'], axis=1)

final_agg_df = final_agg_df.merge(known_devices[list(['MAC address','Manufacturer Device Type'])], 
             left_on = 'eth',  right_on= 'MAC address', how = 'inner').drop(['eth','MAC address'], axis=1)
final_agg_df.columns = ['sentence','type']

## SKLearn - labels

In [19]:
le = preprocessing.LabelEncoder()
le.fit(final_agg_df['type'])
le.transform(final_agg_df['type'])
final_agg_df['label'] = le.transform(final_agg_df['type'])
final_agg_df = final_agg_df.drop(['type'], axis=1)

In [26]:
final_agg_df.to_csv("final_test.csv")

## Let the training start

In [20]:
# Training input on the whole training set with no limit on training epochs.
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    final_agg_df, final_agg_df["label"], num_epochs=None, shuffle=True)

# Prediction on the whole training set.
predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
    final_agg_df, final_agg_df["label"], shuffle=False)

In [21]:
predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
    final_agg_df, final_agg_df["label"], shuffle=False)

In [22]:
embedded_text_feature_column = hub.text_embedding_column(
    key="sentence", 
    module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")

INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.
INFO:tensorflow:Downloading TF-Hub Module 'https://tfhub.dev/google/nnlm-en-dim128/1'.
INFO:tensorflow:Downloaded TF-Hub Module 'https://tfhub.dev/google/nnlm-en-dim128/1'.


In [23]:
estimator = tf.estimator.DNNClassifier(
    hidden_units=[500, 100],
    feature_columns=[embedded_text_feature_column],
    n_classes=len(le.classes_),
    optimizer=tf.train.AdagradOptimizer(learning_rate=0.003))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp58p1t4ir', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f93dca72710>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [24]:
# Training for 1,000 steps means 128,000 training examples with the default
# batch size. This is roughly equivalent to 5 epochs since the training dataset
# contains 25,000 examples.
estimator.train(input_fn=train_input_fn, steps=1000);

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Initialize variable dnn/input_from_feature_columns/input_layer/sentence_hub_module_embedding/module/embeddings/part_0:0 from checkpoint b'/tmp/tfhub_modules/32f2b2259e1cc8ca58c876921748361283e73997/variables/variables' with embeddings
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmp58p1t4ir/model.ckpt.
INFO:tensorflow:loss = 301.00247, step = 1
INFO:tensorflow:global_step/sec: 7.75935
INFO:tensorflow:loss = 6.9243665, step = 101 (12.893 sec)
INFO:tensorflow:global_step/sec: 8.01889
INFO:tensorflow:loss = 3.015737, step = 201 (12.469 sec)
INFO:tensorflow:global_step/sec: 7.71707
INFO:tensorflow:loss = 1.3743415, step = 301 (12.958 sec)
INFO:tensorflow:global_step/sec: 7.82304
INFO:tensorflow:loss = 0.7175687, step = 401 (12.783 sec)

In [25]:
train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)

print("Training set accuracy: {accuracy}".format(**train_eval_result))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Initialize variable dnn/input_from_feature_columns/input_layer/sentence_hub_module_embedding/module/embeddings/part_0:0 from checkpoint b'/tmp/tfhub_modules/32f2b2259e1cc8ca58c876921748361283e73997/variables/variables' with embeddings
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-07-01-04:02:29
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp58p1t4ir/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-07-01-04:02:31
INFO:tensorflow:Saving dict for global step 1000: accuracy = 1.0, average_loss = 0.0018464316, global_step = 1000, loss = 0.03323577
Training set accuracy: 1.0
