# IoT Project

### We are going to load up 2 databases  
#### Organization Unique Identifier (OUI)

OUI is a unique identifier per organization
The first 6 alphanumerical characters of the MAC address for each device is the OUI

In [3]:
import numpy as np
import pandas as pd
from collections import defaultdict
from scapy.all import *
from IPy import IP as IPy
import pprint


OUI_df = pd.read_csv('oui.csv')
OUI_df.head()

Unnamed: 0,Registry,Assignment,Organization Name,Organization Address
0,MA-L,E043DB,"Shenzhen ViewAt Technology Co.,Ltd.","9A,Microprofit,6th Gaoxin South Road, High-Tec..."
1,MA-L,2405F5,Integrated Device Technology (Malaysia) Sdn. Bhd.,"Phase 3, Bayan Lepas FIZ Bayan Lepas Penang MY..."
2,MA-L,3CD92B,Hewlett Packard,11445 Compaq Center Drive Houston US 77070
3,MA-L,9C8E99,Hewlett Packard,11445 Compaq Center Drive Houston US 77070
4,MA-L,B499BA,Hewlett Packard,11445 Compaq Center Drive Houston US 77070


#### Internet IP address <-> Organization mapping

### Note:   
you need to download this file (large) to your local directory!  
https://iptoasn.com/data/ip2asn-combined.tsv.gz   
and unzip it (so you have a local copy of ip2asn-combined.tsv)

In [4]:
ASN_df = pd.read_csv("./ip2asn-combined.tsv", sep='\t', header=None)
ASN_df.columns = ['start','end','asn','country','organization']
ASN_df.head(100)

Unnamed: 0,start,end,asn,country,organization
0,0.0.0.1,0.255.255.255,0,,Not routed
1,1.0.0.0,9.255.255.255,0,,Not routed
2,10.0.0.0,10.255.255.255,0,,Not routed
3,11.0.0.0,100.63.255.255,0,,Not routed
4,100.64.0.0,100.127.255.255,0,,Not routed
5,100.128.0.0,126.255.255.255,0,,Not routed
6,127.0.0.0,127.255.255.255,0,,Not routed
7,128.0.0.0,169.253.255.255,0,,Not routed
8,169.254.0.0,169.254.255.255,0,,Not routed
9,169.255.0.0,172.15.255.255,0,,Not routed


In [5]:
# remove all the "Not routed" rows
ASN_df = ASN_df[ASN_df['organization'] != 'Not routed']

In [6]:
# add numerical representation for the start and end IP range for faster org search later
ASN_df['start.dec'] = ASN_df['start'].apply(lambda x: float(IPy(x).strDec()))
ASN_df['end.dec'] = ASN_df['end'].apply(lambda x: float(IPy(x).strDec()))

In [7]:
ASN_df.head()

Unnamed: 0,start,end,asn,country,organization,start.dec,end.dec
29,1.0.4.0,1.0.4.255,56203,AU,GTELECOM-AUSTRALIA Gtelecom-AUSTRALIA,16778240.0,16778495.0
30,1.0.5.0,1.0.5.255,56203,AU,GTELECOM-AUSTRALIA Gtelecom-AUSTRALIA,16778496.0,16778751.0
31,1.0.6.0,1.0.6.255,56203,AU,GTELECOM-AUSTRALIA Gtelecom-AUSTRALIA,16778752.0,16779007.0
32,1.0.7.0,1.0.7.255,56203,AU,GTELECOM-AUSTRALIA Gtelecom-AUSTRALIA,16779008.0,16779263.0
34,1.0.16.0,1.0.16.255,2519,Unknown,AS2519,16781312.0,16781567.0


In [8]:
# baseline
tshark_cmd = "-T fields -E header=y -e frame.number -e frame.time -e eth.src -e eth.dst -e frame.protocols"

#add IP/TCP/UDP/ICMP layers
tshark_cmd += " -e ip.src -e ip.dst -e tcp.srcport -e tcp.dstport -e udp.srcport -e udp.dstport"

# add DSN / mDNS layer
tshark_cmd += " -e dns.qry.name -e dns.resp.name -e dns.cname -e dns.a"

# add HTTP layer
tshark_cmd += " -e http.request.method -e http.request.uri -e http.user_agent -e http.host"

# add SSL certificate layer
tshark_cmd += " -e x509sat.printableString -e x509sat.uTF8String"


In [9]:
!tshark -tud -n -r ./packet_capture.pcap {tshark_cmd} > packets.csv

In [175]:
#!head -n 5 packets.csv 
#!tail -n 5 packets.csv

In [10]:
df = pd.read_csv("./packets.csv", sep='\t')

In [11]:
df.head()

Unnamed: 0,frame.number,frame.time,eth.src,eth.dst,frame.protocols,ip.src,ip.dst,tcp.srcport,tcp.dstport,udp.srcport,...,dns.qry.name,dns.resp.name,dns.cname,dns.a,http.request.method,http.request.uri,http.user_agent,http.host,x509sat.printableString,x509sat.uTF8String
0,1,"Apr 27, 2018 17:15:18.958000000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,,,,,,,,
1,2,"Apr 27, 2018 17:15:18.958052000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,,,,,,,,
2,3,"Apr 27, 2018 17:15:18.958053000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,,,,,,,,
3,4,"Apr 27, 2018 17:15:18.969772000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,151.101.1.254,192.168.1.14,443.0,59554.0,,...,,,,,,,,,,
4,5,"Apr 27, 2018 17:15:18.971427000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,216.58.195.243,192.168.1.14,443.0,59611.0,,...,,,,,,,,,,


In [12]:
#extract OUI from eth.src
df['oui.src'] = df['eth.src'].apply(lambda x: ''.join(x.upper().split(':')[0:3]))
df['tail.src'] = df['eth.src'].apply(lambda x: ''.join(x.upper().split(':')[3:6]))
df['oui.dst'] = df['eth.dst'].apply(lambda x: ''.join(x.upper().split(':')[0:3]))
df['tail.dst'] = df['eth.dst'].apply(lambda x: ''.join(x.upper().split(':')[3:6]))

In [13]:
df.head()

Unnamed: 0,frame.number,frame.time,eth.src,eth.dst,frame.protocols,ip.src,ip.dst,tcp.srcport,tcp.dstport,udp.srcport,...,http.request.method,http.request.uri,http.user_agent,http.host,x509sat.printableString,x509sat.uTF8String,oui.src,tail.src,oui.dst,tail.dst
0,1,"Apr 27, 2018 17:15:18.958000000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,,,,,843835,5A7040,E0553D,024121
1,2,"Apr 27, 2018 17:15:18.958052000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,,,,,843835,5A7040,E0553D,024121
2,3,"Apr 27, 2018 17:15:18.958053000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,,,,,843835,5A7040,E0553D,024121
3,4,"Apr 27, 2018 17:15:18.969772000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,151.101.1.254,192.168.1.14,443.0,59554.0,,...,,,,,,,E0553D,024121,843835,5A7040
4,5,"Apr 27, 2018 17:15:18.971427000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,216.58.195.243,192.168.1.14,443.0,59611.0,,...,,,,,,,E0553D,024121,843835,5A7040


In [14]:
df = df.merge(OUI_df[list(['Assignment', 'Organization Name'])], 
             left_on = 'oui.src',  right_on= 'Assignment', how = 'left').drop(['Assignment'], axis = 1)
df.rename(columns={'Organization Name':'org.src'}, inplace=True)

df = df.merge(OUI_df[list(['Assignment', 'Organization Name'])], 
             left_on = 'oui.dst',  right_on= 'Assignment', how = 'left').drop(['Assignment'], axis = 1)
df.rename(columns={'Organization Name':'org.dst'}, inplace=True)


In [15]:
# let's get rid of white spaces
df['org.src'] = df['org.src'].apply(lambda x: x.replace('.','').replace(',','').replace(" ", "-") if pd.notna(x) else x)
df['org.dst'] = df['org.dst'].apply(lambda x: x.replace('.','').replace(',','').replace(" ", "-") if pd.notna(x) else x)

In [16]:
df.head()

Unnamed: 0,frame.number,frame.time,eth.src,eth.dst,frame.protocols,ip.src,ip.dst,tcp.srcport,tcp.dstport,udp.srcport,...,http.user_agent,http.host,x509sat.printableString,x509sat.uTF8String,oui.src,tail.src,oui.dst,tail.dst,org.src,org.dst
0,1,"Apr 27, 2018 17:15:18.958000000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,,,843835,5A7040,E0553D,024121,Apple-Inc,Cisco-Meraki
1,2,"Apr 27, 2018 17:15:18.958052000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,,,843835,5A7040,E0553D,024121,Apple-Inc,Cisco-Meraki
2,3,"Apr 27, 2018 17:15:18.958053000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,,,843835,5A7040,E0553D,024121,Apple-Inc,Cisco-Meraki
3,4,"Apr 27, 2018 17:15:18.969772000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,151.101.1.254,192.168.1.14,443.0,59554.0,,...,,,,,E0553D,024121,843835,5A7040,Cisco-Meraki,Apple-Inc
4,5,"Apr 27, 2018 17:15:18.971427000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,216.58.195.243,192.168.1.14,443.0,59611.0,,...,,,,,E0553D,024121,843835,5A7040,Cisco-Meraki,Apple-Inc


In [17]:
# This function determines if the IP 
# address is internal or public 
# according to RFC1918

def determineZone(ip):
    if pd.notna(ip):
        try:
            return IPy(ip).iptype()
        except ValueError:
            return IPy(ip.split(',')[0]).iptype()
    return ip

In [18]:
df['zone.src'] = df['ip.src'].apply(lambda x: determineZone(x))
df['zone.dst'] = df['ip.dst'].apply(lambda x: determineZone(x))

In [19]:
df.head()

Unnamed: 0,frame.number,frame.time,eth.src,eth.dst,frame.protocols,ip.src,ip.dst,tcp.srcport,tcp.dstport,udp.srcport,...,x509sat.printableString,x509sat.uTF8String,oui.src,tail.src,oui.dst,tail.dst,org.src,org.dst,zone.src,zone.dst
0,1,"Apr 27, 2018 17:15:18.958000000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,843835,5A7040,E0553D,024121,Apple-Inc,Cisco-Meraki,PRIVATE,PUBLIC
1,2,"Apr 27, 2018 17:15:18.958052000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,843835,5A7040,E0553D,024121,Apple-Inc,Cisco-Meraki,PRIVATE,PUBLIC
2,3,"Apr 27, 2018 17:15:18.958053000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,,843835,5A7040,E0553D,024121,Apple-Inc,Cisco-Meraki,PRIVATE,PUBLIC
3,4,"Apr 27, 2018 17:15:18.969772000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,151.101.1.254,192.168.1.14,443.0,59554.0,,...,,,E0553D,024121,843835,5A7040,Cisco-Meraki,Apple-Inc,PUBLIC,PRIVATE
4,5,"Apr 27, 2018 17:15:18.971427000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,216.58.195.243,192.168.1.14,443.0,59611.0,,...,,,E0553D,024121,843835,5A7040,Cisco-Meraki,Apple-Inc,PUBLIC,PRIVATE


In [20]:
# clean up IP addresses that have multiple source or dest values
df['ip.src'] = df['ip.src'].apply(lambda x: x.split(',')[0] if pd.notna(x) else x)
df['ip.dst'] = df['ip.dst'].apply(lambda x: x.split(',')[0] if pd.notna(x) else x)

In [21]:
df['protocol'] = df['frame.protocols'].apply(lambda x: x.split(':')[-1])
#df = df.drop(['frame.protocols'], axis = 1)
df.head()

Unnamed: 0,frame.number,frame.time,eth.src,eth.dst,frame.protocols,ip.src,ip.dst,tcp.srcport,tcp.dstport,udp.srcport,...,x509sat.uTF8String,oui.src,tail.src,oui.dst,tail.dst,org.src,org.dst,zone.src,zone.dst,protocol
0,1,"Apr 27, 2018 17:15:18.958000000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,843835,5A7040,E0553D,024121,Apple-Inc,Cisco-Meraki,PRIVATE,PUBLIC,ssl
1,2,"Apr 27, 2018 17:15:18.958052000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,843835,5A7040,E0553D,024121,Apple-Inc,Cisco-Meraki,PRIVATE,PUBLIC,ssl
2,3,"Apr 27, 2018 17:15:18.958053000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,843835,5A7040,E0553D,024121,Apple-Inc,Cisco-Meraki,PRIVATE,PUBLIC,ssl
3,4,"Apr 27, 2018 17:15:18.969772000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,151.101.1.254,192.168.1.14,443.0,59554.0,,...,,E0553D,024121,843835,5A7040,Cisco-Meraki,Apple-Inc,PUBLIC,PRIVATE,tcp
4,5,"Apr 27, 2018 17:15:18.971427000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,216.58.195.243,192.168.1.14,443.0,59611.0,,...,,E0553D,024121,843835,5A7040,Cisco-Meraki,Apple-Inc,PUBLIC,PRIVATE,tcp


In [22]:
# let's create a list of all the public IPs we see
public_IPs = set()

In [23]:
for index,row in df.iterrows():
    if (row['zone.src'] == "PUBLIC"):
        public_IPs.add(row['ip.src'])
    if (row['zone.dst'] == "PUBLIC"):
        public_IPs.add(row['ip.dst'])

In [24]:
# this is a crazy hack... it is best I explain it live
# but all it does is to match a public IP to the Internet wide Organization (ASN) it belongs to
public_IPs_orgs = {}
for ip in public_IPs:
    public_IPs_orgs[ip] = ASN_df[(ASN_df['start.dec'] <= float(IPy(ip).strDec())) & 
                                   (ASN_df['end.dec'] >= float(IPy(ip).strDec()))]['organization']\
                        .values[0].split(' ')[0]

#public_IPs_orgs   

In [25]:
# replace org data for public IPs based on the ASN information
df['org.src'] = df[['zone.src','ip.src','org.src']].apply(lambda x: ( public_IPs_orgs[x['ip.src']] if x['zone.src'] == "PUBLIC" else x['org.src']), axis=1)
df['org.dst'] = df[['zone.dst','ip.dst','org.dst']].apply(lambda x: ( public_IPs_orgs[x['ip.dst']] if x['zone.dst'] == "PUBLIC" else x['org.dst']), axis=1)

In [26]:
df['protocol'].unique()

array(['ssl', 'tcp', 'data', 'quic', 'arp', 'icmpv6', 'mdns', 'stp',
       'dns', 'http', 'data-text-lines', 'ntp', 'media', 'stun', 'nbns',
       'bootp', 'igmp', 'pkcs-1', 'json'], dtype=object)

In [27]:
minidf = df.iloc[0:100,]
minidf.head()

Unnamed: 0,frame.number,frame.time,eth.src,eth.dst,frame.protocols,ip.src,ip.dst,tcp.srcport,tcp.dstport,udp.srcport,...,x509sat.uTF8String,oui.src,tail.src,oui.dst,tail.dst,org.src,org.dst,zone.src,zone.dst,protocol
0,1,"Apr 27, 2018 17:15:18.958000000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,843835,5A7040,E0553D,024121,Apple-Inc,FASTLY,PRIVATE,PUBLIC,ssl
1,2,"Apr 27, 2018 17:15:18.958052000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,843835,5A7040,E0553D,024121,Apple-Inc,FASTLY,PRIVATE,PUBLIC,ssl
2,3,"Apr 27, 2018 17:15:18.958053000 PDT",84:38:35:5a:70:40,e0:55:3d:02:41:21,eth:ethertype:vlan:ethertype:ip:tcp:ssl,192.168.1.14,151.101.1.254,59554.0,443.0,,...,,843835,5A7040,E0553D,024121,Apple-Inc,FASTLY,PRIVATE,PUBLIC,ssl
3,4,"Apr 27, 2018 17:15:18.969772000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,151.101.1.254,192.168.1.14,443.0,59554.0,,...,,E0553D,024121,843835,5A7040,FASTLY,Apple-Inc,PUBLIC,PRIVATE,tcp
4,5,"Apr 27, 2018 17:15:18.971427000 PDT",e0:55:3d:02:41:21,84:38:35:5a:70:40,eth:ethertype:vlan:ethertype:ip:tcp,216.58.195.243,192.168.1.14,443.0,59611.0,,...,,E0553D,024121,843835,5A7040,GOOGLE,Apple-Inc,PUBLIC,PRIVATE,tcp


In [28]:
# protocols we see
# 'gquic', 'igmp', 'ssdp', 'x509ce', 'json'

def protoLang(line):
    proto = line['protocol']
    try:
        
        # ssl 
        if proto == 'ssl':
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['tcp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['tcp.dstport']))
            return " ".join([src,srcport,proto,dst,dstport])
        
         # x509ce 
        if proto == 'x509ce':
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['tcp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['tcp.dstport']))
            x509ce_message = ''
            if pd.notna(line['x509sat.printableString']):
                x509ce_message += line['x509sat.printableString']
            if pd.notna(line['x509sat.uTF8String']):
                x509ce_message += line['x509sat.printableString']
            return " ".join([src,srcport,proto,dst,dstport,x509ce_message])
        
        ## tcp:data
        elif line['frame.protocols'].find('tcp:data') > 0:
            proto = 'tcp:data'
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['tcp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['tcp.dstport']))
            return " ".join([src,srcport,proto,dst,dstport])
        
        ## tcp:stun
        ## TODO: stun.att.software "stun.att.realm": "belkin.org",
        elif line['frame.protocols'].find('tcp:stun') > 0:
            proto = 'tcp:stun'
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['tcp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['tcp.dstport']))
            return " ".join([src,srcport,proto,dst,dstport])
        
        ## tcp
        elif proto == 'tcp':
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['tcp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['tcp.dstport']))
            return " ".join([src,srcport,proto,dst,dstport])
        
        ## upd:data
        ## TODO: Detect broadcasts
        elif line['frame.protocols'].find('udp:data') > 0:
            proto = 'udp:data'
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['udp.srcport']))
            if line['zone.dst'] == 'PRIVATE':
                if pd.notna(line['org.dst']):
                    dst =  " ".join([line['org.dst'],line['tail.dst']])
                else:
                    dst = ''
            elif line['zone.dst'] == 'PUBLIC':
                dst = line['org.dst']
            else:
                dst = ''
            dstport = str(int(line['udp.dstport']))
            return " ".join([src,srcport,proto,dst,dstport])
        
        ## upd:nbns (netbios)
        ## TODO: nbns.name": "MACBOOKAIR-7040<00> (Workstation\/Redirector)",
        elif line['frame.protocols'].find('udp:nbns') > 0:
            proto = 'udp:nbns'
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['udp.srcport']))
            #dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    #else line['org.dst']
            #dstport = str(int(line['udp.dstport']))
            return " ".join([src,srcport,proto])
        
        ## upd:bootp (dhcp)
        ## TODO: "bootp.option.hostname": "amazon-c4475da2a"
        ## TODO: "bootp.type": "2" (1 is request, 2 is reply)
        ## if it is a reply, add the dst IP address too
        elif line['frame.protocols'].find('udp:bootp') > 0:
            proto = 'udp:bootp'
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['udp.srcport']))
            #dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    #else line['org.dst']
            #dstport = str(int(line['udp.dstport']))
            return " ".join([src,srcport,proto])
        
        ## udP:gquic
        ## TODO: "gquic.tag.sni": "0.docs.google.com"
        ## TODO: "gquic.tag.uaid": "Chrome\/65.0.3325.181 Intel Mac OS X 10_13_3" 
        ## TODO: "gquic.tag": "CHLO" (client hello)
        elif line['frame.protocols'].find('udp:gquic') > 0:
            proto = 'udp:gquic'
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['udp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['udp.dstport']))
            return " ".join([src,srcport,proto,dst,dstport])
        
        ## ssdp (simple service discovery protocol)
        ## TODO: http.server: "Linux UPnP\/1.0 Sonos\/41.3-50131 (ZPS12)
        ## TODO: http.unknown_header: "HOUSEHOLD.SMARTSPEAKER.AUDIO: Sonos_hOcMvZ0JBvDVZz7BXZc5ILQAT5.Cd7MOjIUy3HWHWEXItIZ\\r\\n",
        ## TODO: http.request.full_uri: "http:\/\/239.255.255.250:1900*",
        elif proto == 'ssdp':
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['udp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else (line['org.dst'] if pd.notna(line['org.dst']) else line['ip.dst'])
            dstport = str(int(line['udp.dstport']))
            return " ".join([src,srcport,proto,dst,dstport])
        
        ## db-lsp-disc:json (Dropbox Lan sync Discovery Protocol)
        ## TODO: Detect and use Broadcast
        ## TODO: eth.addr_resolved": "Broadcast"
        elif line['frame.protocols'].find('db-lsp-disc:json') > 0:
            proto = 'db-lsp-disc'
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['udp.srcport']))
            dstport = str(int(line['udp.dstport']))
            return " ".join([src,srcport,proto,dstport])
        
        ## udp
        elif proto == 'udp':
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['udp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['udp.dstport']))
            return " ".join([src,srcport,proto,dst,dstport])
        
        ## ntp (network time protocol)
        elif proto == 'ntp':
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['udp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['udp.dstport']))
            return " ".join([src,srcport,proto,dst,dstport])
        
        
        ## ARP
        elif proto == 'arp':
            src =  " ".join([line['org.src'],line['tail.src']]) 
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if pd.notna(line['org.dst']) else '' 
            return " ".join([src,proto,dst])
            
        ## ICMPv6
        ## TODO: There is more information to be extracted here
        elif proto == 'icmpv6':
            src =  " ".join([line['org.src'],line['tail.src']]) 
            return " ".join([src,proto])
        
        ## STP (spanning tree protocol)
        elif proto == 'stp':
            src =  " ".join([line['org.src'],line['tail.src']]) 
            return " ".join([src,proto])
        
        ## mdns
        elif proto == 'mdns':
            src =  " ".join([line['org.src'],line['tail.src']]) 
            srcport = str(int(line['udp.srcport']))
            mdns_string =   (line['dns.qry.name'] + " " if pd.notna(line['dns.qry.name']) else '') + \
                            (line['dns.resp.name'] + " " if pd.notna(line['dns.resp.name']) else '') +\
                            (line['dns.cname'] + " " if pd.notna(line['dns.cname']) else '') + \
                            (line['dns.a'] if pd.notna(line['dns.a']) else '')    
            #print(mdns_string)
            return " ".join([src,srcport,proto,mdns_string])
        
        ## igmp
        elif proto == 'igmp':
            src =  " ".join([line['org.src'],line['tail.src']]) 
            return " ".join([src,proto])
        
        ## dns
        elif proto == 'dns':
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['udp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['udp.dstport']))
            dns_string =   (line['dns.qry.name'] + " " if pd.notna(line['dns.qry.name']) else '') + \
                            (line['dns.resp.name'] + " " if pd.notna(line['dns.resp.name']) else '') +\
                            (line['dns.cname'] + " " if pd.notna(line['dns.cname']) else '') + \
                            (line['dns.a'] if pd.notna(line['dns.a']) else '')
            #print(dns_string)
            return " ".join([src,srcport,proto,dst,dstport,dns_string])

        ## http 
        elif proto == 'http':
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['tcp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['tcp.dstport']))
            payload = (line['http.request.method'] + " " if pd.notna(line['http.request.method']) else '') + \
                    (line['http.request.uri'] + " " if pd.notna(line['http.request.uri']) else '') + \
                    (line['http.user_agent'] + " " if pd.notna(line['http.user_agent']) else '') + \
                    (line['http.host'] if pd.notna(line['http.host']) else '')
            #print(payload)
            return " ".join([src,srcport,proto,dst,dstport,payload])
        
        ## http:data
        ## TODO: add http.file_data content
        elif line['frame.protocols'].find('http:data') > 0:
            proto = 'http:data'
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['tcp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['tcp.dstport']))
            payload = (line['http.request.method'] + " " if pd.notna(line['http.request.method']) else '') + \
                    (line['http.request.uri'] + " " if pd.notna(line['http.request.uri']) else '') + \
                    (line['http.user_agent'] + " " if pd.notna(line['http.user_agent']) else '') + \
                    (line['http.host'] if pd.notna(line['http.host']) else '')
            #print(payload)
            return " ".join([src,srcport,proto,dst,dstport,payload])
        
        ## http:media
        ## TODO: add  http.content_type , http.content_length
        elif line['frame.protocols'].find('http:media') > 0:
            proto = 'http:media'
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            srcport = str(int(line['tcp.srcport']))
            dst =  " ".join([line['org.dst'],line['tail.dst']]) if line['zone.dst'] == 'PRIVATE' \
                    else line['org.dst']
            dstport = str(int(line['tcp.dstport']))
            return " ".join([src,srcport,proto,dst,dstport])
        
        
        ## icmp:data
        ## TODO: Consider adding icmp.type to the frame / protocol language
        ## TODO: Detect broadcast
        elif line['frame.protocols'].find('icmp:data') > 0:
            proto = 'icmp:data'
            src =  " ".join([line['org.src'],line['tail.src']]) if line['zone.src'] == 'PRIVATE' \
                    else line['org.src']
            if line['zone.dst'] == 'PRIVATE':
                if pd.notna(line['org.dst']):
                    dst =  " ".join([line['org.dst'],line['tail.dst']])
                else:
                    dst = ""
            else:
                dst = line['org.dst']
            return " ".join([src,proto,dst])
        
        ## ethertype:data
        elif line['frame.protocols'].find('ethertype:data') > 0:
            proto = 'ethertype:data'
            src =  " ".join([line['org.src'],line['tail.src']]) 
            return " ".join([src,proto])
        
        # anything else
        else:
            print(line)
            return proto
    except ValueError:
        print('Exception!!')
        print(line)
        
    

In [32]:
df.iloc[5391,:]

frame.number                                                            5392
frame.time                               Apr 27, 2018 17:15:47.700474000 PDT
eth.src                                                    e0:55:3d:02:41:21
eth.dst                                                    84:38:35:5a:70:40
frame.protocols            eth:ethertype:vlan:ethertype:ip:tcp:ssl:pkcs-1...
ip.src                                                         74.119.117.82
ip.dst                                                          192.168.1.14
tcp.srcport                                                              443
tcp.dstport                                                            59647
udp.srcport                                                              NaN
udp.dstport                                                              NaN
dns.qry.name                                                             NaN
dns.resp.name                                                            NaN

In [36]:
#df['sentence'] = df.apply(protoLang, axis=1)
df.loc[5391,:]

frame.number                                                            5392
frame.time                               Apr 27, 2018 17:15:47.700474000 PDT
eth.src                                                    e0:55:3d:02:41:21
eth.dst                                                    84:38:35:5a:70:40
frame.protocols            eth:ethertype:vlan:ethertype:ip:tcp:ssl:pkcs-1...
ip.src                                                         74.119.117.82
ip.dst                                                          192.168.1.14
tcp.srcport                                                              443
tcp.dstport                                                            59647
udp.srcport                                                              NaN
udp.dstport                                                              NaN
dns.qry.name                                                             NaN
dns.resp.name                                                            NaN

In [31]:
df['sentence'].head(100)

0                 Apple-Inc 5A7040 59554 ssl FASTLY 443
1                 Apple-Inc 5A7040 59554 ssl FASTLY 443
2                 Apple-Inc 5A7040 59554 ssl FASTLY 443
3                 FASTLY 443 tcp Apple-Inc 5A7040 59554
4                 GOOGLE 443 tcp Apple-Inc 5A7040 59611
5                 Apple-Inc 5A7040 59611 ssl GOOGLE 443
6                 GOOGLE 443 tcp Apple-Inc 5A7040 59611
7                 FASTLY 443 tcp Apple-Inc 5A7040 59554
8                 Apple-Inc 31D358 65474 tcp GOOGLE 443
9                Apple-Inc 5A7040 58270 tcp DROPBOX 443
10                GOOGLE 443 tcp Apple-Inc 31D358 65474
11                FASTLY 443 ssl Apple-Inc 5A7040 59554
12                Apple-Inc 5A7040 59554 tcp FASTLY 443
13                 ICME 8080 tcp Apple-Inc 5A7040 59524
14          Apple-Inc 31D358 61502 tcp:data GOOGLE 4070
15                                                 quic
16          GOOGLE 4070 tcp:data Apple-Inc 31D358 61502
17               Apple-Inc 31D358 61502 tcp GOOG

In [892]:
vocabulary = df['sentence'].tolist()

In [893]:
def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [894]:
data_index = 0
# generate batch data
def generate_batch(data, batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    context = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window input_word skip_window ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # input word at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]  # this is the input word
            context[i * num_skips + j, 0] = buffer[target]  # these are the context words
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, context

In [895]:
vocabulary_size = 10000
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary, vocabulary_size)
del vocabulary  # Hint to reduce memory.

batch_size = 128
embedding_size = 300  # Dimension of the embedding vector.
skip_window = 2       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # Number of nega

In [902]:
len(dictionary)

488

In [896]:
import collections
import math
import os
import random
import zipfile
import datetime as dt

import tensorflow as tf

graph = tf.Graph()

In [897]:
with graph.as_default():
    # Input data.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_context = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Look up embeddings for inputs.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Construct the variables for the softmax
    weights = tf.Variable(
        tf.truncated_normal([embedding_size, vocabulary_size], stddev=1.0 / math.sqrt(embedding_size)))
    biases = tf.Variable(tf.zeros([vocabulary_size]))
    hidden_out = tf.transpose(tf.matmul(tf.transpose(weights), tf.transpose(embed))) + biases

    # convert train_context to a one-hot format
    train_one_hot = tf.one_hot(train_context, vocabulary_size)

    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hidden_out, labels=train_one_hot))

    # Construct the SGD optimizer using a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(cross_entropy)

    # Compute the cosine similarity between minibatch examples and all embeddings.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

    # Add variable initializer.
    init = tf.global_variables_initializer()

In [898]:
def run(graph, num_steps):
    with tf.Session(graph=graph) as session:
        # We must initialize all variables before we use them.
        init.run()
        print('Initialized')

        average_loss = 0
        for step in range(num_steps):
            batch_inputs, batch_context = generate_batch(data, batch_size, num_skips, skip_window)
            feed_dict = {train_inputs: batch_inputs, train_context: batch_context}

            # We perform one update step by evaluating the optimizer op (including it
            # in the list of returned values for session.run()
            _, loss_val = session.run([optimizer, cross_entropy], feed_dict=feed_dict)
            average_loss += loss_val

            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                # The average loss is an estimate of the loss over the last 2000 batches.
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0

            # Note that this is expensive (~20% slowdown if computed every 500 steps)
            if step % 10000 == 0:
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_word = reverse_dictionary[valid_examples[i]]
                    top_k = 8  # number of nearest neighbors
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log_str = 'Nearest to %s:' % valid_word
                    for k in range(top_k):
                        close_word = reverse_dictionary[nearest[k]]
                        log_str = '%s %s,' % (log_str, close_word)
                    print(log_str)
        final_embeddings = normalized_embeddings.eval()

In [899]:
num_steps = 100
softmax_start_time = dt.datetime.now()
run(graph, num_steps=num_steps)
softmax_end_time = dt.datetime.now()
print("Softmax method took {} seconds to run {} iterations".format((softmax_end_time-softmax_start_time).total_seconds(),num_steps))


Initialized
Average loss at step  0 :  9.422405242919922


KeyError: 4159

# Global list of device manufacturers 

Organization Unique Identifier, aka OUI data from the IEEE website:

Unnamed: 0,Registry,Assignment,Organization Name,Organization Address
0,MA-L,E043DB,"Shenzhen ViewAt Technology Co.,Ltd.","9A,Microprofit,6th Gaoxin South Road, High-Tec..."
1,MA-L,2405F5,Integrated Device Technology (Malaysia) Sdn. Bhd.,"Phase 3, Bayan Lepas FIZ Bayan Lepas Penang MY..."
2,MA-L,3CD92B,Hewlett Packard,11445 Compaq Center Drive Houston US 77070
3,MA-L,9C8E99,Hewlett Packard,11445 Compaq Center Drive Houston US 77070
4,MA-L,B499BA,Hewlett Packard,11445 Compaq Center Drive Houston US 77070


In [321]:
dframe.index.name = 'MacID'
dframe.reset_index(inplace=True)
dframe.head()

Unnamed: 0,MacID,DNS,HTTP,HTTPS,mDNS
0,00:18:0a:12:2f:2d,,{'199.231.78.217': 'b'\x17\x03\x01\x02 0<_7\xf...,,
1,28:f0:76:31:d3:58,clients6.google.com.,"{'104.154.127.3': 'b""\x93\x938\x1f\x0f\xe5\xb7...","[173.194.203.95 : GOOGLE - Google LLC, 216.58....",
2,40:cb:c0:bc:36:7e,,,,"40CBC0BC367E@Apple TV._raop._tcp.local.,40CBC0..."
3,44:65:0d:90:60:3a,,"{'104.154.127.60': 'b""M]'3\x13\x8c\x93\xbaoO\x...","[52.46.136.77 : AMAZON-02 - Amazon.com, Inc.]",
4,5c:aa:fd:4c:87:a0,,,"[54.208.201.219 : AMAZON-AES - Amazon.com, Inc.]",


In [322]:
# Let's extract the Organization Unique Identifier, aka OUI from the MAC address
# which is simply the first 6 alphanumerical characters
dframe['OUI'] = dframe['MacID'].apply(lambda x: ''.join(x.upper().split(':')[0:3]))

# Merging the manufacturer information from the master database (downloaded from the IEEE website)
# with our dataframe
dframe = dframe.merge(OUI_df[list(['Assignment', 'Organization Name'])], 
             left_on = 'OUI',  right_on= 'Assignment', how = 'left')\
            [list(['MacID', 'DNS', 'HTTP', 'HTTPS', 'mDNS', 'OUI', 'Organization Name'])]
dframe.head()

Unnamed: 0,MacID,DNS,HTTP,HTTPS,mDNS,OUI,Organization Name
0,00:18:0a:12:2f:2d,,{'199.231.78.217': 'b'\x17\x03\x01\x02 0<_7\xf...,,,00180A,Cisco Meraki
1,28:f0:76:31:d3:58,clients6.google.com.,"{'104.154.127.3': 'b""\x93\x938\x1f\x0f\xe5\xb7...","[173.194.203.95 : GOOGLE - Google LLC, 216.58....",,28F076,"Apple, Inc."
2,40:cb:c0:bc:36:7e,,,,"40CBC0BC367E@Apple TV._raop._tcp.local.,40CBC0...",40CBC0,"Apple, Inc."
3,44:65:0d:90:60:3a,,"{'104.154.127.60': 'b""M]'3\x13\x8c\x93\xbaoO\x...","[52.46.136.77 : AMAZON-02 - Amazon.com, Inc.]",,44650D,Amazon Technologies Inc.
4,5c:aa:fd:4c:87:a0,,,"[54.208.201.219 : AMAZON-AES - Amazon.com, Inc.]",,5CAAFD,"Sonos, Inc."


## Device labeling  
The packet traces we are using come from my home network. I know what these devices are. I am going to import the device information for these devices and merge it with the network traffic dataframe we are using.

In [323]:
known_devices = pd.read_csv('known_devices.csv')
known_devices.head()

Unnamed: 0,Description,Last seen,Usage,OS,IPv4 address,Policy,MAC address,Manufacturer,Device Type
0,Daghan deskphone,6/6/18 11:29,962619.1,Meraki,192.168.1.236,normal,00:18:0a:12:2f:2d,Meraki,Phone
1,Nest,6/6/18 11:30,83495.0,Raspberry Pi,192.168.1.235,normal,18:b4:30:0d:87:18,Nest Labs,Thermostat
2,iPhone,5/19/18 18:11,4825600.0,Apple iPhone,192.168.1.9,Privacy please,18:f6:43:57:fc:8f,Apple,iPhone
3,Daghan-Home,6/6/18 11:30,31267720.0,Mac OS X 10.13,192.168.1.233,normal,28:f0:76:31:d3:58,Apple,Computer
4,DaghanspleWatch,6/5/18 5:43,267294.0,iOS,192.168.1.7,normal,30:63:6b:65:e4:04,Apple,Watch


In [324]:
known_devices['OUI'] = known_devices['MAC address'].apply(lambda x: ''.join(x.upper().split(':')[0:3]))
known_devices['Manufacturer Device Type'] = ([' '.join(row) for row in 
                        zip(known_devices["Manufacturer"],known_devices["Device Type"])])
known_devices

Unnamed: 0,Description,Last seen,Usage,OS,IPv4 address,Policy,MAC address,Manufacturer,Device Type,OUI,Manufacturer Device Type
0,Daghan deskphone,6/6/18 11:29,962619.1,Meraki,192.168.1.236,normal,00:18:0a:12:2f:2d,Meraki,Phone,00180A,Meraki Phone
1,Nest,6/6/18 11:30,83495.0,Raspberry Pi,192.168.1.235,normal,18:b4:30:0d:87:18,Nest Labs,Thermostat,18B430,Nest Labs Thermostat
2,iPhone,5/19/18 18:11,4825600.0,Apple iPhone,192.168.1.9,Privacy please,18:f6:43:57:fc:8f,Apple,iPhone,18F643,Apple iPhone
3,Daghan-Home,6/6/18 11:30,31267720.0,Mac OS X 10.13,192.168.1.233,normal,28:f0:76:31:d3:58,Apple,Computer,28F076,Apple Computer
4,DaghanspleWatch,6/5/18 5:43,267294.0,iOS,192.168.1.7,normal,30:63:6b:65:e4:04,Apple,Watch,30636B,Apple Watch
5,daghan-macbook-8,6/6/18 11:21,26054380.0,Mac OS X 10.13,192.168.1.21,Privacy please,34:36:3b:7f:3f:a0,Apple,Computer,34363B,Apple Computer
6,Daghan-Home,6/6/18 11:29,215554.9,Mac OS X,192.168.1.224,normal,38:c9:86:40:7c:a6,Apple,Computer,38C986,Apple Computer
7,Apple-TV-3,6/3/18 19:35,1.0,iOS,192.168.1.247,normal,40:cb:c0:bc:36:7c,Apple,TV,40CBC0,Apple TV
8,Apple-TV-3,6/6/18 11:29,643408.9,iOS,192.168.1.249,normal,40:cb:c0:bc:36:7e,Apple,TV,40CBC0,Apple TV
9,44:65:0d:66:72:60,6/6/18 1:22,357305.0,Generic Linux,192.168.1.19,normal,44:65:0d:66:72:60,Amazon Technologies,Echo,44650D,Amazon Technologies Echo


In [325]:
dframe.head()

Unnamed: 0,MacID,DNS,HTTP,HTTPS,mDNS,OUI,Organization Name
0,00:18:0a:12:2f:2d,,{'199.231.78.217': 'b'\x17\x03\x01\x02 0<_7\xf...,,,00180A,Cisco Meraki
1,28:f0:76:31:d3:58,clients6.google.com.,"{'104.154.127.3': 'b""\x93\x938\x1f\x0f\xe5\xb7...","[173.194.203.95 : GOOGLE - Google LLC, 216.58....",,28F076,"Apple, Inc."
2,40:cb:c0:bc:36:7e,,,,"40CBC0BC367E@Apple TV._raop._tcp.local.,40CBC0...",40CBC0,"Apple, Inc."
3,44:65:0d:90:60:3a,,"{'104.154.127.60': 'b""M]'3\x13\x8c\x93\xbaoO\x...","[52.46.136.77 : AMAZON-02 - Amazon.com, Inc.]",,44650D,Amazon Technologies Inc.
4,5c:aa:fd:4c:87:a0,,,"[54.208.201.219 : AMAZON-AES - Amazon.com, Inc.]",,5CAAFD,"Sonos, Inc."


## Merging the device labels with the packet trace dataframe

In [326]:
# We are going to merge the labels from the known devices (my home) dataframe
dframe = dframe.merge(known_devices[list(['MAC address', 'Device Type','Manufacturer Device Type'])], 
             left_on = 'MacID',  right_on= 'MAC address', how = 'left').drop('MAC address', axis=1)

In [327]:
#let's drop NAs (rows without any label)
#I'll investigate what's going on later
dframe = dframe.dropna(0, subset = ['Manufacturer Device Type'])
dframe

Unnamed: 0,MacID,DNS,HTTP,HTTPS,mDNS,OUI,Organization Name,Device Type,Manufacturer Device Type
0,00:18:0a:12:2f:2d,,{'199.231.78.217': 'b'\x17\x03\x01\x02 0<_7\xf...,,,00180A,Cisco Meraki,Phone,Meraki Phone
1,28:f0:76:31:d3:58,clients6.google.com.,"{'104.154.127.3': 'b""\x93\x938\x1f\x0f\xe5\xb7...","[173.194.203.95 : GOOGLE - Google LLC, 216.58....",,28F076,"Apple, Inc.",Computer,Apple Computer
2,40:cb:c0:bc:36:7e,,,,"40CBC0BC367E@Apple TV._raop._tcp.local.,40CBC0...",40CBC0,"Apple, Inc.",TV,Apple TV
3,44:65:0d:90:60:3a,,"{'104.154.127.60': 'b""M]'3\x13\x8c\x93\xbaoO\x...","[52.46.136.77 : AMAZON-02 - Amazon.com, Inc.]",,44650D,Amazon Technologies Inc.,Echo,Amazon Technologies Echo
4,5c:aa:fd:4c:87:a0,,,"[54.208.201.219 : AMAZON-AES - Amazon.com, Inc.]",,5CAAFD,"Sonos, Inc.",Speaker,Sonos Speaker
5,5c:aa:fd:4c:92:86,,,"[54.163.161.36 : AMAZON-AES - Amazon.com, Inc.]",,5CAAFD,"Sonos, Inc.",Speaker,Sonos Speaker
6,78:28:ca:03:80:0c,,{'151.101.40.246': 'b'GET /audio/7fd0fe06a28db...,"[107.22.76.192 : AMAZON-AES - Amazon.com, Inc.]",,7828CA,"Sonos, Inc.",Speaker,Sonos Speaker
7,84:38:35:5a:70:40,"googleads.g.doubleclick.net.,cm.g.doubleclick....","{'185.217.0.110': '', '104.154.127.116': 'b'\x...","[172.217.6.65 : GOOGLE - Google LLC, 172.217.6...",,843835,"Apple, Inc.",Computer,Apple Computer
9,88:71:e5:d2:73:4b,,{'104.154.126.239': 'b'\xb2\xf4\xdf\xc1\xac\xe...,"[52.46.132.50 : AMAZON-02 - Amazon.com, Inc., ...",,8871E5,Amazon Technologies Inc.,Echo,Amazon Technologies Echo
10,b4:7c:9c:31:b4:df,"spectrum.s3.amazonaws.com.,ntp-g7g.amazon.com....","{'93.184.216.34': '', '52.216.17.168': '', '52...","[54.239.27.11 : AMAZON-02 - Amazon.com, Inc., ...","linux-6.local.,linux-6.local.,Android.local.,l...",B47C9C,Amazon Technologies Inc.,Echo,Amazon Technologies Echo


In [340]:
dframe.iloc[1]['HTTPS']

['173.194.203.95 : GOOGLE - Google LLC',
 '216.58.194.165 : GOOGLE - Google LLC',
 '52.201.158.18 : AMAZON-AES - Amazon.com, Inc.',
 '18.204.151.40 : AMAZON-AES - Amazon.com, Inc.',
 '172.217.6.39 : GOOGLE - Google LLC',
 '172.217.164.106 : GOOGLE - Google LLC',
 '104.86.220.69 : CMCS - Comcast Cable Communications, LLC']

In [341]:
ls(UDP)

sport      : ShortEnumField                      = (53)
dport      : ShortEnumField                      = (53)
len        : ShortField                          = (None)
chksum     : XShortField                         = (None)


In [None]:
packet[UDP].len

# Ignore everything below

In [211]:
import pycurl
import pprint
import json
from io import BytesIO


c = pycurl.Curl()
c.setopt(c.URL, 'https://ipinfo.io/216.58.195.243?token=4b22562280ac95')

e = BytesIO()
c.setopt(pycurl.WRITEFUNCTION, e.write)
c.perform()
c.close()
htmlString = json.loads(e.getvalue())
pprint.pprint(htmlString)

{'city': 'Mountain View',
 'country': 'US',
 'hostname': 'sfo03s06-in-f19.1e100.net',
 'ip': '216.58.195.243',
 'loc': '37.4192,-122.0570',
 'org': 'AS15169 Google LLC',
 'postal': '94043',
 'region': 'California'}


In [210]:
c = pycurl.Curl()
c.setopt(c.URL, 'https://api.iptoasn.com/v1/as/ip/216.58.195.243')
e = io.BytesIO()
c.setopt(pycurl.WRITEFUNCTION, e.write)
c.perform()
c.close()
htmlString = e.getvalue().decode('UTF-8')
print(htmlString)

{"announced":true,"as_country_code":"US","as_description":"GOOGLE - Google LLC","as_number":15169,"first_ip":"216.58.192.0","ip":"216.58.195.243","last_ip":"216.58.199.255"}


In [223]:
import pprint
import requests

dictionary = requests.get('https://ipinfo.io/216.58.195.243?token=4b22562280ac95').json()
tmp = {k: dictionary[k] for k in ('org', 'hostname')}
pprint.pprint(tmp)



#dictionary = requests.get('https://api.iptoasn.com/v1/as/ip/216.58.195.243').json()["hostname"]
#pprint.pprint(dictionary)



{'hostname': 'sfo03s06-in-f243.1e100.net', 'org': 'AS15169 Google LLC'}


In [None]:
For each packet, create a sentence
