In [1]:
import numpy as np
import pandas as pd
import pprint
from scapy.all import *
from collections import defaultdict
from IPy import IP as IPy
from manuf import manuf

In [2]:
# Path of the packet capture (.pcap) to load; change this to the capture that
# you want to convert to a dataframe.

filename = "./packet_capture.pcap"

In [3]:
packets = rdpcap(filename)

# Per-device traffic summary keyed by MAC address. Each value maps a protocol
# label to what was observed for that device:
#   'DNS' / 'mDNS' -> comma-separated string of resource-record names
#   'HTTPS'        -> set of destination IPs (TCP port 443)
#   'HTTP'         -> dict of destination IP -> concatenated payload text
#                     (every TCP destination port other than 443 lands here)
d = defaultdict(dict)


def _is_outbound(ip_src, ip_dst):
    """Return True when ip_src is a private address and ip_dst is a public one."""
    return IPy(ip_src).iptype() == "PRIVATE" and IPy(ip_dst).iptype() == "PUBLIC"


for packet in packets:

    # Ignore IPv6 for now
    if packet.haslayer(IPv6):
        continue

    # DNS
    if packet.haslayer(DNS):
        # Regular DNS is attributed to the destination MAC; packets sent from
        # UDP port 5353 are labelled mDNS and attributed to the sender instead.
        if packet.haslayer(UDP) and packet[UDP].sport == 5353:
            dns_type = 'mDNS'
            mac_id = packet["Ether.src"]
        else:
            dns_type = 'DNS'
            mac_id = packet["Ether.dst"]

        if packet[DNS].haslayer(DNSRR):
            # Only the first resource record of the packet is recorded.
            # errors="replace" keeps one malformed name from aborting the run.
            rrname = packet[DNS][DNSRR].rrname.decode("utf-8", errors="replace")

            # The device entry is created only when there is something to
            # record, so devices without any recorded data never show up as rows.
            seen = d[mac_id]
            if dns_type in seen:
                seen[dns_type] += "," + rrname
            else:
                seen[dns_type] = rrname

    # HTTP / HTTPS: IPv4 TCP traffic, attributed to the sending device
    elif packet.haslayer(IP) and packet.haslayer(TCP):
        mac_id = packet["Ether.src"]
        IP_src = packet["IP.src"]
        IP_dst = packet["IP.dst"]

        # Only traffic leaving the local network is of interest.
        if not _is_outbound(IP_src, IP_dst):
            continue

        if packet[TCP].dport == 443:
            # This is https, just record high level data
            d[mac_id].setdefault('HTTPS', set()).add(IP_dst)
        else:
            # This is http, record more data including the payload.
            # Build the payload bytes explicitly: str() on a scapy packet is
            # not a stable way to obtain them under Python 3 (its result
            # depends on the scapy version). str(bytes) keeps the "b'...'"
            # text form that earlier runs of this notebook stored.
            raw_payload = bytes(packet[TCP].payload)
            payload = str(raw_payload) if len(raw_payload) > 10 else ""

            http_seen = d[mac_id].setdefault('HTTP', {})
            http_seen[IP_dst] = http_seen.get(IP_dst, "") + payload

    # Anything else (neither DNS nor IPv4/TCP) is not recorded.

# One row per device (MAC address), one column per protocol label.
dframe = pd.DataFrame(d).T
dframe.head()

Unnamed: 0,DNS,HTTP,HTTPS,mDNS
00:18:0a:12:2f:2d,,{'199.231.78.217': 'b'\x17\x03\x01\x02 0<_7\xf...,,
28:f0:76:31:d3:58,clients6.google.com.,"{'104.154.127.3': 'b""\x93\x938\x1f\x0f\xe5\xb7...","{172.217.6.39, 18.204.151.40, 172.217.164.106,...",
40:cb:c0:bc:36:7e,,,,"40CBC0BC367E@Apple TV._raop._tcp.local.,40CBC0..."
44:65:0d:90:60:3a,,"{'104.154.127.60': 'b""M]'3\x13\x8c\x93\xbaoO\x...",{52.46.136.77},
5c:aa:fd:4c:87:a0,,,{54.208.201.219},


In [4]:
# Manufacturer registry (master database downloaded from the IEEE website).
# 'Assignment' is read as text so the prefixes are compared as strings.
OUI_df = pd.read_csv('oui.csv', dtype={'Assignment': str})

# Promote the MAC-address index to a regular 'MacID' column.
dframe = dframe.rename_axis('MacID').reset_index()

# Let's extract the Organization Unique Identifier, aka OUI from the MAC address
# which is simply the first 6 alphanumerical characters
dframe['OUI'] = dframe['MacID'].apply(lambda x: ''.join(x.upper().split(':')[0:3]))

# Keep one manufacturer per prefix so the left merge below cannot multiply
# device rows if the registry lists an assignment more than once.
oui_lookup = (
    OUI_df[['Assignment', 'Organization Name']]
    .drop_duplicates(subset='Assignment')
)

# Merging the manufacturer information with our dataframe.
# reindex (rather than a plain column selection) keeps the column layout fixed
# even when the capture contained no traffic for one of the protocols; such a
# column is then present but empty instead of raising a KeyError.
dframe = (
    dframe
    .merge(oui_lookup, left_on='OUI', right_on='Assignment', how='left')
    .reindex(columns=['MacID', 'DNS', 'HTTP', 'HTTPS', 'mDNS', 'OUI', 'Organization Name'])
)


In [5]:
# Hand-labelled devices (my home network): MAC address, manufacturer and type.
known_devices = pd.read_csv('known_devices.csv')

# Same OUI derivation as for the captured devices: first three octets of the
# MAC address, upper-cased, without separators.
known_devices['OUI'] = known_devices['MAC address'].apply(lambda x: ''.join(x.upper().split(':')[0:3]))

# Combined label "<Manufacturer> <Device Type>". str.cat leaves the label
# missing when either part is missing, instead of failing on the whole file.
known_devices['Manufacturer Device Type'] = (
    known_devices['Manufacturer'].str.cat(known_devices['Device Type'], sep=' ')
)

# We are going to merge the labels from the known devices (my home) dataframe
# NOTE(review): this is an exact string match on the MAC address; it assumes
# known_devices.csv uses the same lowercase, colon-separated form as 'MacID'
# -- confirm.
device_labels = known_devices[['MAC address', 'Device Type', 'Manufacturer Device Type']]
dframe = (
    dframe
    .merge(device_labels, left_on='MacID', right_on='MAC address', how='left')
    .drop(columns='MAC address')
)

#let's drop NAs (rows without any label)
#I'll investigate what's going on later
# `subset` is passed by keyword only: a positional axis argument is deprecated
# in pandas 1.x and rejected by pandas 2.x; rows are dropped by default.
dframe = dframe.dropna(subset=['Manufacturer Device Type'])

In [6]:
# Show the per-device table after the manufacturer and known-device merges.
dframe

Unnamed: 0,MacID,DNS,HTTP,HTTPS,mDNS,OUI,Organization Name,Device Type,Manufacturer Device Type
0,00:18:0a:12:2f:2d,,{'199.231.78.217': 'b'\x17\x03\x01\x02 0<_7\xf...,,,00180A,Cisco Meraki,Phone,Meraki Phone
1,28:f0:76:31:d3:58,clients6.google.com.,"{'104.154.127.3': 'b""\x93\x938\x1f\x0f\xe5\xb7...","{172.217.6.39, 18.204.151.40, 172.217.164.106,...",,28F076,"Apple, Inc.",Computer,Apple Computer
2,40:cb:c0:bc:36:7e,,,,"40CBC0BC367E@Apple TV._raop._tcp.local.,40CBC0...",40CBC0,"Apple, Inc.",TV,Apple TV
3,44:65:0d:90:60:3a,,"{'104.154.127.60': 'b""M]'3\x13\x8c\x93\xbaoO\x...",{52.46.136.77},,44650D,Amazon Technologies Inc.,Echo,Amazon Technologies Echo
4,5c:aa:fd:4c:87:a0,,,{54.208.201.219},,5CAAFD,"Sonos, Inc.",Speaker,Sonos Speaker
5,5c:aa:fd:4c:92:86,,,{54.163.161.36},,5CAAFD,"Sonos, Inc.",Speaker,Sonos Speaker
6,78:28:ca:03:80:0c,,{'151.101.40.246': 'b'GET /audio/7fd0fe06a28db...,{107.22.76.192},,7828CA,"Sonos, Inc.",Speaker,Sonos Speaker
7,84:38:35:5a:70:40,"googleads.g.doubleclick.net.,cm.g.doubleclick....","{'185.217.0.110': '', '104.154.127.116': 'b'\x...","{172.217.164.110, 151.101.194.2, 172.217.14.99...",,843835,"Apple, Inc.",Computer,Apple Computer
9,88:71:e5:d2:73:4b,,{'104.154.126.239': 'b'\xb2\xf4\xdf\xc1\xac\xe...,"{52.46.132.50, 34.236.127.105}",,8871E5,Amazon Technologies Inc.,Echo,Amazon Technologies Echo
10,b4:7c:9c:31:b4:df,"spectrum.s3.amazonaws.com.,ntp-g7g.amazon.com....","{'93.184.216.34': '', '52.216.17.168': '', '52...","{52.46.156.66, 54.239.27.11, 54.239.27.116, 34...","linux-6.local.,linux-6.local.,Android.local.,l...",B47C9C,Amazon Technologies Inc.,Echo,Amazon Technologies Echo


In [7]:
# Write the table to init_data.csv in the working directory. With to_csv's
# defaults the row index is written too, as the first (unnamed) column.
dframe.to_csv('init_data.csv')