In [1]:
import pandas as pd
from scapy.all import PcapReader
import os
from datetime import datetime
import numpy as np
from tqdm import tqdm

from scapy.layers.dot11 import Dot11



In this jupyter notebook we want to read out our recorded data and then use the data to produce various plots.

To start, we first need to write a function that can read out our data.
Here we decided to directly readout the data as it is, but we will probably use csv files instead. 

In [41]:
def read_pcapng(file_path):
    """
    This function reads the wireshark packets directly and stores them in a pandas data frame
    :param file_path: This is the path to the to be read file
    :return: Pandas dataframe with the read out data
    """
    timestamps, senders, receivers, signal_strengths = [], [], [], []
    
    for pkt in PcapReader(filepath):
        if pkt.haslayer('Dot11'):
            timestamp = datetime.fromtimestamp(float(pkt.time))
            rssi = pkt.getlayer('RadioTap').dBm_AntSignal
            sender, receiver = pkt.addr2, pkt.addr1
    
            timestamps.append(timestamp)
            senders.append(sender)
            receivers.append(receiver)
            signal_strengths.append(rssi)

    df = pd.DataFrame({
        'Timestamp': timestamps,
        'Source': senders,
        'Destination': receivers,
        'Signal-Strength': signal_strengths
    })
    
    return df


In the following cell, we are using the function, to read out one file.
To find this file, we have to give the path were it is located.

In [24]:

filepath = os.getcwd()
# print(filepath)
filepath += "\\Lecture-Data\\Lecture-12-12_00002_20231212082631.pcapng" # Lecture data
# filepath += "\\lecture-data-090124\\lecture090124_00001_20240109103111.pcapng"
# filepath += "\\lecture-data-090124\\lecture-data-090124.csv"

df = read_pcapng(filepath)
# df = pd.read_csv(filepath)

print(df)


                      Timestamp             Source        Destination  \
0    2023-12-12 08:26:31.261669  ea:f5:71:1e:46:bc  00:f6:63:ad:8a:a0   
1    2023-12-12 08:26:31.294627  ea:f5:71:1e:46:bc  00:f6:63:ad:8a:a0   
2    2023-12-12 08:26:31.879087  ea:f5:71:1e:46:bc  00:f6:63:ad:8a:a0   
3    2023-12-12 08:26:31.897903               None  ea:f5:71:1e:46:bc   
4    2023-12-12 08:26:32.782836  3a:cf:d1:b9:84:d6  ff:ff:ff:ff:ff:ff   
...                         ...                ...                ...   
2425 2023-12-12 08:36:23.525919               None  ac:bd:70:d8:71:cc   
2426 2023-12-12 08:36:25.542764               None  26:3b:22:93:ac:e6   
2427 2023-12-12 08:36:26.894979  16:cf:27:01:cb:7b  ff:ff:ff:ff:ff:ff   
2428 2023-12-12 08:36:26.975547  16:cf:27:01:cb:7b  ff:ff:ff:ff:ff:ff   
2429 2023-12-12 08:36:26.995736  16:cf:27:01:cb:7b  ff:ff:ff:ff:ff:ff   

      Signal-Strength  
0                 -81  
1                 -83  
2                 -84  
3                 -63  
4  

Now we want to know how many unique addresses there are. 
For that we first merge all Source and Destination addresses together, so that we can see all possible mac addresses.
Additionally, we want to remove all None values from our data. 

Then we can use the pandas dataframe to get the count of all unique values. 

In [25]:
addresses = pd.concat((df['Source'].astype(str), df['Destination'].astype(str)))
addresses = addresses.drop(addresses[addresses == 'None'].index)

print(addresses.nunique())

474


To see how much noise we have in the data, we are then going to count how many times a unique address is showing up. 
For that, we loop over all unique addresses and store the number of occurrences in a new np array.
The number of occurrences is mapped to the unique addresses on an index basis. To match the address to the count, we need to match it to the index it has in unique_addresses.

In [28]:
unique_addresses = addresses.unique()
addresses_counts = np.zeros(addresses.nunique())
for index, address in enumerate(unique_addresses):
    addresses_counts[index] = addresses[addresses == address].count()

print(addresses_counts)

addresses_counts.sort()
print(addresses_counts[:-1])
filtered_addresses_count = addresses_counts[np.where(addresses_counts > 3)]
print(filtered_addresses_count)

[3.000e+00 2.000e+00 2.000e+00 1.000e+00 6.000e+00 2.000e+00 3.000e+00
 2.000e+00 2.000e+00 1.000e+00 2.000e+00 9.400e+01 2.000e+00 1.000e+00
 3.000e+00 1.000e+00 2.000e+00 2.000e+00 2.000e+00 2.000e+00 1.000e+00
 1.200e+01 1.000e+00 2.000e+00 1.000e+00 1.000e+00 1.000e+00 2.000e+00
 1.000e+00 1.000e+00 2.000e+00 1.000e+00 2.000e+00 1.200e+01 2.000e+00
 1.000e+00 1.000e+00 2.000e+00 1.000e+01 1.000e+01 1.000e+00 1.000e+00
 2.000e+00 2.000e+00 1.000e+00 1.000e+00 2.000e+00 4.000e+00 1.000e+00
 2.000e+00 2.000e+00 6.000e+00 2.000e+00 2.000e+00 1.000e+00 8.000e+00
 4.000e+00 1.000e+00 2.000e+00 1.000e+00 2.000e+00 2.000e+00 4.000e+00
 4.000e+00 7.000e+00 1.000e+00 7.000e+00 2.000e+00 1.000e+00 1.000e+00
 1.000e+00 7.400e+01 3.000e+00 3.000e+00 4.000e+00 2.000e+00 1.000e+00
 1.000e+00 1.000e+00 3.000e+00 6.400e+01 3.000e+00 1.000e+00 2.000e+00
 1.000e+00 2.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
 1.000e+00 2.000e+00 1.000e+00 1.420e+02 1.400e+01 1.000e+00 1.000e+00
 2.000

To read multiple files at once, we wrote a function, that can read multiple files at once. 
To spead up that process, we use multiprocessing, so that we can read multiple files in parallel.

There is also a new itteration of the filepath, so that the user can change it here directly, so that he can quickly read out other data in other folders.

In [42]:
import glob
from concurrent.futures import ThreadPoolExecutor

filepath = os.getcwd()
# print(filepath)
filepath += "\\Lecture-Data"


def read_multiple_pcapng(directory):
    """
    This function is meant to read out multiple files at once
    :param directory: This is the path to the folder that we want to read out
    :return: A pandas dataframe with the data of multiple files.
    """
    # Define the path to the directory containing the pcapng files
    path = os.path.join(directory, "*.pcapng")

    # Identify all pcapng files in the directory
    all_files = glob.glob(path)

    # Use ThreadPoolExecutor for parallel processing
    with ThreadPoolExecutor() as executor:
        # Map the read_single_pcapng function to all_files
        results = list(executor.map(read_pcapng, all_files))

    # Concatenate the DataFrames obtained from individual files
    df = pd.concat(results, ignore_index=True)

    return df


# Call the function
df = read_multiple_pcapng(filepath)

print(df)


                       Timestamp             Source        Destination  \
0     2023-12-12 08:26:31.261669  ea:f5:71:1e:46:bc  00:f6:63:ad:8a:a0   
1     2023-12-12 08:26:31.294627  ea:f5:71:1e:46:bc  00:f6:63:ad:8a:a0   
2     2023-12-12 08:26:31.879087  ea:f5:71:1e:46:bc  00:f6:63:ad:8a:a0   
3     2023-12-12 08:26:31.897903               None  ea:f5:71:1e:46:bc   
4     2023-12-12 08:26:32.782836  3a:cf:d1:b9:84:d6  ff:ff:ff:ff:ff:ff   
...                          ...                ...                ...   
99888 2023-12-12 08:15:03.673286               None  d0:16:b4:fc:98:c3   
99889 2023-12-12 08:15:03.802218  de:38:e5:24:8d:9d  ff:ff:ff:ff:ff:ff   
99890 2023-12-12 08:15:05.086811  6e:36:4c:8b:83:ab  ff:ff:ff:ff:ff:ff   
99891 2023-12-12 08:15:05.599234  fa:c6:91:03:73:0b  ff:ff:ff:ff:ff:ff   
99892 2023-12-12 08:15:05.621162  fa:c6:91:03:73:0b  ff:ff:ff:ff:ff:ff   

       Signal-Strength  
0                  -81  
1                  -83  
2                  -84  
3          

Here we are counting again, how many unique addresses there are.

In [44]:
addresses = pd.concat((df['Source'].astype(str), df['Destination'].astype(str)))

print(addresses.nunique())
print(df['Source'].nunique())
print(df['Destination'].nunique())

12073
11764
842


To filter out noise again, we want to find out how many times a unique address shows up.

To speed up the process, we first convert the pandas dataframe to a numpy array. 

In [45]:
unique_addresses = addresses.unique()
addresses_counts = np.zeros((addresses.nunique(), 2))
unique_addresses_count = addresses.nunique()
addresses = addresses.to_numpy()

To see if there are addresses, that show up during the whole data set, as a sanity check, we are going to iterate over all unique addresses and give out the first and last index of the address. 

In [9]:
for index, address in tqdm(enumerate(unique_addresses), total=unique_addresses_count):
    indices = np.where(addresses == address)
    addresses_counts[index, 1] = indices[0][0]
    addresses_counts[index, 0] = indices[0][-1]

print(addresses_counts)

100%|██████████| 12073/12073 [01:13<00:00, 163.23it/s]


[[1.98946e+05 0.00000e+00]
 [9.98880e+04 3.00000e+00]
 [5.00000e+00 4.00000e+00]
 ...
 [1.98525e+05 1.98496e+05]
 [1.99269e+05 1.99227e+05]
 [1.99324e+05 1.99324e+05]]


Here we want to find out how many times a unique address shows up.

In [12]:
for index, address in tqdm(enumerate(unique_addresses), total=unique_addresses_count):
    addresses_counts[index] = len(np.where(addresses == address)[0])

print(addresses_counts)

100%|██████████| 12073/12073 [01:10<00:00, 170.43it/s]


[[1.3900e+02 1.3900e+02]
 [4.9183e+04 4.9183e+04]
 [2.0000e+00 2.0000e+00]
 ...
 [5.0000e+00 5.0000e+00]
 [8.0000e+00 8.0000e+00]
 [1.0000e+00 1.0000e+00]]


Here we are calculating the mean occurrence of all addresses. 

In [14]:
print(addresses_counts.mean(axis=0))

[16.54816533 16.54816533]
