Copyright (C) 2020-2024 - Raytheon BBN Technologies Corp.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0.

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
either express or implied. See the License for the specific
language governing permissions and limitations under the License.

Distribution Statement "A" (Approved for Public Release,
Distribution Unlimited).

This material is based upon work supported by the Defense
Advanced Research Projects Agency (DARPA) under Contract No.
HR001119C0102.  The opinions, findings, and conclusions stated
herein are those of the authors and do not necessarily reflect
those of DARPA.


# ExtractUniqueSourceAddresses
This notebook reads all the unique source addresses in the given meanie data and writes two text files that contain:
1. IPs we don't have geolocation for
2. IPs we don't have device info for

In [None]:
# Imports
import pandas as pd
import numpy as np
import ipaddress
import os
import sys

### User Defined Variables

In [None]:
# Debug switch: 1 prints intermediate outputs while the cells run, 0 (recommended) does not.
debug = 0

# First three octets (a.b.c) of the /24 destination network of interest.
destination_address_of_interest = "10.1.202"

# (Optional) 1 when an existing geolocations database should be consulted, 0 otherwise.
existing_geolocations_db = 0

# (Optional) Path of that geolocations database file; must be set when the flag above is 1.
geo_location_db_file = "db/full_geolocations_db.csv"

# (Optional) 1 when an existing device types database should be consulted, 0 otherwise.
existing_device_types_db = 0

# (Optional) Path of that device types database file; must be set when the flag above is 1.
device_types_db_file = "db/full_devices_db.csv"

# Directory that receives the result files.
output_dir_path = "outputs"

# Name of the result file listing addresses that still need a geolocation.
ipsForGeoLocation = "IPsForGeoLocation.txt"

# Name of the result file listing addresses that still need a device type.
ipsForDeviceTypes = "IPsForDeviceTypes.txt"

# Years of data to analyze.
years_of_interest = [2020, 2021, 2022, 2023]

# Directory that holds the meanie text data.
input_dir_path = "/home/nice-user/Yearly"

### Read, Format, Filter, and Iterate over all the Meanie Data of Interest

In [None]:
# Build the 256 addresses (a.b.c.0 through a.b.c.255) that make up the /24 of interest
all_dest_addresses = [f'{destination_address_of_interest}.{x}' for x in range(256)]

# Make both directory paths end in a separator so file names can be appended directly.
# os.path.join(path, '') adds the separator only when it is missing and leaves an empty
# path empty, whereas indexing the last character raises IndexError on an empty string.
input_dir_path = os.path.join(input_dir_path, '')
output_dir_path = os.path.join(output_dir_path, '')

if debug == 1:
    print(input_dir_path)
    print(output_dir_path)
    

In [None]:
# Shared state for the cells that follow

# Packet records read from the input files (starts out empty)
orig_data = pd.DataFrame()

# Per-year statistics, filled in by compute_data_stats
data_stats = {}

# Columns for which unique values and their counts are recorded
stat_columns = [
    "Source_Address",
    "Destination_Address",
    "TTL",
    "Destination_Port",
    "Payload_Length",
]

# Columns that hold hexadecimal text in the input
hex_columns = [
    "Source_Address",
    "Destination_Address",
    "Source_Port",
    "Destination_Port",
    "UDP_Checksum",
    "TTL",
    "IPID",
]

# Columns that hold IP addresses
ip_columns = ["Source_Address", "Destination_Address"]

In [None]:
# Method to compute and store statistics of data before and after filtering
def compute_data_stats(orig_data, years_of_interest, stat_columns, prefix, stats=None):
    """Record per-year packet and unique-value statistics under a key prefix.

    For every year the rows whose 'Year' column equals that year are selected
    and the following entries are stored in ``stats[year]``:

    * ``{prefix}_Packet_Counts``: number of selected rows
    * ``{prefix}_Unique_{column}_Counts``: ``nunique()`` of each stat column
    * ``{prefix}_Unique_{column}s``: ``unique()`` values of each stat column

    Args:
        orig_data: DataFrame with a 'Year' column and every column named in
            ``stat_columns``.
        years_of_interest: Iterable of year values to summarize.
        stat_columns: Column names to compute unique values for.
        prefix: Label prepended to every key (e.g. 'All' or 'Filtered').
        stats: Dictionary to update; defaults to the module-level
            ``data_stats`` when omitted.
    """
    if stats is None:
        stats = data_stats

    for year in years_of_interest:
        current_year_data = orig_data[orig_data['Year'] == year]

        # Reuse the year's existing entry so that statistics stored under a
        # different prefix are kept instead of being replaced.
        year_stats = stats.setdefault(year, {})
        year_stats[f'{prefix}_Packet_Counts'] = len(current_year_data)

        for column in stat_columns:
            year_stats[f'{prefix}_Unique_{column}_Counts'] = current_year_data[column].nunique()
            year_stats[f'{prefix}_Unique_{column}s'] = current_year_data[column].unique()

In [None]:

# Fields of each input record, in column order (positions 0 to 10):

# Saddr - Hex
# Daddr - Hex
# Sport - Hex
# Dport - Hex
# Proto – always UDP (17)
# Timestamp
# UdpCksum - Hex
# PayloadLen – the length of the payload, in bytes (the UDP length field, minus 8 for the UDP header itself)
# Payload – Hex
# TTL - a hex
# IPID - a hex

print("Reading Meanie Data")

# Positions of the hex fields listed above. They are read as text so that values made
# only of digits (e.g. '40') are not inferred as numbers: the formatting step converts
# these fields with int(x, 16), which only accepts strings.
hex_text_dtypes = {position: str for position in (0, 1, 2, 3, 6, 8, 9, 10)}

try:
    all_files = os.listdir(input_dir_path)
except OSError:
    sys.exit('Problem with input directory. Check if the path is correct')

# Only text files are treated as meanie data
file_list = [file for file in all_files if file.endswith('.txt')]

if not file_list:
    sys.exit("No compatible Files (.txt) in the directory")

# Collect one frame per non-empty file and concatenate once at the end; this avoids
# re-copying the accumulated data for every file and makes the cell safe to re-run.
frames = []
for file_name in file_list:
    full_file_path = input_dir_path + file_name

    if debug == 1:
        print('full_file_path: ', full_file_path)

    # if filesize > 0, i.e. file is not empty
    if os.path.getsize(full_file_path) > 0:
        current_csv = pd.read_csv(full_file_path, header=None, dtype=hex_text_dtypes)
        # Remember the originating file; its name is parsed for the date fields later
        current_csv['FileName'] = file_name
        frames.append(current_csv)

orig_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

if debug == 1:
    display(orig_data)


In [None]:

# Stop here when no packet records were read
if len(orig_data) < 1:
    sys.exit('\nInput Meanie Data is Empty.\n')

# Give the positional columns 0-10 their field names
field_names = ['Source_Address', 'Destination_Address', 'Source_Port', 'Destination_Port',
               'Protocol', 'Timestamp', 'UDP_Checksum', 'Payload_Length', 'Payload',
               'TTL', 'IPID']
orig_data = orig_data.rename(columns=dict(enumerate(field_names)))

# Hexadecimal text -> integer
for hex_field in hex_columns:
    orig_data[hex_field] = orig_data[hex_field].map(lambda raw: int(raw, 16))

# Integer address -> textual IP address
for address_field in ip_columns:
    orig_data[address_field] = orig_data[address_field].map(
        lambda number: str(ipaddress.ip_address(number)))

# The file name is split on '-' into six parts; the four middle parts are kept as
# integer Year / Month / Day / Hour columns and the first and last parts are dropped
name_parts = orig_data['FileName'].str.split('-', expand=True)
orig_data[['Prefix', 'Year', 'Month', 'Day', 'Hour', 'Post']] = name_parts
orig_data = orig_data.drop(columns=['Prefix', 'Post'])
date_fields = ['Year', 'Month', 'Day', 'Hour']
orig_data[date_fields] = orig_data[date_fields].astype(int)

if debug == 1:
    display(orig_data)

# Statistics over everything that was read
compute_data_stats(orig_data, years_of_interest, stat_columns, 'All')

if debug == 1:
    print('Before Filtering')
    for yr in years_of_interest:
        yr_stats = data_stats[yr]
        print(f'{yr} Num Unique Destination Addresses: ', yr_stats['All_Unique_Destination_Address_Counts'])
        print(f'{yr} Unique Destination Addresses: ', yr_stats['All_Unique_Destination_Addresss'])

# Keep only packets sent to an address inside the /24 of interest
in_subnet = orig_data["Destination_Address"].isin(all_dest_addresses)
orig_data = orig_data[in_subnet]

if debug == 1:
    display(orig_data)

# Statistics over the packets that remain after filtering
compute_data_stats(orig_data, years_of_interest, stat_columns, 'Filtered')

if debug == 1:
    print('\nAfter Filtering')
    for yr in years_of_interest:
        yr_stats = data_stats[yr]
        print(f'{yr} Num Unique Destination Addresses: ', yr_stats['Filtered_Unique_Destination_Address_Counts'])
        print(f'{yr} Unique Destination Addresses: ', yr_stats['Filtered_Unique_Destination_Addresss'])
    

### Find out all the Source Addresses without GeoLocation

In [None]:

if existing_geolocations_db == 1:
    print("\nComparing to Existing GeoLocations Database\n")

    # Guard only the read itself. OSError covers a missing or unreadable file;
    # ValueError covers pandas' parsing errors (empty or malformed file).
    try:
        source_geo_locations = pd.read_csv(geo_location_db_file)
    except (OSError, ValueError):
        sys.exit(f'\nProblem reading geolocations database file ({geo_location_db_file}). Check if the path is correct\n')

    # Use the same column name as the packet data so the two can be compared directly
    source_geo_locations = source_geo_locations.rename(columns={'IP': 'Source_Address'})

    if debug == 1:
        # head() only returns a frame; it has to be displayed explicitly inside a block
        display(source_geo_locations.head())
        source_geo_locations.info()

In [None]:
print("\nPrinting Geolocations File\n")

# Start from the source address of every packet that passed the filter
remaining_data = orig_data['Source_Address']

# Leave out addresses that the existing geolocations database already contains
if existing_geolocations_db == 1:
    already_located = source_geo_locations['Source_Address']
    remaining_data = remaining_data[~remaining_data.isin(already_located)]

# There is one row per packet, so the same address can occur many times;
# the output lists each address once
remaining_data = remaining_data.drop_duplicates()

# Only the file write is guarded, so the message below is shown for output-path
# problems only and remaining_data is always defined for the debug output
try:
    remaining_data.to_csv(f'{output_dir_path}{ipsForGeoLocation}', header=False, index=False)
except OSError:
    print('\nProblem writing geolocations IP file. Check if the output path is correct\n')

if debug == 1:
    print('len(remaining_data): ', len(remaining_data))
    display(remaining_data)


### Find out all the Source Addresses without Device Types


In [None]:
if existing_device_types_db == 1:
    print("\nComparing to Existing Device Types Database\n")

    # Guard only the read itself. OSError covers a missing or unreadable file;
    # ValueError covers pandas' parsing errors (empty or malformed file).
    try:
        source_devices = pd.read_csv(device_types_db_file)
    except (OSError, ValueError):
        sys.exit(f'\nProblem reading device types database file ({device_types_db_file}). Check if path is correct\n')

    if debug == 1:
        # head() only returns a frame; it has to be displayed explicitly inside a block
        display(source_devices.head())
        source_devices.info()

In [None]:
print("\nPrinting Device Types File\n")

# Start from the source address of every packet that passed the filter
remaining_data = orig_data['Source_Address']

# Leave out addresses that the existing device types database already contains
if existing_device_types_db == 1:
    known_devices = source_devices['IP']
    remaining_data = remaining_data[~remaining_data.isin(known_devices)]

# There is one row per packet, so the same address can occur many times;
# the output lists each address once
remaining_data = remaining_data.drop_duplicates()

# Only the file write is guarded, so the message below is shown for output-path
# problems only and remaining_data is always defined for the debug output
try:
    remaining_data.to_csv(f'{output_dir_path}{ipsForDeviceTypes}', header=False, index=False)
except OSError:
    print('\nProblem writing device types IP file. Check if the output path is correct\n')

if debug == 1:
    print('len(remaining_data): ', len(remaining_data))
    display(remaining_data)
