Copyright (C) 2020-2024 - Raytheon BBN Technologies Corp.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0.

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
either express or implied. See the License for the specific
language governing permissions and limitations under the License.

Distribution Statement "A" (Approved for Public Release,
Distribution Unlimited).

This material is based upon work supported by the Defense
Advanced Research Projects Agency (DARPA) under Contract No.
HR001119C0102.  The opinions, findings, and conclusions stated
herein are those of the authors and do not necessarily reflect
those of DARPA.


# IP Device Data Processor
This file combines newly collected device type data with the existing device types database.

Before running this, install and run lift (https://github.com/trylinux/lift) and save its output(s) to `.txt` file(s).
The lift-generated text file(s) are processed here and the results are combined with the device types database. This produces two output files:
1. Updated devices database file
2. File containing frequency counts of all the words in results from lift

In [None]:
## Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import os
import re
from collections import Counter

### User Defined Variables

In [None]:

# Set value to 1 to see intermediate outputs for debugging. 0 otherwise (recommended)
debug = 0

# Ports of interest in the data
# NOTE(review): this list is not referenced by any code visible in this file - confirm whether it is still needed.
ports = [80, 443]

# Device type labels, assigned by substring match against the cleaned lift output.
# The first entry is the default label; when several labels match, the one latest in the list wins.
device_types = ['Server', 'Router', 'Recorder', 'Camera', 'NVR', 'HVR', 'DVR']

# Full path to directory where lift extracted device data is stored (note it will read all .txt files in that directory)
dir_path = r'/home/nice-user/lift-data/'

# File name (with path) listing all the IPs that were sent to lift for finding devices
ip_file = r'outputs/IPsForDeviceTypes.txt'

# File name (with path) where the frequency count of every word in the lift results will be written
word_counts_file = 'outputs/word_counts.csv'

# (Optional) Set 1 if you have an existing device types database to merge, 0 otherwise
existing_device_types_db = 0

# File name (with path) where the devices database will be written.
# Please note that if you are merging an existing device types DB, it will read this file, merge the new data with it,
# and then overwrite it.
devices_db_file = 'db/full_devices_db.csv'


### Initialization of Other Variables

In [None]:
# Initialize variables

# Empty table that the parsed lift records are accumulated into.
device_df = pd.DataFrame(columns=['Original_Device_Info','Timestamp', 'IP', 'Port', 'Device_Info', 'Device_Type'])

# Make sure the data directory ends with a separator so file names can be
# appended to it directly. An empty path is left untouched (indexing its last
# character would raise IndexError); listing the directory then reports the
# problem with a clearer error.
if dir_path and not dir_path.endswith('/'):
    dir_path = dir_path + '/'

if debug == 1:
    print(dir_path)
    display(device_df)


### Read and Process the Data

In [None]:
# String formatting function

def process_string(line):
    """Clean one fragment of lift output for word-level analysis.

    Every occurrence of the characters / = " ( ) [ ] : | is turned into a
    space, runs of whitespace are collapsed to a single space (leading and
    trailing whitespace is dropped), and the result is lower-cased.
    """
    # One translation pass replaces all separator characters at once.
    separators_to_spaces = str.maketrans({symbol: ' ' for symbol in '/="()[]:|'})
    spaced = line.translate(separators_to_spaces)
    collapsed = ' '.join(spaced.split())
    return collapsed.lower()

In [None]:
print("Reading Lift output data")


def _parse_device_line(line):
    """Split a '<timestamp> | <ip>:<port> | <device info>' line into a record.

    Returns a dict keyed by the device table's column names, or None when the
    line does not have that shape, so the caller can treat it as continuation
    text instead of crashing.
    """
    try:
        time_stamp, ip_port, device = line.split("|", 2)
        # Split on the LAST colon so addresses that contain colons themselves
        # still separate cleanly into address and port.
        ip, port = ip_port.replace(' ', '').rsplit(":", 1)
    except ValueError:
        return None
    return {
        'Original_Device_Info': line,
        'Timestamp': time_stamp,
        'IP': ip,
        'Port': port,
        'Device_Info': process_string(device),
    }


# Collect the lift output files (*.txt) in the data directory. The names are
# sorted because os.listdir() returns entries in arbitrary order; a fixed order
# makes it reproducible which record is kept when rows are later de-duplicated.
file_list = sorted(name for name in os.listdir(dir_path) if name.endswith('.txt'))

# Records are gathered in a plain list and converted to a DataFrame once at the
# end; growing a DataFrame one row at a time gets slower as the table grows.
parsed_rows = []

for file_name in file_list:
    full_file_path = os.path.join(dir_path, file_name)

    if debug == 1:
        print('\n full_file_path: ', full_file_path)

    # Read data from the file only if the file is not empty, i.e. file size > 0
    if os.path.getsize(full_file_path) == 0:
        continue

    with open(full_file_path, encoding="utf8") as fp:
        for count, raw_line in enumerate(fp):
            line = raw_line.replace('\n', '')

            # Blank lines carry no data and have no first character to inspect.
            if not line.strip():
                continue

            # A line starting with a digit is a new device line with timestamp
            record = _parse_device_line(line) if line[0].isdigit() else None

            if record is not None:
                if debug == 1:
                    print(count, record['Timestamp'], record['IP'], record['Port'], record['Device_Info'])
                parsed_rows.append(record)

            elif parsed_rows:
                # Otherwise the line continues the previous device record:
                # append both the raw and the cleaned text to that record.
                device = process_string(line)
                if debug == 1:
                    print(count, device)

                previous_record = parsed_rows[-1]
                previous_record['Original_Device_Info'] = f"{previous_record['Original_Device_Info']} {line}"
                previous_record['Device_Info'] = f"{previous_record['Device_Info']} {device}"

            elif debug == 1:
                # Continuation text before any device line has nothing to attach to.
                print(count, 'skipping continuation line with no preceding device line')

# Add the parsed records to the device table in one step.
if parsed_rows:
    new_rows_df = pd.DataFrame(parsed_rows, columns=device_df.columns)
    if device_df.empty:
        device_df = new_rows_df
    else:
        device_df = pd.concat([device_df, new_rows_df], ignore_index=True)

In [None]:
# Keep only the first record seen for each IP, then store the port as an integer.
device_df = (
    device_df
    .drop_duplicates(subset=['IP'])
    .assign(Port=lambda frame: frame['Port'].astype(int))
)


if debug == 1:
    display(device_df)


### Generate and save file with the recurrence of each word 

In [None]:
# Tally how often each word occurs across all device descriptions and save the tally to a file

# Join every cleaned description into one long space-separated string
all_devices = " ".join(device_df["Device_Info"].tolist())

# Whitespace-split the combined text and count each distinct word
word_counts = Counter(all_devices.split())
word_counts_dict = dict(word_counts)

# One row per word, in first-seen order: the word itself and its count
word_counts_df = pd.DataFrame(
    list(word_counts_dict.items()),
    columns=['Word', 'Count'],
)

word_counts_df.to_csv(word_counts_file, index=False)

if debug == 1:
    display(word_counts_df)

### Assign Device Types

In [None]:

# Every record starts with the default label: the first configured device type,
# or 'Server' when no device types are configured.
default_device_type = device_types[0] if device_types else 'Server'
device_df['Device_Type'] = default_device_type

# Each remaining label overrides the default wherever its lower-cased name
# occurs in the cleaned device text. Labels are applied in list order, so when
# several match, the one latest in the list wins.
for device_type in device_types[1:]:
    # regex=False matches the label literally (a label containing regex
    # metacharacters must not be interpreted as a pattern); na=False treats
    # missing device text as "no match" so the mask stays purely boolean.
    type_mentioned = device_df['Device_Info'].str.contains(device_type.lower(), regex=False, na=False)
    device_df.loc[type_mentioned, 'Device_Type'] = device_type

if debug == 1:
    display(device_df)


### Merge with Existing Device Types (if needed) and write the device types database

In [None]:
# Load the existing device types database when the user asked for a merge.
if existing_device_types_db == 1:
    print("Merging with Previous Device Types data")

    try:
        database_device_types = pd.read_csv(devices_db_file)
    except (OSError, ValueError) as err:
        # OSError covers a missing or unreadable file; pandas' parsing errors
        # (empty or malformed file) derive from ValueError. SystemExit is raised
        # directly because the sys module is not imported in this file.
        raise SystemExit(
            f'Problem with reading device types database file ({devices_db_file}). Check if the path is correct'
        ) from err

    if debug == 1:
        display(database_device_types)
    

In [None]:
# Combine the previously stored database with the newly parsed records.
if existing_device_types_db == 1:
    if len(device_df) == 0:
        # Nothing new was parsed: the stored database is carried over unchanged.
        full_db = database_device_types
    else:
        # Stack old rows first and new rows last; for an IP present in both,
        # keep='last' retains the newly parsed record.
        full_db = pd.concat(
            [database_device_types, device_df],
            ignore_index=True,
        ).drop_duplicates(subset=['IP'], keep='last')

    if debug == 1:
        display(full_db)

In [None]:
# Write the devices database. When an existing database was merged, the merged
# table is written; otherwise the newly parsed records are the whole database.
# (Writing only in the merge case would leave no database file at all on a
# first run, when there is nothing to merge yet.)
if existing_device_types_db == 1:
    db_to_write = full_db
else:
    db_to_write = device_df

print("Printing new Database to file")
try:
    db_to_write.to_csv(devices_db_file, index=False)
except OSError as err:
    # Best effort: report the failure (missing directory, permissions, file
    # locked by another program) and let the rest of the analysis continue.
    print(f'Problem writing file ({devices_db_file}). Check if the path is correct or if the file is open elsewhere.')
    print(f'Details: {err}')

### Read all the original IPs and Compare Data

In [None]:
print("Comparing Lift Output to all IPs sent as input to Lift to see how much data the results were returned for")

# The IP list file has no header row; its first column is given the name 'IP'.
all_ips = pd.read_csv(ip_file, header=None).rename(columns={0: 'IP'})

if debug == 1:
    display(all_ips)

In [None]:
# Per-port summary: how many IPs returned a response on each port, and what
# share of all input IPs that represents.
precent_values_found = pd.DataFrame(columns = ['Count', 'Percent of Total'])

unique_ports = device_df['Port'].unique()
print(unique_ports)

# Count each input IP once, so that an IP listed several times in the input
# file does not inflate the total and understate the percentages.
total_unique_ips = all_ips['IP'].nunique()

for port in unique_ports:
    port_count = int((device_df['Port'] == port).sum())

    precent_values_found.loc[port, 'Count'] = port_count
    if total_unique_ips > 0:
        precent_values_found.loc[port, 'Percent of Total'] = port_count / total_unique_ips * 100
    else:
        # No input IPs at all: the share is undefined rather than a division error.
        precent_values_found.loc[port, 'Percent of Total'] = float('nan')


if debug == 1:
    display(precent_values_found)
    


In [None]:
# Chart 1: number of responding IPs for each port
plt.figure()
count_axes = precent_values_found['Count'].plot.bar(
    title='Counts of IPs that Returned Responses by Port',
    figsize=(14,6),
)
count_axes.set_xlabel('Port Number')
count_axes.set_ylabel('Count')

# Chart 2: the same counts expressed as a percentage of all input IPs
plt.figure()
percent_axes = precent_values_found['Percent of Total'].plot.bar(
    title=f'Percent of IPs that Returned Responses from {total_unique_ips} Total',
    figsize=(14,6),
)
percent_axes.set_xlabel('Port Number')
percent_axes.set_ylabel('Percent (Value between 0 and 100) ')

In [None]:
# Number of records assigned to each device type
device_type_counts = device_df['Device_Type'].value_counts()

if debug == 1:
    display(device_type_counts)

# Bar chart of the device type breakdown
plt.figure()
device_type_axes = device_type_counts.plot.bar(title='Device Types in Data')
device_type_axes.set_xlabel('Device Types')
device_type_axes.set_ylabel('Counts')
