Copyright (C) 2020-2024 - Raytheon BBN Technologies Corp.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0.

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
either express or implied. See the License for the specific
language governing permissions and limitations under the License.

Distribution Statement "A" (Approved for Public Release,
Distribution Unlimited).

This material is based upon work supported by the Defense
Advanced Research Projects Agency (DARPA) under Contract No.
HR001119C0102.  The opinions, findings, and conclusions stated
herein are those of the authors and do not necessarily reflect
those of DARPA.


# Greenwich Meanie Daily and Weekly Analysis on a Specified /24 Destination Subnet

Runs analysis on the Meanie data to show weekly and daily patterns for a given /24 subnet.

<b>Input data provided should only include one week per year</b>

In [None]:
from datetime import datetime
import warnings
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import ipaddress
import os
import math
from textwrap import wrap

# Silence every warning for the whole session; otherwise there are warnings about
# having too many graphs open.
# NOTE(review): this is a blanket filter, so unrelated warnings are hidden as well.
warnings.filterwarnings('ignore')

### User Defined Variables

In [None]:

# Set value to 1 to see intermediate outputs for debugging. 0 otherwise (recommended)
debug = 0

# First three octets of the destination /24 subnet to keep (no trailing dot)
destination_address_of_interest = '10.1.202'

# Set value to 1 to do geolocation analysis using results generated with 'ipinfo.io'. 0 otherwise (recommended)
# If this is 1, set geo_location_db_file below to the path of the geolocations database
analyze_geolocation = 0

# Path and filename of the previously generated Geolocation database (CSV)
geo_location_db_file = 'db/full_geolocations_db.csv'

# Parent directory where meanie text data is stored
input_dir_path = r'/home/nice-user/Weekly'

# Directory where meanie results (CSV files and figures) are written
output_dir_path = r'/home/nice-user/Weekly'

# Set value to 1 to save figures to output_dir_path. 0 otherwise
save_figs = 1

# Years present in the data
years_of_interest = [2020, 2021, 2022, 2023]

# Countries we want to extract information for (each country listed exactly once)
countries_of_interest = [
    "United Kingdom", "Philippines", "Brazil", "India", "Greece", "United States", "Canada",
    "Serbia", "Poland", "Croatia", "Indonesia", "South Africa", "China", "Russia",
    "Singapore", "Argentina", "Australia", "Czech Republic", "Bangladesh", "France", "Malaysia", "Turkey",
]

# Timezone offset (in hours) for each country relative to the US Eastern Timezone.
# These are fixed numbers; one offset is used per country for all of its data.
countries_timezones = {
    "United Kingdom": 5, "Philippines": 12, "Brazil": 1, "India": 9.5, "Greece": 7,
    "United States": -1, "Canada": -1, "Serbia": 6, "Poland": 6, "Croatia": 6,
    "Indonesia": 12, "South Africa": 6, "China": 12, "Russia": 7, "Singapore": 13,
    "Argentina": 2, "Australia": 14, "Czech Republic": 6, "Bangladesh": 10, "France": 6,
    "Malaysia": 12, "Turkey": 7,
}


### Initialization of Variables and Function Declarations

In [None]:

# Columns in the meanie text files whose values are hexadecimal text
hex_columns = ['Source_Address', 'Destination_Address', 'Source_Port', 'Destination_Port', 'UDP_Checksum', 'TTL', 'IPID']

# Columns from meanie text that will need to be formatted as IP addresses
ip_columns = ['Source_Address', 'Destination_Address']

# Columns for computing before and after filtering statistics
stat_columns = ['Source_Address', 'Destination_Address', 'TTL', 'Destination_Port', 'Payload_Length', 'Destination_Slash24']

# Accumulator for the raw packet data and the per-year statistics (one dict per year)
orig_data = pd.DataFrame()
data_stats = {year: {} for year in years_of_interest}

# File paths are built by plain string concatenation later on, so both directories
# must end with '/'. endswith() also copes with an empty string, where indexing the
# last character would raise IndexError.
if not input_dir_path.endswith('/'):
    input_dir_path += '/'

if not output_dir_path.endswith('/'):
    output_dir_path += '/'

# Every host address inside the /24 of interest (last octet 0..255)
all_dest_addresses = [f'{destination_address_of_interest}.{x}' for x in range(256)]

if debug == 1:
    print(input_dir_path)
    print(data_stats)


In [None]:
# Function that computes Stats on the data before and after filtering

def compute_data_stats(orig_data, years_of_interest, stat_columns, prefix):
    """Fill the module-level ``data_stats`` dict with per-year statistics.

    For every year the following keys are written to ``data_stats[year]``:
    ``{prefix}_Packet_Counts`` (row count), ``{prefix}_Packet_norm`` (row count per
    unique month) and, for each column in ``stat_columns``,
    ``{prefix}_Unique_{column}_Counts``, ``{prefix}_Unique_{column}s`` and
    ``{prefix}_Unique_{column}_norm``.

    Args:
        orig_data: DataFrame with at least 'Year', 'Month' and every column
            named in ``stat_columns``.
        years_of_interest: iterable of year values to summarise.
        stat_columns: column names for which unique-value stats are computed.
        prefix: string prepended to every key (e.g. 'All', 'Filtered').

    Returns:
        None; results are stored in the global ``data_stats``.
    """
    for year in years_of_interest:
        # Create the per-year dict on demand so an unexpected year cannot raise KeyError
        year_stats = data_stats.setdefault(year, {})

        current_year_data = orig_data[orig_data['Year'] == year]
        current_year_unique_months = current_year_data['Month'].nunique()
        if debug == 1:
            print(year, ' unique months: ', current_year_unique_months)

        def per_month(count):
            # A year without any rows has zero months; report 0.0 instead of
            # raising ZeroDivisionError.
            return count / current_year_unique_months if current_year_unique_months else 0.0

        packet_count = len(current_year_data)
        year_stats[f'{prefix}_Packet_Counts'] = packet_count
        year_stats[f'{prefix}_Packet_norm'] = per_month(packet_count)

        for column in stat_columns:
            unique_count = current_year_data[column].nunique()
            year_stats[f'{prefix}_Unique_{column}_Counts'] = unique_count
            year_stats[f'{prefix}_Unique_{column}s'] = current_year_data[column].unique()
            year_stats[f'{prefix}_Unique_{column}_norm'] = per_month(unique_count)

### Read / Clean / Format / Merge Data and Compute Filtering Stats

In [None]:
## Read the Meanie Text Data
# Values in the input data, by column position:
#
#  0  Saddr - Hex
#  1  Daddr - Hex
#  2  Sport - Hex
#  3  Dport - Hex
#  4  Proto – always UDP (17)
#  5  Timestamp
#  6  UdpCksum - Hex
#  7  PayloadLen – the length of the payload, in bytes (the UDP length field, minus 8 for the UDP header itself)
#  8  Payload – Hex
#  9  TTL - a hex
#  10 IPID - a hex

print('Reading Meanie Data')

# Hex fields must stay text: without an explicit dtype, a column that happens to
# contain only decimal digits (e.g. TTL "40") is inferred as a number and can no
# longer be parsed with int(value, 16).
hex_field_dtypes = {position: str for position in (0, 1, 2, 3, 6, 8, 9, 10)}

# All text files in the input directory, sorted so the row order is reproducible
file_list = sorted(name for name in os.listdir(input_dir_path) if name.endswith('.txt'))

# if there is 1 or more file type of interest
if file_list:

    # Collect one frame per file and concatenate once at the end; concatenating
    # inside the loop re-copies all rows read so far on every iteration.
    frames = []
    for file_name in file_list:
        full_file_path = input_dir_path + file_name

        # Read data from the file only if the file is not empty, i.e. file size > 0
        if os.path.getsize(full_file_path) > 0:
            current_csv = pd.read_csv(full_file_path, header=None, dtype=hex_field_dtypes)
            # Remember the source file; the date and hour are parsed from its name later
            current_csv['FileName'] = file_name
            frames.append(current_csv)

    # Rebuild orig_data from scratch so re-running this cell does not append the
    # same rows a second time.
    orig_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

else:
    print("No compatible Files in the directory")

if debug == 1:
    display(orig_data)


In [None]:

# Format, summarise and filter the data, but only if something was read
if len(orig_data) >= 1:

    # Give the positional CSV columns their field names
    orig_data = orig_data.rename(columns={0: 'Source_Address', 1:'Destination_Address', 2:'Source_Port', 
                                          3:'Destination_Port', 4:'Protocol', 5:'Packet_Timestamp', 
                                          6:'UDP_Checksum', 7:'Payload_Length', 8:'Payload', 9:'TTL', 10:'IPID'})
    
    # Hex text -> int. str() first: a value that was already parsed as an integer
    # (a field made only of decimal digits) would make int(value, 16) raise TypeError.
    for hc in hex_columns:
        orig_data[hc] = orig_data[hc].apply(lambda x: int(str(x), 16))
        
    # Integer address -> dotted/colon notation string
    for ic in ip_columns:
        orig_data[ic] = orig_data[ic].apply(lambda x: str(ipaddress.ip_address(x)))
        
    # The file name is split on '-' into exactly six parts; the middle four carry the
    # date and hour of the capture file.
    orig_data[['Prefix', 'Year', 'Month', 'Day', 'Hour', 'Post']] = orig_data['FileName'].str.split('-', expand=True)
    orig_data = orig_data.drop(['Prefix', 'Post'], axis=1)
    # Build the date while the parts are still strings, then convert them to int
    orig_data['Data_Date'] = orig_data['Year'] + '-' + orig_data['Month'] + '-' + orig_data['Day']
    orig_data['Data_Date'] = pd.to_datetime(orig_data['Data_Date'], format='%Y-%m-%d').dt.date
    orig_data[['Year', 'Month', 'Day', 'Hour']] = orig_data[['Year', 'Month', 'Day', 'Hour']].astype(int)
    # First three octets of the destination address
    orig_data['Destination_Slash24'] = orig_data['Destination_Address'].apply(lambda row: '.'.join(row.split('.')[:3]))
    
    # Epoch value -> naive datetime. datetime.fromtimestamp converts to the local
    # timezone of the machine running the notebook.
    # NOTE(review): the per-country offsets are documented as relative to US Eastern,
    # so this presumably expects the machine to be on US Eastern time — confirm.
    orig_data['Packet_Timestamp'] = [datetime.fromtimestamp(x) for x in orig_data['Packet_Timestamp']]
    
    
    if debug == 1:
        display(orig_data)
    
    # Extract Stats about the full data
    compute_data_stats(orig_data, years_of_interest, stat_columns, 'All')
    
    if debug == 1:
        print('Before Filtering')
        for year in years_of_interest:
            print(f'{year} Num Unique Destination Addresses: ', data_stats[year]['All_Unique_Destination_Address_Counts'])
    
    # Keep a handle on the full data; orig_data is rebound to a new frame below, so
    # the unfiltered rows stay reachable under this name.
    unfiltered_data = orig_data
    
    
    # Filter the data by subnet of interest. .copy() makes the result an independent
    # frame, so later cells that assign columns do not write into a slice.
    orig_data = orig_data[orig_data["Destination_Address"].isin(all_dest_addresses)].copy()
    # Extract Stats about the filtered data
    compute_data_stats(orig_data, years_of_interest, stat_columns, 'Filtered')
    
    if debug == 1:
        print('\nAfter Filtering')
        for year in years_of_interest:
            print(f'{year} Num Unique Destination Addresses: ', data_stats[year]['Filtered_Unique_Destination_Address_Counts'])
    

In [None]:
## Load the Geolocations database (only when geolocation analysis is enabled)
if analyze_geolocation == 1:
    print("Merging with GeoLocations data")

    # Read the CSV and rename its 'IP' column to 'Source_Address', the column name
    # used for source addresses in the packet data
    database_geo_locations = pd.read_csv(geo_location_db_file).rename(columns={'IP': 'Source_Address'})

    if debug == 1:
        display(database_geo_locations)

In [None]:
# Debug only: print the column summary of the geolocation database
if analyze_geolocation == 1 and debug == 1:
    database_geo_locations.info()

In [None]:
# Attach geolocation columns to every packet row. A left join keeps all packet rows,
# including those whose source address is not in the database.
if analyze_geolocation == 1:
    merged_table = orig_data.merge(database_geo_locations, how='left', on="Source_Address")

    if debug == 1:
        display(merged_table)


In [None]:
# Debug only: column summary of the merged table and number of rows per country
if analyze_geolocation == 1:
    if debug == 1:
        merged_table.info()
        # A bare expression inside an `if` is not echoed by the notebook, so the
        # counts have to be displayed explicitly.
        display(merged_table['Country'].value_counts())

In [None]:
# Delete unnecessary columns
if analyze_geolocation == 1:
    subset_table = merged_table.drop(columns=['Timestamp', 'Payload'])

    if debug == 1:
        print('subset_table.size', subset_table.size)
        # A bare expression inside an `if` is not echoed by the notebook, so the
        # preview has to be displayed explicitly.
        display(subset_table.head())

In [None]:
# Drop rows that are exact duplicates of another row across all remaining columns
if analyze_geolocation == 1:
    subset_table = subset_table.drop_duplicates()

    if debug == 1:
        print('subset_table.size', subset_table.size)
        display(subset_table)

In [None]:
# Debug only: print the column summary of the de-duplicated table
if analyze_geolocation == 1 and debug == 1:
    subset_table.info()

### Weekly Geolocation Maps

In [None]:
if analyze_geolocation == 1:
    # Axis limits used to zoom the map onto each region:
    # (name, [min longitude, max longitude], [min latitude, max latitude])
    continent_coordinates = {
        name: {'xlim': xlim, 'ylim': ylim}
        for name, xlim, ylim in (
            ('World', [-180, 180], [-90, 90]),
            ('Europe', [-20, 50], [25, 70]),
            ('Asia', [60, 145], [-10, 50]),
            ('North America', [-130, -60], [15, 60]),
            ('South America', [-85, -30], [-55, 15]),
            ('Africa', [-35, 60], [-40, 40]),
            ('Australia', [110, 187], [-50, -5]),
        )
    }


In [None]:
if analyze_geolocation == 1:
    # Number of distinct source addresses per year and location
    # (country / region / city / coordinates)
    unique_source_counts_by_year_location = (
        subset_table
        .groupby(['Year', 'Country', 'Region', 'City', 'Latitude', 'Longitude'])['Source_Address']
        .nunique()
        .reset_index()
    )
    # Persist the table next to the other results
    unique_source_counts_by_year_location.to_csv(f'{output_dir_path}CityCountsByWeek.csv', index = False)


In [None]:
# One map per (year, region): a bubble per city, sized and coloured by the number of
# unique source addresses seen from that city.
if analyze_geolocation == 1:
    unique_years = unique_source_counts_by_year_location['Year'].unique()
    # Shared colour-scale maximum so that all maps are comparable
    max_count = unique_source_counts_by_year_location['Source_Address'].max()

    # Load the base map once instead of re-reading it for every figure.
    # NOTE(review): gpd.datasets was deprecated in GeoPandas 0.14 and removed in 1.0;
    # on newer versions the Natural Earth file has to be supplied explicitly.
    worldmap = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

    for current_year in unique_years:
        current_year_df = unique_source_counts_by_year_location.loc[unique_source_counts_by_year_location['Year']==current_year]
        year_dates = subset_table.loc[subset_table['Year']==current_year, 'Data_Date']
        min_date = year_dates.min()
        max_date = year_dates.max()

        # The plotted points only depend on the year, not on the region
        longitudes = current_year_df['Longitude']
        latitudes = current_year_df['Latitude']
        counts = current_year_df['Source_Address']

        for continent, limits in continent_coordinates.items():

            fig, ax = plt.subplots(figsize=(12,6))

            # Plotting world map
            worldmap.plot(color="lightgrey", ax=ax)
            ax.set_axis_off()

            # Plotting longitudes and latitudes
            plt.scatter(longitudes, latitudes, s=counts, c=counts, alpha=0.6, vmin=0, vmax=max_count, cmap='autumn')
            plt.colorbar(label="Count of Unique Source Addresses by City")
            # Zoom onto the region
            plt.xlim(limits['xlim'])
            plt.ylim(limits['ylim'])
            plt.title(f'{continent}\n{min_date} to {max_date}')
            if save_figs == 1:
                plt.savefig(f'{output_dir_path}{continent}_map_{current_year}.png')
            plt.show()


### ASN Plots

In [None]:
# Figure out which ASNs have 5% of the packets or more for that country and year and
# then generate a pie chart. All ASNs with less than 5% of the packets are grouped
# into the 'Other' category.
if analyze_geolocation == 1:
    
    for country in countries_timezones:

        current_country_df = subset_table[subset_table['Country'] == country]

        # Packets per (year, ASN). The hour of day is irrelevant for the pie chart, so
        # no timezone shift is applied here.
        country_year_packet_counts = current_country_df.groupby(['Year', 'AS_Name'])['Packet_Timestamp'].count().reset_index()
        country_year_packet_counts = country_year_packet_counts.rename(columns={'Packet_Timestamp': 'Packet_Count', 
                                                                                'AS_Name':'AS Name'})

        for year in years_of_interest:

            # .copy() so that relabelling below does not write into a slice
            current_year_df = country_year_packet_counts[country_year_packet_counts['Year'] == year].copy()

            # The threshold is 5% of the total PACKET count of this country/year
            # (not 5% of the number of rows in the grouped table).
            asn_totals = current_year_df.groupby('AS Name')['Packet_Count'].sum()
            minor_asns = asn_totals[asn_totals < 0.05 * asn_totals.sum()].index
            current_year_df.loc[current_year_df['AS Name'].isin(minor_asns), 'AS Name'] = 'Other'

            asn_packet_counts = current_year_df.groupby(['AS Name'])['Packet_Count'].sum().reset_index()
            asn_packet_counts = asn_packet_counts.set_index('AS Name')

            fig, ax = plt.subplots() 
            title = plt.title(f'{year} ASN Breakdown in {country}')
            title.set_ha("left")
            plt.gca().axis("equal")
            pie = plt.pie(asn_packet_counts['Packet_Count'], startangle=0)
            # Wrap long AS names so the legend stays readable
            labels = [ '\n'.join(wrap(l, 40)) for l in asn_packet_counts.index]
            plt.legend(pie[0],labels, bbox_to_anchor=(1,0.5), loc="center right", fontsize=10, bbox_transform=plt.gcf().transFigure)
            # Push the pie to the left to leave room for the legend
            plt.subplots_adjust(left=0.0, bottom=0.1, right=0.45)
            if save_figs == 1:
                plt.savefig(f'{output_dir_path}{country}_ASN_Breakdown_{year}.png')


### Hourly Plots
Counts of all the packets for the week by each hour

In [None]:
# Global Data: packets per hour of day, one panel per year

year_hour_packet_counts = orig_data.groupby(['Year', 'Hour'])['Packet_Timestamp'].count().reset_index()
year_hour_packet_counts = year_hour_packet_counts.rename(columns={'Packet_Timestamp': 'Packet_Count'})

# Two rows of panels, as many columns as needed to fit every year
rows = 2
cols = math.ceil(len(years_of_interest) / rows)

# squeeze=False keeps axs two-dimensional even when cols == 1 (one or two years),
# so axs[row, col] indexing below always works. No separate plt.figure() call:
# plt.subplots already creates the figure.
fig, axs = plt.subplots(rows, cols, sharex = True, sharey=True, figsize=(12,6), squeeze=False)
fig.suptitle(f'Hourly Breakdowns Worldwide')

for position, current_year in enumerate(years_of_interest):
    year_dates = orig_data.loc[orig_data['Year']==current_year, 'Data_Date']
    min_date = year_dates.min()
    max_date = year_dates.max()

    hourly_packet_counts = year_hour_packet_counts[year_hour_packet_counts['Year'] == current_year]

    # Fill the grid row by row
    ax_row, ax_col = divmod(position, cols)
    panel = axs[ax_row, ax_col]
    panel.bar(hourly_packet_counts['Hour'], hourly_packet_counts['Packet_Count'])
    panel.set_title(f'Hourly {min_date} to {max_date}')
    panel.grid()

for ax in axs.flat:
    ax.set(xlabel=f"Hour", ylabel="Packet count")
    # Only the outer panels keep their axis labels
    ax.label_outer()

if save_figs == 1:
    fig.savefig(f'{output_dir_path}Global_Hourly.png')


In [None]:
# By Country: packets per local hour of day, one figure per country, one panel per year

if analyze_geolocation == 1:
    
    for country, timezone in countries_timezones.items():

        # .copy() so that the 'Hour' assignment below does not write into a slice
        current_country_df = subset_table[subset_table['Country'] == country].copy()
        # Shift the timestamps by the country's offset and keep the hour of day
        current_country_df['Hour'] = (current_country_df['Packet_Timestamp'] + pd.Timedelta(hours=timezone)).dt.hour

        country_year_packet_counts = current_country_df.groupby(['Year', 'AS_Name', 'Hour'])['Packet_Timestamp'].count().reset_index()
        country_year_packet_counts = country_year_packet_counts.rename(columns={'Packet_Timestamp': 'Packet_Count', 
                                                                                'AS_Name':'AS Name'})

        # Two rows of panels, as many columns as needed to fit every year
        rows = 2
        cols = math.ceil(len(years_of_interest) / rows)

        # squeeze=False keeps axs two-dimensional even when cols == 1, so axs[row, col]
        # always works. plt.subplots creates the figure; no plt.figure() needed.
        fig, axs = plt.subplots(rows, cols, sharex = True, sharey=True, figsize=(12,6), squeeze=False)
        fig.suptitle(f'{country} Hourly Breakdowns')

        for position, current_year in enumerate(years_of_interest):
            # Date range of all data for the year (not restricted to this country)
            year_dates = subset_table.loc[subset_table['Year']==current_year, 'Data_Date']
            min_date = year_dates.min()
            max_date = year_dates.max()

            current_year_df = country_year_packet_counts[country_year_packet_counts['Year'] == current_year]
            # Collapse the per-ASN counts into one count per hour
            hourly_packet_counts = current_year_df.groupby(['Hour'])['Packet_Count'].sum().reset_index()

            # Fill the grid row by row
            ax_row, ax_col = divmod(position, cols)
            panel = axs[ax_row, ax_col]
            panel.bar(hourly_packet_counts['Hour'], hourly_packet_counts['Packet_Count'])
            panel.set_title(f'Hourly {min_date} to {max_date}')
            panel.grid()

        for ax in axs.flat:
            ax.set(xlabel=f"Hour (EST{'+' if timezone >= 0 else ''}{timezone})", ylabel="Packet count")
            # Only the outer panels keep their axis labels
            ax.label_outer()

        if save_figs == 1:
            fig.savefig(f'{output_dir_path}{country}_Hourly.png')


### Weekly Plots
Counts of all the packets daily grouped by 3 hour intervals

In [None]:
# Global Data: packets per 3-hour interval of the week, one panel per year

# Index of the 3-hour interval within the week: 0 = Monday 00:00-02:59 ... 55 = Sunday
# 21:00-23:59. It is kept in a separate Series so that orig_data['Hour'] (the hour of
# day used by the hourly plots) is not overwritten.
packet_times = orig_data['Packet_Timestamp']
week_interval = ((packet_times.dt.weekday * 24 + packet_times.dt.hour) // 3).rename('Week_Interval')

year_hour_packet_counts = orig_data.groupby(['Year', week_interval])['Packet_Timestamp'].count().reset_index()
year_hour_packet_counts = year_hour_packet_counts.rename(columns={'Packet_Timestamp': 'Packet_Count',
                                                                  'Week_Interval': 'Hour'})

# Two rows of panels, as many columns as needed to fit every year
rows = 2
cols = math.ceil(len(years_of_interest) / rows)

# squeeze=False keeps axs two-dimensional even when cols == 1, so axs[row, col]
# always works. plt.subplots creates the figure; no plt.figure() needed.
fig, axs = plt.subplots(rows, cols, sharex = True, sharey=True, figsize=(12,6), squeeze=False)
fig.suptitle(f'Weekly Breakdowns Worldwide')

for position, current_year in enumerate(years_of_interest):
    year_dates = orig_data.loc[orig_data['Year']==current_year, 'Data_Date']
    min_date = year_dates.min()
    max_date = year_dates.max()

    hourly_packet_counts = year_hour_packet_counts[year_hour_packet_counts['Year'] == current_year]

    # Fill the grid row by row
    ax_row, ax_col = divmod(position, cols)
    panel = axs[ax_row, ax_col]
    panel.bar(hourly_packet_counts['Hour'], hourly_packet_counts['Packet_Count'])
    panel.set_title(f'Weekly {min_date} to {max_date}')
    panel.grid()
    panel.tick_params(labelrotation=30)

# Eight 3-hour intervals per day: put one tick at the middle of each day
plt.setp(axs, xticks=list(range(4, 7*8 +1, 8)), xticklabels=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])


for ax in axs.flat:
    ax.set(xlabel=f"Weekday", ylabel="Packet count")
    # Only the outer panels keep their axis labels
    ax.label_outer()

if save_figs == 1:
    fig.savefig(f'{output_dir_path}Global_Weekly.png')


In [None]:
# By Country: packets per 3-hour interval of the week, one figure per country

if analyze_geolocation == 1:

    # NOTE(review): unlike the hourly per-country plots, no timezone offset is applied
    # here, so weekdays are those of the unshifted Packet_Timestamp — confirm intended.
    for country in countries_timezones:

        # .copy() so that the 'Hour' assignment below does not write into a slice
        current_country_df = subset_table[subset_table['Country'] == country].copy()

        # Index of the 3-hour interval within the week: 0 = Monday 00:00-02:59 ... 55 =
        # Sunday 21:00-23:59
        packet_times = current_country_df['Packet_Timestamp']
        current_country_df['Hour'] = (packet_times.dt.weekday * 24 + packet_times.dt.hour) // 3

        country_year_packet_counts = current_country_df.groupby(['Year', 'AS_Name', 'Hour'])['Packet_Timestamp'].count().reset_index()
        country_year_packet_counts = country_year_packet_counts.rename(columns={'Packet_Timestamp': 'Packet_Count', 
                                                                                'AS_Name':'AS Name'})

        # Two rows of panels, as many columns as needed to fit every year
        rows = 2
        cols = math.ceil(len(years_of_interest) / rows)

        # squeeze=False keeps axs two-dimensional even when cols == 1, so axs[row, col]
        # always works. plt.subplots creates the figure; no plt.figure() needed.
        fig, axs = plt.subplots(rows, cols, sharex = True, sharey=True, figsize=(12,6), squeeze=False)
        fig.suptitle(f'{country} Weekly Breakdowns')

        for position, current_year in enumerate(years_of_interest):
            # Date range of all data for the year (not restricted to this country)
            year_dates = subset_table.loc[subset_table['Year']==current_year, 'Data_Date']
            min_date = year_dates.min()
            max_date = year_dates.max()

            current_year_df = country_year_packet_counts[country_year_packet_counts['Year'] == current_year]
            # Collapse the per-ASN counts into one count per interval
            hourly_packet_counts = current_year_df.groupby(['Hour'])['Packet_Count'].sum().reset_index()

            # Fill the grid row by row
            ax_row, ax_col = divmod(position, cols)
            panel = axs[ax_row, ax_col]
            panel.bar(hourly_packet_counts['Hour'], hourly_packet_counts['Packet_Count'])
            panel.set_title(f'Weekly {min_date} to {max_date}')
            panel.grid()
            panel.tick_params(labelrotation=30)

        # Eight 3-hour intervals per day: put one tick at the middle of each day
        plt.setp(axs, xticks=list(range(4, 7*8 +1, 8)), xticklabels=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])


        for ax in axs.flat:
            ax.set(xlabel=f"Weekday", ylabel="Packet count")
            # Only the outer panels keep their axis labels
            ax.label_outer()

        if save_figs == 1:
            fig.savefig(f'{output_dir_path}{country}_Weekly.png')
