In [5]:
# Run this cell to set up the notebook, but please don't change it.

# These lines import the numpy and datascience modules.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datascience import *
import warnings
warnings.simplefilter('ignore', FutureWarning)

In [6]:
# Function to parse a line (in JSON format) and extract into N fields (specified in the array) in csv format
def parse_line(line, fields_to_extract):
    try:
        data = json.loads(line)
        parsed_fields = []
        
        for field in fields_to_extract:
            # Get the field value or empty string if not present
            value = data.get(field, '')
            # Convert to string and replace any commas to avoid CSV issues
            parsed_fields.append(str(value).replace(',', ';'))
        
        # Join the fields with commas
        return ','.join(parsed_fields)
    except json.JSONDecodeError:
        return None  # Return None for invalid JSON lines

    
    
# Function definition to Read the file and write parsed lines to output file
def file_convert_csv(input_file, output_file, fields_list):
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        outfile.write(','.join(fields_list) + '\n')
        for line in infile:
            parsed_line = parse_line(line.strip(), fields_to_extract)
            if parsed_line is not None:  # Only write valid parsed lines
                outfile.write(parsed_line + '\n')

    print(f"Parsing complete. Output written to {output_file}")
    
    

# Step 1: datafile download and format converting

Step 1 - download the bz2 file from the website, extract into text file, and convert the json format data file into csv format
download link (folder) - 
https://csr.lanl.gov/data-fence/1742781702/j2hoTmkCmL2Rr_G6xYPRIxDEHj0=/unified-host-network-dataset-2017/wls/


In [7]:

# modify from Sean's Sample code and download the dialy *.bz2 host data file

%pip install gdown
import gdown
import pandas as pd
import dask.dataframe as dd

# specify the number of day in two-digit
no_Day = "01"

URL = f"https://csr.lanl.gov/data-fence/1742781702/j2hoTmkCmL2Rr_G6xYPRIxDEHj0=/unified-host-network-dataset-2017/wls/wls_day-{no_Day}.bz2"

gdown.download(URL, quiet = False)


Collecting gdown
  Using cached gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Collecting filelock (from gdown)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Using cached gdown-5.2.0-py3-none-any.whl (18 kB)
Using cached filelock-3.18.0-py3-none-any.whl (16 kB)
Installing collected packages: filelock, gdown
Successfully installed filelock-3.18.0 gdown-5.2.0
Note: you may need to restart the kernel to use updated packages.


Downloading...
From: https://csr.lanl.gov/data-fence/1742781702/j2hoTmkCmL2Rr_G6xYPRIxDEHj0=/unified-host-network-dataset-2017/wls/wls_day-01.bz2
To: /home/jovyan/UEBAProject/DataProcess/host/Day2/wls_day-01.bz2
100%|██████████| 409M/409M [02:57<00:00, 2.30MB/s] 


'wls_day-01.bz2'

In [8]:
#Extract the bz2 file

import bz2
import shutil

def extract_bz2(bz2_filepath, output_filepath):
    """Extracts a .bz2 file.

    Args:
        bz2_filepath: Path to the .bz2 file.
        output_filepath: Path to save the extracted file.
    """
    try:
        with bz2.BZ2File(bz2_filepath, 'rb') as compressed_file:
            with open(output_filepath, 'wb') as uncompressed_file:
                shutil.copyfileobj(compressed_file, uncompressed_file)
        print(f"Successfully extracted '{bz2_filepath}' to '{output_filepath}'")
    except FileNotFoundError:
         print(f"Error: The file '{bz2_filepath}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
compressed_file_path = f"wls_day-{no_Day}.bz2"
extracted_file_path = f"wls_day-{no_Day}"
extract_bz2(compressed_file_path, extracted_file_path)

Successfully extracted 'wls_day-01.bz2' to 'wls_day-01'


# Step 2: Data loading into a table

Define the fields we want to extract (total of 21 fields in order)
According to the document - https://csr.lanl.gov/data/2017/#citing
Need to review the content of the file to see if some attributes should be ignored.
   1. 'Time',
   2. 'EventID',
   3. 'LogHost',
   4. 'LogonType',
   5. 'LogonTypeDescription',
   6. 'UserName',
   7. 'DomainName',
   8. 'LogonID',
   9. 'SubjectUserName',
  10. 'SubjectDomainName',
  11. 'SubjectLogonID',
  12. 'Status',
  13. 'Source',
  14. 'ServiceName',
  15. 'Destination',
  16. 'AuthenticationPackage',
  17. 'FailureReason',
  18. 'ProcessName',
  19. 'ProcessID',
  20. 'ParentProcessName',
  21. 'ParentProcessID'


**** Due to process limitation affected by the size of datafile. Some unuse columns are not included in the csv file****

Step 2 - load the data file into a table if necessary clean the incorrect format data

1. UserName
2. EventID
3. LogHost
4. LogonID
5. DomainName
6. ParentProcessName
7. ParentProcessID
8. ProcessName
9. Time
10. ProcessID


In [3]:
no_Day = "01"


In [9]:
import json

# Define the fields we want to extract (total of 10 fields)
fields_to_extract = [
    'UserName',
    'EventID',
    'LogHost',
    'LogonID',
    'DomainName',
    'ParentProcessName',
    'ParentProcessID',
    'ProcessName',
    'Time',
    'ProcessID'
]

"""

# Define the fields we want to extract (total of 21 fields in order)
# According to the document - https://csr.lanl.gov/data/2017/#citing
fields_to_extract = [
    'Time',
    'EventID',
    'LogHost',
    'LogonType',
    'LogonTypeDescription',
    'UserName',
    'DomainName',
    'LogonID',
    'SubjectUserName',
    'SubjectDomainName',
    'SubjectLogonID',
    'Status',
    'Source',
    'ServiceName',
    'Destination',
    'AuthenticationPackage',
    'FailureReason',
    'ProcessName',
    'ProcessID',
    'ParentProcessName',
    'ParentProcessID'
]
"""

# Input and output file paths
input_file = f"wls_day-{no_Day}"
output_file = f"wls_day-{no_Day}.csv"

# Call the function to convert the data file into a text file (*.csv) with the array of the data field
file_convert_csv(input_file, output_file, fields_to_extract)

Parsing complete. Output written to wls_day-01.csv


In [3]:
no_Day = '01'
output_file = f"wls_day-{no_Day}.csv"

# divide the datafile into three parts

In [4]:
# CSV is very large to avoid loading all rows into memory - 
# divide the large csv file into three parts sequentially

import csv
import math

def split_csv_large(input_file, output_prefix, num_files=6):
    # First pass to count rows (excluding header)
    with open(input_file, 'r') as infile:
        row_count = sum(1 for row in infile) - 1
    
    rows_per_file = math.ceil(row_count / num_files)
    
    # Second pass to split the file
    with open(input_file, 'r') as infile:
        reader = csv.reader(infile)
        header = next(reader)
        
        file_index = 1
        current_row = 0
        outfile = None
        
        for row in reader:
            if current_row % rows_per_file == 0:
                if outfile is not None:
                    outfile.close()
                outfile = open(f"{output_prefix}_{file_index}.csv", 'w', newline='')
                writer = csv.writer(outfile)
                writer.writerow(header)
                file_index += 1
            writer.writerow(row)
            current_row += 1
        
        if outfile is not None:
            outfile.close()

# Example usage:
split_csv_large(output_file, f"{no_Day}S_Part")

In [12]:
# approximately equal-sized files with random distribution


import csv
import random

def balanced_random_split(input_file, output_prefix, num_files=3, random_seed=None):
    if random_seed is not None:
        random.seed(random_seed)
    
    # First pass to count rows (excluding header)
    with open(input_file, 'r') as infile:
        reader = csv.reader(infile)
        header = next(reader)
        row_count = sum(1 for row in infile) - 1
        print(f"total number of lines - {row_count}")
    
    # Generate random assignments with balanced counts
    assignments = [i % num_files for i in range(row_count)]
    random.shuffle(assignments)
    
    # Initialize output files
    output_files = [open(f"{output_prefix}{i+1}.csv", 'w', newline='') for i in range(num_files)]
    writers = [csv.writer(f) for f in output_files]
    
    # Write headers
    for writer in writers:
        writer.writerow(header)
    
    # Process file and write rows
    with open(input_file, 'r') as infile:
        reader = csv.reader(infile)
        next(reader)  # skip header
        
        for i, row in enumerate(reader):
            writers[assignments[i]].writerow(row)
    
    # Close files
    for f in output_files:
        f.close()

# Example usage:
balanced_random_split(output_file, f"{no_Day}R_part", num_files=3)

total number of lines - 55609150


IndexError: list index out of range

# File access/download from Google Drive


In [None]:
# Test to access the data file on Google Drive
# Define the file path and the number of lines to print

import pandas as pd

# The link for the wls_day-02_001.csv (4.3MB) file - 
# https://drive.google.com/file/d/1t05t6Sn1JrvVfiLOF8fZwZFumzBh0gLI/view?usp=sharing

''' 
FILE_ID = "1t05t6Sn1JrvVfiLOF8fZwZFumzBh0gLI"

URL = f"https://drive.google.com/uc?id={FILE_ID}"

df = pd.read_csv(URL)
print(df.head())

wls_day2_001_url = Table().read_table(URL)
wls_day2_001_url.show(10)

'''

In [None]:
%pip install gdown
import gdown
import pandas as pd
import dask.dataframe as dd

FILE_ID = "1UjFNwQltfnvAqCheNAAJ8H_erm0c06Tb"
URL = f"https://drive.google.com/uc?id={FILE_ID}"

output_file = "large_file.csv"
gdown.download(URL, output_file, quiet=False)

df = dd.read_csv(output_file)
df.head() 

In [None]:
# Divide the original datafile randomly
# Can't perform due to the memory limitation

import numpy as np
import pandas as pd

def split_csv_into_three(input_file, fileNameHead, output_prefix='output'):
    # Read the original CSV file
    df = pd.read_csv(input_file)
    
    # Shuffle the DataFrame rows
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Calculate split points (approximately 1/3 and 2/3 of the data)
    split1 = len(df) // 3
    split2 = 2 * split1
    
    # Split into three parts
    df1 = df.iloc[:split1]
    df2 = df.iloc[split1:split2]
    df3 = df.iloc[split2:]
    
    # Save to three separate files
    df1.to_csv(f'{fileNameHead}{output_prefix}1.csv', index=False)
    df2.to_csv(f'{fileNameHead}{output_prefix}2.csv', index=False)
    df3.to_csv(f'{fileNameHead}{output_prefix}3.csv', index=False)
    
    print(f"Successfully split {input_file} into three files:")
    print(f"- {fileNameHead}{output_prefix}1.csv ({len(df1)} rows)")
    print(f"- {fileNameHead}{output_prefix}2.csv ({len(df2)} rows)")
    print(f"- {fileNameHead}{output_prefix}3.csv ({len(df3)} rows)")

# Usage example

# Input and output file paths
input_file = 'wls_day-01.csv'

#output_file1 = 'wls_day-02_Part1.csv'  
#output_file2 = 'wls_day-02_Part2.csv'
#output_file3 = 'wls_day-02_Part3.csv'

split_csv_into_three(input_file,'wls_day-01', 'Part')

In [None]:
# Divide the datafile into three parts sequentially
# Fail to perform on the cloud as well but can be complete at Laura's Mac

import pandas as pd

# Read the original CSV file
df = pd.read_csv('wls_day-01.csv')

# Calculate the size of each part
part_size = len(df) // 3

# Split into three parts
part1 = df.iloc[:part_size]
part2 = df.iloc[part_size:2*part_size]
part3 = df.iloc[2*part_size:]

# Save to new CSV files
part1.to_csv('wls_day-01part1.csv', index=False)
part2.to_csv('wls_day-01part2.csv', index=False)
part3.to_csv('wls_day-01part3.csv', index=False)

In [None]:
# get info using pandas
# still failed
import pandas as pd

df = pd.read_csv('wls_day-01.csv')
print(df.shape)
print(df.describe())
print(df.dtypes)
print(df.head())
print(df.isnull().sum())

In [None]:
# get info using dataScience package
wls_day1 = Table().read_table('wls_day-01_10.csv')
print(wls_day1.num_rows)
wls_day1.show()
