In [None]:
import json
import pandas as pd
import os
import datetime

# CONFIG
# Log file to split; it is read one line at a time, each line parsed as a JSON document.
# NOTE(review): the /content/drive/MyDrive prefix presumably refers to a mounted Google Drive — confirm the mount exists before running.
INPUT_FILE = "/content/drive/MyDrive/eve.json-202506030000" # Updated input file
# Directory that receives the per-hour chunk files (chunk_<index>_<YYYYmmddHHMMSS>.json).
OUTPUT_DIR = "/content/drive/MyDrive/processed_chunks_by_time_json" # New output directory for JSON
# TIME_CHUNK_SIZE = "1H"  # Kept for clarity only: the chunk size is not configurable, chunks always align with hour boundaries

# Create the output directory up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def extract_features(log_entry):
    """Project a parsed log record onto a fixed set of fields.

    Args:
        log_entry: Object exposing ``.get(key)`` (e.g. a dict produced by
            ``json.loads``).

    Returns:
        dict: One key per selected field, in the order listed below. A field
        that is absent from ``log_entry`` maps to ``None``.
    """
    # Customize this based on your log structure
    wanted_fields = (
        'timestamp', 'flow_id', 'src_ip', 'src_port',
        'dest_ip', 'dest_port', 'proto', 'event_type', 'flow',
    )
    return {field: log_entry.get(field) for field in wanted_fields}

def _write_chunk(output_dir, chunk_index, chunk_hour, entries):
    """Write one hourly chunk to ``output_dir`` as a JSON array and return its path."""
    output_filename = f"{output_dir}/chunk_{chunk_index}_{chunk_hour.strftime('%Y%m%d%H%M%S')}.json"
    with open(output_filename, 'w', encoding='utf-8') as outfile:
        json.dump(entries, outfile, indent=2)  # Use indent for readability
    return output_filename


def process_large_json_by_time(input_file, output_dir=None):
    """Split a line-delimited JSON log into one JSON file per hour.

    The input is streamed line by line. Each line is parsed as JSON, its
    ``timestamp`` field is truncated to the hour, and the record (reduced by
    ``extract_features``) is appended to the chunk for that hour. When a
    record with a later hour arrives, the accumulated chunk is written to
    ``chunk_<index>_<YYYYmmddHHMMSS>.json`` and a new chunk is started.
    Records whose hour is earlier than the current chunk's hour stay in the
    current chunk.

    Only one chunk is held in memory at a time, but a whole hour of records
    is buffered before it is written.

    Args:
        input_file: Path to the log file, one JSON document per line.
        output_dir: Directory for the chunk files. Defaults to the
            module-level ``OUTPUT_DIR``. Created if it does not exist.

    Returns:
        None. Progress and skipped lines are reported with ``print``.

    Raises:
        OSError: If the input cannot be opened or a chunk cannot be written
            (for example when the target device is full). Write errors are
            deliberately not swallowed.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR
    os.makedirs(output_dir, exist_ok=True)

    chunk = []
    current_chunk_hour = None
    chunk_count = 0

    with open(input_file, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            # Only per-line parsing lives inside the try, so a failure while
            # writing a chunk is never misreported as a bad input line.
            try:
                log_entry = json.loads(line)
                if not isinstance(log_entry, dict):
                    raise ValueError("log entry is not a JSON object")

                # A missing timestamp would reach strptime as None and raise
                # TypeError, which is not caught below; reject it explicitly
                # so the line is skipped like any other malformed line.
                timestamp_str = log_entry.get('timestamp')
                if not isinstance(timestamp_str, str):
                    raise ValueError("missing or non-string 'timestamp' field")

                # Parse the timestamp string into a timezone-aware datetime
                timestamp = datetime.datetime.strptime(timestamp_str, '%Y-%m-%dT%H:%M:%S.%f%z')

                # Truncate to the start of the hour this record belongs to
                current_hour = timestamp.replace(minute=0, second=0, microsecond=0)

                # To keep only some event types, filter on
                # log_entry.get('event_type') here before extracting.
                features = extract_features(log_entry)
            except (json.JSONDecodeError, KeyError, ValueError) as e:
                print(f"Error processing line {line_number}: {e}")
                continue  # Skip malformed or missing timestamp lines

            if current_chunk_hour is None:
                current_chunk_hour = current_hour
            elif current_hour > current_chunk_hour:
                # Hour boundary crossed: flush the finished chunk, start a new one
                output_filename = _write_chunk(output_dir, chunk_count, current_chunk_hour, chunk)
                print(f"Saved {output_filename}")
                chunk = []
                chunk_count += 1
                current_chunk_hour = current_hour

            chunk.append(features)

    # Save any remaining logs as JSON
    if chunk:
        output_filename = _write_chunk(output_dir, chunk_count, current_chunk_hour, chunk)
        print(f"Saved final {output_filename}")

if __name__ == "__main__":
    # Run the split on the configured input file. Only the input path is
    # passed; the function takes its output directory from OUTPUT_DIR.
    process_large_json_by_time(INPUT_FILE)

Saved /content/drive/MyDrive/processed_chunks_by_time_json/chunk_0_20250601160000.json
Saved /content/drive/MyDrive/processed_chunks_by_time_json/chunk_1_20250601170000.json
Saved /content/drive/MyDrive/processed_chunks_by_time_json/chunk_2_20250601180000.json
Saved /content/drive/MyDrive/processed_chunks_by_time_json/chunk_3_20250601190000.json
Saved /content/drive/MyDrive/processed_chunks_by_time_json/chunk_4_20250601200000.json
Saved /content/drive/MyDrive/processed_chunks_by_time_json/chunk_5_20250601210000.json
Saved /content/drive/MyDrive/processed_chunks_by_time_json/chunk_6_20250601220000.json
Saved /content/drive/MyDrive/processed_chunks_by_time_json/chunk_7_20250601230000.json
Saved /content/drive/MyDrive/processed_chunks_by_time_json/chunk_8_20250602000000.json
Error processing line 118635984: Expecting ':' delimiter: line 1 column 97 (char 96)
Saved /content/drive/MyDrive/processed_chunks_by_time_json/chunk_9_20250602010000.json
Saved /content/drive/MyDrive/processed_chunks

OSError: [Errno 28] No space left on device

In [None]:
import shutil
from google.colab import files
# Zip the directory the chunks are written to. root_dir must be OUTPUT_DIR
# (an absolute path); a bare relative directory name would be resolved
# against the current working directory, which is not where the chunks live.
archive_path = shutil.make_archive("/content/processed_chunks_by_time_json", 'zip', OUTPUT_DIR)
# make_archive returns the path of the archive it created, so reuse it
# instead of spelling the ".zip" filename out a second time.
files.download(archive_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>