In [1]:
import json
from collections import defaultdict
from datetime import datetime, timedelta
import csv
import re
import hashlib

In [3]:
#FUNZIONI PER DARKTRACEROW

def get_keys_from_record(record):
    """Return the top-level keys of ``record`` as a new set.

    Nested structures are not inspected; only the keys of ``record`` itself
    are collected.
    """
    return {key for key in record.keys()}

def count_record(path):
    """Print the number of lines contained in the file at ``path``.

    Every line is counted (blank ones included). The total is printed, not
    returned.
    """
    with open(path, 'r') as handle:
        total = sum(1 for _ in handle)
    print(total)

def count_attack(path):
    """Print how many records of a JSON Lines file are labelled as attacks.

    Each non-blank line of ``path`` is parsed as one JSON object; a record
    counts as an attack when its ``'label'`` value equals 1.

    Blank lines are skipped (as ``read_json_file`` does) and records without a
    ``'label'`` key are treated as non-attacks instead of raising ``KeyError``.
    The count is printed, not returned.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    n_attacks = 0
    with open(path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # A trailing or stray empty line is not a record.
                continue
            record = json.loads(line)
            if record.get('label') == 1:
                n_attacks += 1
    print(n_attacks)


def label_and_filter(
    path,
    new_path,
    attack_source_ip='1ccdb898890cce841210e3fb0bcc3e7974f069ca89da96625e7b7699bf277165',
    attack_dest_ip='f4c36b35451f863e37f34989cca218a6e7c40d22f699aafeef3a6d7ae76a75a2',
):
    """Label every record of a JSON Lines file and write the result.

    A record gets ``label = 1`` when its ``'source_ip'`` equals
    ``attack_source_ip`` and its ``'dest_ip'`` equals ``attack_dest_ip``;
    every other record gets ``label = 0``. Despite the function name, no
    record is dropped: all input records are written out.

    Args:
        path: input file, one JSON object per line (blank lines are skipped).
        new_path: output file, overwritten. It may be the same as ``path``
            because the whole input is read before anything is written.
        attack_source_ip: value of ``'source_ip'`` identifying the attacker.
            The default is the value previously hard-coded here (presumably a
            SHA-256 digest of the address -- verify against the dataset).
        attack_dest_ip: value of ``'dest_ip'`` identifying the target; same
            remark as for ``attack_source_ip``.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    new_dataset = []
    with open(path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            is_attack = (
                record.get('source_ip') == attack_source_ip
                and record.get('dest_ip') == attack_dest_ip
            )
            record['label'] = 1 if is_attack else 0
            new_dataset.append(record)

    # Written only after the input is closed so that path == new_path is safe.
    with open(new_path, 'w') as file:
        for record in new_dataset:
            file.write(json.dumps(record) + '\n')

def common_keys(path):
    """Print which top-level keys are shared by every record of a JSON Lines file.

    Three lines are printed: the keys present in all records, the keys missing
    from at least one record, and for the latter the number of records that
    contain each key (sorted by descending count).

    Blank lines are skipped. A file without any record reports empty
    collections instead of raising ``TypeError``.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    shared_keys = None  # None until the first record has been seen
    all_keys = set()
    key_occurrences = defaultdict(int)

    with open(path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            record_keys = set(json.loads(line).keys())
            all_keys |= record_keys
            for key in record_keys:
                key_occurrences[key] += 1
            if shared_keys is None:
                shared_keys = record_keys
            else:
                shared_keys &= record_keys

    if shared_keys is None:
        # No record at all: nothing is common, nothing is uncommon.
        shared_keys = set()

    non_common_keys = all_keys - shared_keys

    non_common_key_counts = {key: key_occurrences[key] for key in non_common_keys}

    sorted_non_common_key_counts = dict(
        sorted(non_common_key_counts.items(), key=lambda item: item[1], reverse=True)
    )

    print("Le chiavi comuni sono:", shared_keys)
    print("Le chiavi non comuni sono:", non_common_keys)
    print("Numero di occorrenze delle chiavi non comuni:", sorted_non_common_key_counts)

In [4]:
# Input file (`path`) and output file (`new_path`) used by the
# label_and_filter(path, new_path) call in a later cell.
new_path='../../../../enea/dati_simulazione/ssh_04_10_label.json'
path='../../../../enea/dati_simulazione/ssh_04_10.json'
# Print the number of lines in the input file.
count_record(path)

119278


In [19]:
# Print how many records in `new_path` have label == 1.
# NOTE(review): `new_path` appears to be produced by
# label_and_filter(path, new_path), which sits in a later cell; on a fresh
# kernel that cell has to run before this one.
count_attack(new_path)

3633


In [4]:
# Print the keys shared by every record of `path` and, for the remaining
# keys, how many records contain each of them.
common_keys(path)

Le chiavi comuni sono: {'epochdate', '@host', '@type', 'uid', 'dest_ip', 'source_port', 'source_ip', '@timestamp', 'status_guess', 'dest_port'}
Le chiavi non comuni sono: {'direction', 'remote_location_latitude', 'host_key_alg', 'server', 'cipher_alg', 'mac_alg', 'host_key', 'remote_location_longitude', 'client', 'auth_attempts', 'auth_success', 'compression_alg', 'remote_location_country_code', 'version', 'kex_alg'}
Numero di occorrenze delle chiavi non comuni: {'server': 108651, 'client': 102802, 'version': 102802, 'host_key_alg': 96060, 'cipher_alg': 96060, 'mac_alg': 96060, 'compression_alg': 96060, 'kex_alg': 96060, 'host_key': 91764, 'direction': 84533, 'auth_attempts': 40638, 'auth_success': 40638, 'remote_location_latitude': 19338, 'remote_location_longitude': 19338, 'remote_location_country_code': 19338}


In [11]:
# Write a labelled copy of `path` to `new_path`.
# NOTE(review): label_and_filter as defined above prints nothing, so the
# number saved as this cell's output presumably comes from an earlier
# version of the function; re-run to refresh the output.
label_and_filter(path, new_path)

3633


In [15]:
#FUNZIONI PER DARKTRACEAI

def get_keys_from_record_ai(record, parent_key=''):
    """Collect the dotted paths of every key found anywhere inside ``record``.

    Dictionaries contribute ``"<parent>.<key>"`` (or just ``<key>`` when there
    is no parent); lists are transparent, i.e. their elements inherit the path
    of the list itself without an index component. Scalars contribute nothing.

    Returns:
        A set with one entry per distinct key path.
    """
    collected = set()
    # Explicit work list of (value, path-prefix) pairs instead of recursion.
    pending = [(record, parent_key)]
    while pending:
        node, prefix = pending.pop()
        if isinstance(node, dict):
            for name, child in node.items():
                path = f"{prefix}.{name}" if prefix else name
                collected.add(path)
                pending.append((child, path))
        elif isinstance(node, list):
            list_prefix = f"{prefix}"
            pending.extend((element, list_prefix) for element in node)
    return collected

def find_all_ips(data, exclude_ips):
    """Return the IPv4 addresses that appear as whole string values in ``data``.

    ``data`` is walked recursively through dictionaries (values only) and
    lists. A string is kept only when the *entire* string is a dotted-quad
    whose four octets are all in the range 0-255, so ``"1.2.3.4:80"`` and
    ``"999.1.1.1"`` are both ignored.

    Args:
        data: arbitrarily nested structure of dicts, lists and scalars.
        exclude_ips: iterable of addresses to remove from the result (for
            example the ones already stored in ``record["related"]["ip"]``).
            ``None`` is accepted and means "exclude nothing".

    Returns:
        A set of address strings.
    """
    ip_pattern = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')
    found = set()

    def is_ipv4(text):
        # The regex only checks the shape; the octet range is checked here.
        return (
            ip_pattern.fullmatch(text) is not None
            and all(int(octet) <= 255 for octet in text.split('.'))
        )

    def extract_ips(value):
        if isinstance(value, dict):
            for child in value.values():
                extract_ips(child)
        elif isinstance(value, list):
            for item in value:
                extract_ips(item)
        elif isinstance(value, str) and is_ipv4(value):
            found.add(value)

    extract_ips(data)
    # difference_update accepts any iterable, not just a set.
    found.difference_update(exclude_ips or ())
    return found

def read_json_file(file_path):
    """Load a JSON Lines file into a list, one parsed value per non-blank line.

    Lines that are empty after stripping are ignored. A line that is not valid
    JSON is reported on stdout and skipped; it does not abort the load.
    """
    records = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text:
                continue
            try:
                records.append(json.loads(text))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line: {text} - {e}")
    return records

def write_json_file(data, file_path):
    """Serialise every item of ``data`` to ``file_path``, one JSON value per line.

    The destination is opened in write mode, so existing content is replaced.
    """
    with open(file_path, 'w') as sink:
        for entry in data:
            json.dump(entry, sink)
            sink.write('\n')

def add_ip_to_record(source_path, dest_path):
    """Copy a JSON Lines file, extending each record's ``related.ip`` list.

    For every record read from ``source_path``, all IPv4 addresses found
    anywhere in the record (via ``find_all_ips``) that are not already listed
    in ``record["related"]["ip"]`` are appended to that list. The updated
    records are written to ``dest_path``.

    Records lacking ``"related"`` or ``"related"["ip"]`` no longer raise
    ``KeyError``: the missing containers are created, but only when there is
    at least one address to store. New addresses are appended in sorted order
    so repeated runs produce the same file.

    NOTE(review): assumes ``related.ip``, when present, is a list of strings
    -- confirm against the data.
    """
    data = read_json_file(source_path)

    for record in data:
        known_ips = record.get("related", {}).get("ip", [])
        new_ips = find_all_ips(record, set(known_ips))
        if not new_ips:
            continue
        # Create the containers lazily so untouched records stay unchanged.
        target = record.setdefault("related", {}).setdefault("ip", [])
        # Set iteration order is arbitrary; sort for a reproducible output.
        for ip in sorted(new_ips):
            print(f"Nuovo IP trovato: {ip}")
            target.append(ip)

    write_json_file(data, dest_path)
    print(f"Nuovo file JSON creato: {dest_path}")


def calculate_sha256(data):
    """Return the hexadecimal SHA-256 digest of ``data``.

    A ``str`` argument is encoded with the default codec of ``str.encode``
    first; any other argument is handed to ``hashlib.sha256`` unchanged.
    """
    payload = data.encode() if isinstance(data, str) else data
    return hashlib.sha256(payload).hexdigest()

def find_with_dest_ip(timestamp_start,timestamp_end,source_ip,dest_ip,darktrace_row,corresponding_records):
    """Label and collect the raw rows matching a time window and an IP pair.

    A row matches when its ``'@timestamp'`` lies in
    ``[timestamp_start, timestamp_end]`` (compared with ``<=``/``>=``) and its
    ``'source_ip'`` / ``'dest_ip'`` equal the SHA-256 hex digests of
    ``source_ip`` / ``dest_ip``.

    Side effects on ``darktrace_row``: matching rows get ``label = 1``;
    non-matching rows get ``label = 0`` only if they have no label yet, so a
    label assigned by an earlier call is preserved.

    Args:
        timestamp_start, timestamp_end: inclusive bounds of the window.
            NOTE(review): presumably strings in the same format as the rows'
            ``'@timestamp'`` -- confirm, the comparison is a plain ``<=``.
        source_ip, dest_ip: clear-text addresses, hashed before comparison.
        darktrace_row: iterable of row dictionaries (mutated in place).
        corresponding_records: list that receives the matching rows.

    Returns:
        ``corresponding_records``, the same list object that was passed in.
    """
    # The digests do not depend on the row, so compute them once.
    source_hash = calculate_sha256(source_ip)
    dest_hash = calculate_sha256(dest_ip)

    for row_record in darktrace_row:
        timestamp = row_record.get('@timestamp')
        is_match = (
            # A row without a timestamp cannot fall inside the window.
            timestamp is not None
            and timestamp_start <= timestamp <= timestamp_end
            and row_record.get('source_ip') == source_hash
            and row_record.get('dest_ip') == dest_hash
        )
        if is_match:
            corresponding_records.append(row_record)
            row_record['label'] = 1
        else:
            row_record.setdefault('label', 0)
    return corresponding_records

def find_without_dest_ip(timestamp_start,timestamp_end,source_ip,darktrace_row,corresponding_records):
    """Label and collect the raw rows matching a time window and a source IP.

    A row matches when its ``'@timestamp'`` lies in
    ``[timestamp_start, timestamp_end]`` (compared with ``<=``/``>=``) and its
    ``'source_ip'`` equals the SHA-256 hex digest of ``source_ip``. The
    destination address is not considered.

    Side effects on ``darktrace_row``: matching rows get ``label = 1``;
    non-matching rows get ``label = 0`` only if they have no label yet, so a
    label assigned by an earlier call is preserved.

    Args:
        timestamp_start, timestamp_end: inclusive bounds of the window.
            NOTE(review): presumably strings in the same format as the rows'
            ``'@timestamp'`` -- confirm, the comparison is a plain ``<=``.
        source_ip: clear-text address, hashed before comparison.
        darktrace_row: iterable of row dictionaries (mutated in place).
        corresponding_records: list that receives the matching rows.

    Returns:
        ``corresponding_records``, the same list object that was passed in.
    """
    # The digest does not depend on the row, so compute it once.
    source_hash = calculate_sha256(source_ip)

    for row_record in darktrace_row:
        timestamp = row_record.get('@timestamp')
        is_match = (
            # A row without a timestamp cannot fall inside the window.
            timestamp is not None
            and timestamp_start <= timestamp <= timestamp_end
            and row_record.get('source_ip') == source_hash
        )
        if is_match:
            corresponding_records.append(row_record)
            row_record['label'] = 1
        else:
            row_record.setdefault('label', 0)

    return corresponding_records


def create_dataset(file,path):
    """Write the records in ``file`` to ``path`` as JSON Lines.

    ``file`` is an iterable of JSON-serialisable records (not a file object);
    each one becomes a single line. ``path`` is overwritten.
    """
    with open(path, 'w') as outfile:
        outfile.writelines(json.dumps(record) + '\n' for record in file)
   

def find_corresponding_raw_record(darktrace_ai_analyst, darktrace_row, new_path):
    """Match AI-analyst records to raw rows, label the rows and save them.

    For each AI-analyst record the first entry of ``record["related"]["ip"]``
    is used as source address and every further entry as a candidate
    destination address. Raw rows inside the record's time window are looked
    up with ``find_with_dest_ip`` (one call per destination) or, when only the
    source address is known, with ``find_without_dest_ip``. Those helpers set
    ``label`` on the rows of ``darktrace_row`` in place.

    Afterwards every row still lacking a label gets ``label = 0`` and all rows
    are written to ``new_path`` through ``create_dataset``. A summary of the
    matches is printed.

    Args:
        darktrace_ai_analyst: sequence of AI-analyst records; each needs
            ``record["event"]["start"][0]`` and ``record["event"]["end"][0]``.
            Records with no related IP are skipped.
        darktrace_row: list of raw row dictionaries (mutated in place).
        new_path: destination of the labelled rows (overwritten).

    Returns:
        Dict mapping the index of each AI-analyst record that had at least one
        match to the *list* of its matching rows.
    """
    corresponding_records = {}

    for index, record in enumerate(darktrace_ai_analyst):
        related_ips = record.get("related", {}).get("ip", [])
        if not related_ips:
            # Without a source address there is nothing to match against.
            continue

        source_ip = related_ips[0]
        # The last five characters of both timestamps are dropped.
        # NOTE(review): presumably a sub-second/zone suffix such as ".000Z" so
        # the strings line up with the rows' '@timestamp' -- TODO confirm.
        timestamp_start = record["event"]["start"][0][:-5]
        timestamp_end = record["event"]["end"][0][:-5]

        matches = []
        if len(related_ips) > 1:
            for dest_ip in related_ips[1:]:
                matches.extend(
                    find_with_dest_ip(timestamp_start, timestamp_end, source_ip, dest_ip, darktrace_row, [])
                )
        else:
            # Keep every match: assigning a single row here would make the
            # value a dict and the count printed below meaningless.
            matches.extend(
                find_without_dest_ip(timestamp_start, timestamp_end, source_ip, darktrace_row, [])
            )

        if matches:
            corresponding_records[index] = matches

    for index, record in corresponding_records.items():
        print(f"Record di darktrace_ai analizzato {index}:")
        print("  Analizzato:", darktrace_ai_analyst[index])
        print("  darktrace_row corrispondente:", record)
        print("  numero di darktrace_row corrispondente:", len(record))
        print()

    # Rows are only labelled by the helpers; if none ran, label them here so
    # the written dataset always carries a 'label' field.
    for row_record in darktrace_row:
        row_record.setdefault('label', 0)

    create_dataset(darktrace_row, new_path)

    return corresponding_records

def create_record(path):
    """Load a JSON Lines file into a list of parsed records.

    Blank lines (for instance a trailing newline at the end of the file) are
    skipped, matching ``read_json_file``. Unlike that function, malformed JSON
    is not swallowed.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    data = []
    with open(path, 'r') as file:
        for line in file:
            line = line.strip()
            if line:
                data.append(json.loads(line))
    return data


In [32]:
# AI-analyst input file and destination of the copy enriched with extra IPs.
# NOTE(review): this rebinds `path` / `new_path`, which earlier cells use for
# the ssh_04_10 files.
path='../../../../enea/dati_simulazione/train_ssh/02_10_ai.json'
new_path='../../../../enea/dati_simulazione/train_ssh/02_10_ai_con_ip.json'
# Alternative dataset (04_10), currently disabled.
#path='../../../../enea/dati_simulazione/04_10_ai.json'
#new_path='../../../../enea/dati_simulazione/04_10_ai_con_ip.json'
# NOTE(review): `data` is not passed to add_ip_to_record, which reads `path`
# again on its own.
data=read_json_file(path)
add_ip_to_record(path,new_path)

Nuovo IP trovato: 13.107.138.10
Nuovo IP trovato: 69.30.89.30
Nuovo IP trovato: 89.44.168.16
Nuovo IP trovato: 89.44.168.58
Nuovo IP trovato: 185.206.26.132
Nuovo IP trovato: 185.206.26.17
Nuovo IP trovato: 94.24.37.43
Nuovo IP trovato: 89.44.168.196
Nuovo IP trovato: 89.44.168.212
Nuovo IP trovato: 185.206.24.31
Nuovo IP trovato: 89.44.168.221
Nuovo IP trovato: 89.44.168.240
Nuovo IP trovato: 89.44.168.242
Nuovo IP trovato: 89.44.168.229
Nuovo IP trovato: 185.206.24.30
Nuovo IP trovato: 89.44.168.213
Nuovo IP trovato: 188.79.136.14
Nuovo IP trovato: 79.18.24.152
Nuovo IP trovato: 95.252.72.189
Nuovo IP trovato: 138.22.179.152
Nuovo IP trovato: 198.244.186.241
Nuovo IP trovato: 198.244.186.243
Nuovo IP trovato: 141.95.49.174
Nuovo IP trovato: 93.33.57.83
Nuovo IP trovato: 93.36.119.216
Nuovo IP trovato: 93.40.53.41
Nuovo IP trovato: 141.95.49.174
Nuovo IP trovato: 57.128.133.58
Nuovo IP trovato: 37.134.57.241
Nuovo IP trovato: 79.49.184.106
Nuovo IP trovato: 95.252.72.189
Nuovo IP trov

In [33]:
# Raw rows to be labelled.
row_path='../../../../enea/dati_simulazione/train_ssh/ssh_02_10.json'
#row_path='../../../../enea/dati_simulazione/ssh_04_10_label.json'
# Load the enriched AI-analyst records (written to `new_path` by the previous
# cell) and the raw rows.
darktrace_ai_analyst=create_record(new_path)
darktrace_row=create_record(row_path)
# NOTE(review): `row_path` is also passed as the output path, so the labelled
# rows overwrite the input file.
corresponding_records = find_corresponding_raw_record(darktrace_ai_analyst, darktrace_row,row_path)


In [34]:
# Print how many rows in `row_path` have label == 1 after the labelling cell.
# NOTE(review): the saved output is 0, i.e. no raw row was labelled as an
# attack; worth checking the timestamp format and IP hashing assumptions.
count_attack(row_path)

0
