In [1]:
import os
import re
import json

In [2]:
# Input folder that is scanned for .txt files, and output folder that receives
# one JSON file per input file.
# NOTE(review): both are absolute paths on one particular machine; adjust them
# (or derive them from a configurable base directory) before running elsewhere.
directory_path = 'C:/Users/spdlq/Documents/my_code/ttp_autogpt/python_ground/2022_text'
output_path = 'C:/Users/spdlq/Documents/my_code/ttp_autogpt/python_ground/output'

In [3]:
def extract_references(content):
    """Return every CVE identifier mentioned in ``content``, upper-cased.

    Matching is case-insensitive, so ``cve-2022-30190`` is reported as
    ``CVE-2022-30190``. Duplicates are kept and results follow the order in
    which they appear in the text.

    Args:
        content: Text to scan.

    Returns:
        list[str]: Upper-cased CVE identifiers, possibly empty.
    """
    # The sequence part of a CVE ID is "four or more digits"; capping it at
    # five would silently truncate longer identifiers to a different CVE.
    reference_name_pattern = re.compile(r'CVE-\d{4}-\d{4,}', re.IGNORECASE)
    return [match.upper() for match in reference_name_pattern.findall(content)]

def extract_filenames(content):
    """Collect tokens shaped like ``name.ext`` from ``content``.

    A token qualifies when a run of word characters is followed by a dot and
    two to four ASCII letters that end on a word boundary. Matches are
    returned in order of appearance, duplicates included.

    Args:
        content: Text to scan.

    Returns:
        list[str]: The matched tokens, possibly empty.
    """
    name_matcher = re.compile(r'\b\w+\.[a-zA-Z]{2,4}\b')
    return [hit.group(0) for hit in name_matcher.finditer(content)]

def extract_hashes(content):
    """Find standalone hexadecimal strings of MD5, SHA-1 and SHA-256 length.

    Each pattern is anchored on word boundaries, so only hex runs of exactly
    32, 40 or 64 characters are reported under the respective key.

    Args:
        content: Text to scan.

    Returns:
        dict[str, list[str]]: Keys ``MD5_Hashes``, ``SHA1_Hashes`` and
        ``SHA256_Hashes`` (in that order), each mapping to the matches in
        order of appearance.
    """
    labelled_patterns = (
        ('MD5_Hashes', r'\b[0-9a-fA-F]{32}\b'),
        ('SHA1_Hashes', r'\b[0-9a-fA-F]{40}\b'),
        ('SHA256_Hashes', r'\b[0-9a-fA-F]{64}\b'),
    )
    found = {}
    for label, expression in labelled_patterns:
        found[label] = re.compile(expression).findall(content)
    return found

def extract_registry_entries(content):
    """Return Windows registry paths (``HK...\\...``) found in ``content``.

    A match starts at ``HK`` followed by upper-case letters/underscores and a
    backslash, and runs until the next double quote or the end of the line.

    Args:
        content: Text to scan.

    Returns:
        list[str]: Registry paths in order of appearance, possibly empty.
    """
    # Line breaks are excluded along with the double quote: without that, a
    # key mentioned in unquoted prose would swallow every following line up
    # to the next '"' (or the end of the document).
    registry_pattern = re.compile(r'HK[A-Z_]+\\[^"\r\n]+')
    return registry_pattern.findall(content)

def extract_urls(content):
    """Return the ``http://`` / ``https://`` URLs found in ``content``.

    Args:
        content: Text to scan.

    Returns:
        list[str]: Matched URLs in order of appearance, duplicates included.
    """
    # Inside the third character class, ``$-_`` is a range (from '$' to '_'),
    # which is what admits '/', ':', '?' and '=' into a match.
    url_matcher = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return [hit.group(0) for hit in url_matcher.finditer(content)]

def categorize_content(content, rule_id, filename):
    """Build the rule record for one document.

    Args:
        content: Full text of the document.
        rule_id: Identifier stored under ``'rule_id'``.
        filename: Base name of the source file; its extension is dropped to
            form the record's ``'name'``.

    Returns:
        dict: ``rule_id``, ``name``, a constant ``description`` of ``"-"``,
        the extracted ``references``, ``File_Names``, ``MD5_Hashes``,
        ``SHA1_Hashes``, ``SHA256_Hashes``, ``Registry_Entries`` and ``URLs``
        lists, and a constant ``weight`` of ``0.0``.
    """
    # splitext removes whatever extension is present instead of blindly
    # cutting four characters, so names without a ".xyz"-sized suffix are not
    # mangled and the result agrees with how the output file name is derived.
    rule_name = os.path.splitext(filename)[0]
    return {
        'rule_id': rule_id,
        'name': rule_name,
        'description': "-",
        'references': extract_references(content),
        'File_Names': extract_filenames(content),
        **extract_hashes(content),
        'Registry_Entries': extract_registry_entries(content),
        'URLs': extract_urls(content),
        'weight': 0.0
    }

In [4]:
def get_text_files_from_directory(directory_path):
    """Return the paths of the ``.txt`` files in ``directory_path``, sorted by name.

    Args:
        directory_path: Directory to list (not searched recursively).

    Returns:
        list[str]: ``directory_path`` joined with each matching entry name,
        in ascending name order. Empty when nothing matches.
    """
    text_files = []
    # os.listdir returns entries in arbitrary order; sorting makes any
    # numbering or "first occurrence wins" logic built on this list
    # reproducible between runs and machines.
    for entry in sorted(os.listdir(directory_path)):
        if not entry.endswith('.txt'):
            continue
        candidate = os.path.join(directory_path, entry)
        # Skip directories that merely happen to be named "*.txt"; opening
        # them as files would fail later.
        if os.path.isfile(candidate):
            text_files.append(candidate)
    return text_files

def process_text_file(file_path, unique_sets, rule_id):
    """Read one text file and categorize it, keeping only CVE references not seen before.

    NOTE(review): a later cell defines ``process_text_file`` again with
    de-duplication for every indicator type; once that cell has run, this
    version is shadowed. Consider keeping only one of the two definitions.

    Args:
        file_path: Path of the text file to read (UTF-8, undecodable bytes
            are ignored).
        unique_sets: Dict whose ``'references'`` entry is a set of references
            already reported; it is updated in place.
        rule_id: Identifier passed through to ``categorize_content``.

    Returns:
        dict: The record from ``categorize_content`` with ``'references'``
        reduced to values not previously in ``unique_sets['references']``.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()

    filename = os.path.basename(file_path)
    categorized_content = categorize_content(content, rule_id, filename)

    # Keep first-appearance order instead of round-tripping through set(),
    # whose iteration order for strings varies between interpreter runs.
    seen_references = unique_sets['references']
    new_references = []
    for reference in categorized_content['references']:
        if reference not in seen_references:
            seen_references.add(reference)
            new_references.append(reference)
    categorized_content['references'] = new_references
    return categorized_content

def write_to_json(output_path, content):
    """Serialize ``content`` as JSON into the file at ``output_path``.

    The file is opened in text write mode, so an existing file at that path
    is overwritten.

    Args:
        output_path: Destination file path.
        content: JSON-serializable object to store.
    """
    with open(output_path, "w") as handle:
        json.dump(content, fp=handle)

In [5]:
# Host-name fragments of sources that should not be reported as indicators.
# process_text_file drops any extracted URL that contains one of these strings
# anywhere in it (plain substring test, not a parsed-host comparison).
whitelist_urls = [
    "www.trendmicro.com",
    "attack.mitre.org",
    "documents.trendmicro.com",
    "www.ithome.com.tw",
    "hitcon.org"
]

In [6]:
def _keep_unseen(values, seen):
    """Return items of ``values`` not yet in ``seen`` (first-seen order) and add them to ``seen``."""
    fresh = []
    for value in values:
        if value not in seen:
            seen.add(value)
            fresh.append(value)
    return fresh


def process_text_file(file_path, unique_sets, rule_id):
    """Read one text file and return its record, limited to indicators not seen in earlier files.

    Every indicator list in the record (references, file names, the three
    hash lists, registry entries, URLs) is filtered against the matching set
    in ``unique_sets`` and de-duplicated; the sets are updated in place so
    later files do not repeat the same values. URLs containing any entry of
    ``whitelist_urls`` are dropped and are not recorded as seen.

    Args:
        file_path: Path of the text file to read (UTF-8, undecodable bytes
            are ignored).
        unique_sets: Dict mapping each indicator key to the set of values
            already reported; mutated by this call.
        rule_id: Identifier passed through to ``categorize_content``.

    Returns:
        dict: The record from ``categorize_content`` with each indicator list
        reduced to new values, in order of first appearance in the file.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()

    filename = os.path.basename(file_path)
    categorized_content = categorize_content(content, rule_id, filename)

    # First-appearance order is kept (rather than list(set(...))) so the JSON
    # output is identical from run to run; set iteration order for strings
    # changes with hash randomization.
    for key in ('references', 'File_Names', 'MD5_Hashes', 'SHA1_Hashes',
                'SHA256_Hashes', 'Registry_Entries'):
        categorized_content[key] = _keep_unseen(categorized_content[key], unique_sets[key])

    # NOTE(review): the whitelist test is a substring match anywhere in the
    # URL, so a whitelisted host appearing in a path or query also suppresses
    # the URL -- confirm this is intended.
    reportable_urls = [
        url for url in categorized_content['URLs']
        if not any(whitelisted in url for whitelisted in whitelist_urls)
    ]
    categorized_content['URLs'] = _keep_unseen(reportable_urls, unique_sets['URLs'])

    return categorized_content

# Main processing function with adjustments for unique extraction

def process_all_text_files(directory_path, output_path):
    """Categorize every ``.txt`` file in ``directory_path`` and write one JSON file per input.

    Files are handled in the order returned by
    ``get_text_files_from_directory`` and numbered from 1; that number is the
    record's ``rule_id``. One ``unique_sets`` dict is shared across all files
    and handed to ``process_text_file``, which filters each record against it.

    Args:
        directory_path: Directory containing the input text files.
        output_path: Directory receiving ``<input name>.json`` for each input;
            created if it does not exist.
    """
    text_files = get_text_files_from_directory(directory_path)

    # Create the destination up front so the first write does not fail with
    # FileNotFoundError on a fresh checkout or machine.
    os.makedirs(output_path, exist_ok=True)

    unique_sets = {
        'references': set(),
        'File_Names': set(),
        'MD5_Hashes': set(),
        'SHA1_Hashes': set(),
        'SHA256_Hashes': set(),
        'Registry_Entries': set(),
        'URLs': set()
    }
    for rule_id, file_path in enumerate(text_files, start=1):
        categorized_content = process_text_file(file_path, unique_sets, rule_id)
        filename_without_extension = os.path.splitext(os.path.basename(file_path))[0]
        output_json_path = os.path.join(output_path, filename_without_extension + ".json")
        write_to_json(output_json_path, categorized_content)

In [7]:
# Run the extraction over every text file in directory_path and write the
# resulting JSON files into output_path.
process_all_text_files(directory_path, output_path)
