In [1]:
import os
import re
import json

In [2]:
def categorize_content(content):
    """Extract indicator-like strings from a block of text.

    Every category is collected independently with its own regular
    expression, so matches are returned in order of appearance and
    repeated occurrences are kept.

    Args:
        content: The text to scan (a ``str``).

    Returns:
        A dict with the keys ``Reference_Name``, ``File_Names``,
        ``MD5_Hashes``, ``SHA1_Hashes``, ``SHA256_Hashes``,
        ``Registry_Entries`` and ``URLs``. Each value is a list of the
        matched strings, empty when nothing matched.
    """
    # Tokens that start with the literal prefix "CVC" and run up to whitespace.
    reference_name_pattern = re.compile(r'\bCVC[^\s]*\b')

    # Any "word.word" token. This is deliberately loose, so host names and
    # dotted numbers such as "1.2" are reported as well.
    filename_pattern = re.compile(r'\b\w+\.\w+\b')

    # Hex digests. The word boundaries on both sides keep a longer digest
    # from also being reported as a shorter one.
    md5_pattern = re.compile(r'\b[0-9a-fA-F]{32}\b')
    sha1_pattern = re.compile(r'\bb[0-9a-fA-F]{40}\b'.replace(r'\bb', r'\b', 1))
    sha256_pattern = re.compile(r'\b[0-9a-fA-F]{64}\b')

    # Registry paths ("HK..." hive name, backslash, remainder). The remainder
    # stops at a double quote or at a line break; without the line-break
    # exclusion a single match would run across every following line until
    # the next quote or the end of the text.
    registry_pattern = re.compile(r'HK[A-Z_]+\\[^"\r\n]+')

    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

    return {
        'Reference_Name': reference_name_pattern.findall(content),
        'File_Names': filename_pattern.findall(content),
        'MD5_Hashes': md5_pattern.findall(content),
        'SHA1_Hashes': sha1_pattern.findall(content),
        'SHA256_Hashes': sha256_pattern.findall(content),
        'Registry_Entries': registry_pattern.findall(content),
        'URLs': url_pattern.findall(content),
    }

In [None]:
# One independent, initially empty "already seen" set per indicator category.
# They are shared across every processed file, so each value is reported only
# the first time it is encountered.
(
    unique_reference_names,
    unique_filenames,
    unique_md5_hashes,
    unique_sha1_hashes,
    unique_sha256_hashes,
    unique_registry_entries,
    unique_urls,
) = (set() for _ in range(7))

In [3]:
# Directory that holds the .txt files to scan; edit this to point at your own data.
directory_path = '/Users/dong-ju/Documents/My_code/ttp_autogpt/files'

# Full paths of every .txt file in that directory. Sorted because os.listdir
# returns entries in arbitrary order, and the cross-file de-duplication below
# credits a value to whichever file is processed first.
text_files = sorted(
    os.path.join(directory_path, entry)
    for entry in os.listdir(directory_path)
    if entry.endswith('.txt')
)

# Output key -> set of values already reported for that category.
seen_by_category = {
    'Reference_Name': unique_reference_names,
    'File_Names': unique_filenames,
    'MD5_Hashes': unique_md5_hashes,
    'SHA1_Hashes': unique_sha1_hashes,
    'SHA256_Hashes': unique_sha256_hashes,
    'Registry_Entries': unique_registry_entries,
    'URLs': unique_urls,
}

# Empty the tracking sets in place so that re-running this cell gives the same
# result as a fresh run; otherwise every value would already count as "seen"
# and all lists would come out empty.
for seen in seen_by_category.values():
    seen.clear()

# Dictionary to store categorized content for all files
all_files_categorized_content = {}

for file_path in text_files:
    # Undecodable bytes are dropped rather than raising.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()

    categorized_content = categorize_content(content)
    filename = os.path.basename(file_path)
    # Every path in text_files ends with '.txt', so dropping the last four
    # characters removes exactly that extension.
    formatted_filename = 'TTP_files: ' + filename[:-4]

    # Keep only the first occurrence of each value, both within this file and
    # across all files processed so far, preserving order of appearance.
    for category, seen in seen_by_category.items():
        first_occurrences = []
        for item in categorized_content[category]:
            if item not in seen:
                seen.add(item)
                first_occurrences.append(item)
        categorized_content[category] = first_occurrences

    # Add the cleaned data to the output dictionary
    all_files_categorized_content[formatted_filename] = categorized_content

In [4]:
# Save the entire dictionary as a single JSON file
output_json_path = "/Users/dong-ju/Documents/My_code/ttp_autogpt/files/output/output.json"

# open(..., "w") does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs(os.path.dirname(output_json_path), exist_ok=True)

with open(output_json_path, "w", encoding="utf-8") as outfile:
    json.dump(all_files_categorized_content, outfile)