In [None]:
import re
from collections import Counter
import csv
import os
import time

In [None]:
# Single-character punctuation symbols counted in each file. The order of this
# list is also the column order used for the punctuation CSV header.
asm_punctuation = list(":,;.[]()*+-=#$@?&!|^<>\\_")

# Regular-expression patterns (escaped leading dot) for DLL-like file
# extensions. They are used both as search patterns and as CSV column names.
dll_extensions = [rf"\.{ext}" for ext in ("dll", "drv", "ocx", "cpl", "scr", "sys")]

In [None]:
def count_occurrences(file_path, punctuation=None, extensions=None):
    """Count punctuation symbols and DLL-style extensions in one text file.

    The whole file is read into memory and decoded as ISO-8859-1.

    Args:
        file_path: Path of the file to scan.
        punctuation: Iterable of literal strings to count. Defaults to the
            module-level ``asm_punctuation`` list.
        extensions: Iterable of regular-expression patterns, matched
            case-insensitively. Defaults to the module-level
            ``dll_extensions`` list.

    Returns:
        A ``(punct_counter, dll_counter)`` tuple of ``collections.Counter``
        objects keyed by the literal / pattern string. Every requested key is
        present, with a count of 0 when nothing matched.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    if punctuation is None:
        punctuation = asm_punctuation
    if extensions is None:
        extensions = dll_extensions

    # Read the file content
    with open(file_path, 'r', encoding='ISO-8859-1') as file:
        content = file.read()

    # Count punctuation occurrences. str.count yields the same non-overlapping
    # literal count as re.findall(re.escape(...)) but does not materialise a
    # list holding every single match.
    punct_counter = Counter()
    for punct in punctuation:
        punct_counter[punct] += content.count(punct)

    # Count DLL extension occurrences. Tally the matches lazily instead of
    # building the full match list just to measure its length.
    dll_counter = Counter()
    for dll_ext in extensions:
        pattern = re.compile(dll_ext, re.IGNORECASE)
        dll_counter[dll_ext] += sum(1 for _ in pattern.finditer(content))

    return punct_counter, dll_counter

# Entry point: builds both occurrence CSVs (this is the only function that needs to be run)
def write_occurences_csv(file_paths):
    """Count occurrences in every given file and write the two result CSVs.

    Each path is scanned with ``count_occurrences``; the punctuation counts go
    to ``punctuation_occurences.csv`` and the DLL-extension counts to
    ``dll_occurences.csv``, both relative to the current working directory.

    Args:
        file_paths: Paths of the files to scan. The collection is iterated
            once here and again by each CSV writer, so it should be a
            re-iterable sequence such as a list.
    """
    punct_by_file = {}
    dll_by_file = {}
    for path in file_paths:
        punct_by_file[path], dll_by_file[path] = count_occurrences(path)

    write_punctuation_to_csv('punctuation_occurences.csv', punct_by_file, file_paths)
    write_dll_to_csv('dll_occurences.csv', dll_by_file, file_paths)

# Derive the CSV 'ID' value from a file path
def _file_id(path):
    """Return the file name of ``path`` without its directory or extension.

    ``os.path.splitext`` only strips a real extension of the final path
    component, so a dot inside a directory name is never mistaken for one.
    """
    return os.path.splitext(os.path.basename(path))[0]


# Shared writer used by both public CSV functions below
def _write_counts_csv(output_csv, occurrences, filenames, columns):
    """Write one row of counts per file to ``output_csv``.

    Args:
        output_csv: Destination path; an existing file is overwritten.
        occurrences: Mapping of file name -> mapping of column key -> count.
        filenames: File names to emit, in row order. Each must be a key of
            ``occurrences``.
        columns: Column keys, in header order.

    Raises:
        KeyError: If a file name is missing from ``occurrences``.
        OSError: If the output file cannot be written.
    """
    columns = list(columns)  # iterated once per row, so pin it down first
    with open(output_csv, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)

        # Header: 'ID' followed by one column per counted key
        writer.writerow(['ID'] + columns)

        for filename in filenames:
            counts = occurrences[filename]
            # Keys that were never counted for this file are reported as 0
            writer.writerow([_file_id(filename)] + [counts.get(col, 0) for col in columns])


# Write punctuation occurrences to csv
def write_punctuation_to_csv(output_csv, punct_occurences, filenames, columns=None):
    """Write per-file punctuation counts to a CSV file.

    The first column, 'ID', is the file name without directory or extension
    (``train/abc.asm`` -> ``abc``); the remaining columns are the punctuation
    symbols.

    Args:
        output_csv: Destination CSV path.
        punct_occurences: Mapping of file name -> punctuation counter.
        filenames: File names to write, in row order.
        columns: Punctuation symbols to emit as columns. Defaults to the
            module-level ``asm_punctuation`` list.
    """
    _write_counts_csv(
        output_csv,
        punct_occurences,
        filenames,
        asm_punctuation if columns is None else columns,
    )


# Function to write DLL extension counts to a CSV file
def write_dll_to_csv(output_csv, dll_occurences, filenames, columns=None):
    """Write per-file DLL-extension counts to a CSV file.

    The first column, 'ID', is the file name without directory or extension
    (``train/abc.asm`` -> ``abc``); the remaining columns are the extension
    patterns.

    Args:
        output_csv: Destination CSV path.
        dll_occurences: Mapping of file name -> DLL-extension counter.
        filenames: File names to write, in row order.
        columns: Extension patterns to emit as columns. Defaults to the
            module-level ``dll_extensions`` list.
    """
    _write_counts_csv(
        output_csv,
        dll_occurences,
        filenames,
        dll_extensions if columns is None else columns,
    )

In [None]:
# Run on a single sample file; this writes punctuation_occurences.csv and
# dll_occurences.csv into the current working directory
write_occurences_csv(file_paths=['0ACDbR5M3ZhBJajygTuf.asm'])

In [None]:
# Print the punctuation CSV row by row to inspect the result.
# The csv module expects its file to be opened with newline='' so that it
# handles line endings (including newlines inside quoted fields) itself.
with open('punctuation_occurences.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        print(row)

['ID', ':', ',', ';', '.', '[', ']', '(', ')', '*', '+', '-', '=', '#', '$', '@', '?', '&', '!', '|', '^', '<', '>', '\\', '_']
['0ACDbR5M3ZhBJajygTuf', '275972', '97545', '108422', '277921', '698', '698', '1861', '86', '4217', '21624', '7250', '863', '0', '1670', '1996', '977', '1739', '1', '8', '0', '5', '5', '0', '1778']


In [None]:
# Print the DLL-extension CSV row by row to inspect the result.
# The csv module expects its file to be opened with newline='' so that it
# handles line endings (including newlines inside quoted fields) itself.
with open('dll_occurences.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        print(row)

['ID', '\\.dll', '\\.drv', '\\.ocx', '\\.cpl', '\\.scr', '\\.sys']
['0ACDbR5M3ZhBJajygTuf', '12', '0', '0', '0', '0', '0']


In [None]:
# Get list of train file names. os.listdir returns entries in arbitrary order,
# so sort them to make the CSV row order reproducible between runs.
train_dir = "train/"
filenames = sorted(
    train_dir + filename
    for filename in os.listdir(train_dir)
    if filename.endswith(".asm")
)

# Create occurrence CSV files and time it. perf_counter is a monotonic clock,
# so the measurement is unaffected by system clock adjustments.
start = time.perf_counter()
write_occurences_csv(filenames)
end = time.perf_counter()

# Print elapsed time as whole minutes and seconds
minutes, seconds = divmod(int(end - start), 60)
print(f"{minutes}m {seconds}s")

3m 33s
