In [10]:
import codecs
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Define the path to the 'train' folder
train_folder = './train'

## File size as Feature

In [2]:
start = time.time()

# Initialize lists to store data for each file
file_ids = []
byte_file_sizes = []
asm_file_sizes = []

# Iterate through the files in the 'train' folder
for filename in os.listdir(train_folder):
    if filename.endswith(".bytes"):
        # Extract file ID (without extension) and the size of the bytes file
        file_id = filename.replace('.bytes', '')
        file_size_bytes = os.path.getsize(os.path.join(train_folder, filename))
        
        # Check if corresponding .asm file exists and get its size
        asm_filename = f"{file_id}.asm"
        asm_file_size = os.path.getsize(os.path.join(train_folder, asm_filename)) if os.path.exists(os.path.join(train_folder, asm_filename)) else None
        
        # Append data to lists
        file_ids.append(file_id)
        byte_file_sizes.append(file_size_bytes)
        asm_file_sizes.append(asm_file_size)

# Create a pandas DataFrame with the collected data
df = pd.DataFrame({
    'File ID': file_ids,
    'Bytes File Size': byte_file_sizes,
    'ASM File Size': asm_file_sizes
})

# Create the 'Ratio' column (Bytes/ASM File Size), handling division by zero or missing ASM file sizes
df['Ratio'] = df['Bytes File Size'] / df['ASM File Size']

# Save the DataFrame to a CSV file
csv_filename = 'file_sizes.csv'
df.to_csv(csv_filename, index=False)

# Display the DataFrame and indicate CSV was saved
print(df)
print(f"DataFrame saved to {csv_filename}")

end = time.time()

print("Time taken: {} m {} s".format(int((end-start)//60), int((end-start)%60)))

                   File ID  Bytes File Size  ASM File Size     Ratio
0     04hSzLv5s2TDYPlcgpHB           460288        1059502  0.434438
1     05aiMRw13bYWqZ8OHvjl          7379456       15800755  0.467032
2     065EZhxgbLRSHsB87uIF          8359424       96072155  0.087012
3     08BX5Slp2I1FraZWbc6j          1098752        4738670  0.231869
4     0aVNj3qFgEZI6Akf4Kuv           445440        1414060  0.315008
...                    ...              ...            ...       ...
1595  LH5pzdDSPOtgIaBC1jWo           112628        1339326  0.084093
1596  ljFT1KeZmEiHxhuRbrcd           623616        4280258  0.145696
1597  ljuryB4bfagHqV5FM9Ae          2331136       11826930  0.197104
1598  loIP1tiwELF9YNZQjSUO          2331136       11816882  0.197272
1599  lS0IVqXeJrN6Dzi9Pap1           623616        3719060  0.167681

[1600 rows x 4 columns]
DataFrame saved to file_sizes.csv
Time taken: 0 m 1 s


## Prefixes Count

In [11]:
start = time.time()

prefixes = ['HEADER:', '.text:', '.Pav:', '.idata:', '.data:', '.bss:', 
            '.rdata:', '.edata:', '.rsrc:', '.tls:', '.reloc:', '.BSS:', '.CODE']

# Function to count prefixes in an asm file
def count_prefixes_in_file(filename):
    try:
        file_id = filename.replace('.asm', '')

        # Initialize a dictionary to store counts for this file
        file_prefix_count = {prefix: 0 for prefix in prefixes}

        # Open and read the asm file
        asm_path = os.path.join(train_folder, filename)
        with codecs.open(asm_path, 'r', encoding='cp1252', errors='replace') as file:
            for line in file:
                # Check each prefix and count its occurrences in the file
                for prefix in prefixes:
                    if prefix in line:
                        file_prefix_count[prefix] += 1

        # Return file_id and the prefix counts
        return (file_id, file_prefix_count)
    
    except Exception as e:
        # Handle exceptions and return an empty result
        print(f"Error processing file {filename}: {e}")
        return None

# Parallelize the process using ThreadPoolExecutor
file_ids = []
prefix_counts = {prefix: [] for prefix in prefixes}

# Create a list of asm files
asm_files = [f for f in os.listdir(train_folder) if f.endswith(".asm")]

# Use ThreadPoolExecutor to parallelize the process
with ThreadPoolExecutor() as executor:
    futures = {executor.submit(count_prefixes_in_file, asm_file): asm_file for asm_file in asm_files}

    for future in as_completed(futures):
        result = future.result()
        if result is not None:
            file_id, file_prefix_count = result
            file_ids.append(file_id)
            for prefix in prefixes:
                prefix_counts[prefix].append(file_prefix_count[prefix])

# Create a pandas DataFrame with the file ID and prefix counts
df2 = pd.DataFrame({'File ID': file_ids})
for prefix in prefixes:
    df2[prefix] = prefix_counts[prefix]

# Save the DataFrame to a CSV file
csv_filename = 'asm_prefix_counts.csv'
df2.to_csv(csv_filename, index=False)

# Print confirmation message
print(f"DataFrame saved to {csv_filename}")

end = time.time()

print("Time taken: {} m {} s".format(int((end-start)//60), int((end-start)%60)))

DataFrame saved to asm_prefix_counts.csv
Time taken: 65 m 10 s


In [13]:
print(df2)

                   File ID  HEADER:  .text:  .Pav:  .idata:   .data:  .bss:  \
0     0eN9lyQfwmTVk7C2ZoYp       25    1150      0        0     1039      0   
1     0hZEqJ5eMVjU21HAG7Ii       26    1283      0        0     1041      0   
2     04hSzLv5s2TDYPlcgpHB       26    9248      0      245     6331      0   
3     0ItXlAUOhK8ZYdDf7HW4       26    8770      0      206     4600     92   
4     0aVNj3qFgEZI6Akf4Kuv       26    8298      0      147     9831   7164   
...                    ...      ...     ...    ...      ...      ...    ...   
1595  loIP1tiwELF9YNZQjSUO       24     631      0      109   264208      0   
1596  JUO3pfywZnC4e9xHLBMA        0    5350      0      641     4443      0   
1597  KNP2ROq6J8YEcmyrtSjV        0   18101      0      353    30935      0   
1598  k2mxrqNg1JzRiVsIbytQ        0    7994      0      545  2355235      0   
1599  KqEgONxfHdP5lLaBIGQk        0    9828      0      607  2511620      0   

      .rdata:  .edata:  .rsrc:  .tls:  .reloc:  .BS