In [5]:
import csv
from tqdm import tqdm

def process_memlog_in_chunks(file_path):
    """Split a memory-trace log into ``stores.csv`` and ``allocation_sites.csv``.

    The log is streamed line by line and handled in three phases:

    1. Every line up to and including the one containing ``'Parent PID'`` is
       skipped.
    2. Until the ``=== Allocation sites ===`` marker is seen, each line made of
       exactly two whitespace-separated tokens is written to ``stores.csv`` as
       an ``address,value`` row.  Lines with any other token count are ignored.
    3. After the marker, each line starting with ``Start`` opens a new
       allocation record; the start is the second token before the first comma
       and the size is the second token after it.  Every following line is
       collected as a stack-trace line of that record.  Records are written to
       ``allocation_sites.csv`` as ``start,size,stack_trace`` with the trace
       lines joined by single spaces.

    Blank lines are ignored once the ``Parent PID`` line has been seen.

    Args:
        file_path: Path of the log file to read.

    Raises:
        IndexError: If a ``Start`` line does not have the expected
            ``Start <addr>, <label> <size>`` layout.

    Side effects:
        Creates (or overwrites) ``stores.csv`` and ``allocation_sites.csv`` in
        the current working directory.
    """
    def write_site(writer, site):
        # ``site`` is [start, size, [stack-trace lines]].
        writer.writerow([site[0], site[1], ' '.join(site[2])])

    store_started = False
    allocation_started = False
    current_site = None

    # Context managers guarantee both outputs are flushed and closed even if
    # parsing raises part-way through the log.
    with open('stores.csv', 'w', newline='') as stores_file, \
            open('allocation_sites.csv', 'w', newline='') as allocation_sites_file:
        stores_writer = csv.writer(stores_file)
        allocation_sites_writer = csv.writer(allocation_sites_file)

        # Write header for the CSV files
        stores_writer.writerow(['address', 'value'])
        allocation_sites_writer.writerow(['start', 'size', 'stack_trace'])

        with open(file_path, 'r') as f:
            # First pass only counts lines so tqdm can show a percentage.
            total_lines = sum(1 for _ in f)
            f.seek(0)  # Rewind for the real parsing pass

            for line in tqdm(f, total=total_lines, desc="Processing lines", unit="line"):
                stripped = line.strip()

                # The 'Parent PID' line ends the preamble and is itself skipped.
                if 'Parent PID' in line:
                    store_started = True
                    continue
                if not store_started or not stripped:
                    continue

                if not allocation_started:
                    if stripped == '=== Allocation sites ===':
                        allocation_started = True
                        continue
                    # Store lines are "<address> <value>".  Anything else
                    # (e.g. the single-token line after 'Parent PID', or a
                    # malformed line) is skipped instead of aborting the run.
                    fields = stripped.split()
                    if len(fields) == 2:
                        stores_writer.writerow(fields)
                    continue

                if line.startswith('Start'):
                    parts = line.split(',')
                    start = parts[0].split()[1]
                    size = parts[1].split()[1]
                    # Flush the previous record before starting a new one.
                    if current_site is not None:
                        write_site(allocation_sites_writer, current_site)
                    current_site = [start, size, []]
                elif current_site is not None:
                    current_site[2].append(stripped)
                # else: a trace line with no preceding 'Start' record has
                # nothing to attach to, so it is dropped.

        # The last record has no following 'Start' line to trigger its write.
        if current_site is not None:
            write_site(allocation_sites_writer, current_site)

# Example usage: split memlog.log into stores.csv and allocation_sites.csv
process_memlog_in_chunks(file_path='memlog.log')

Processing lines: 100%|██████████| 622945/622945 [00:00<00:00, 976106.61line/s] 


In [1]:
import bisect
import csv
import math
import os
import struct

from tqdm import tqdm

def hex_to_double(hex_str):
    """Decode a hexadecimal string as a big-endian 8-byte double.

    The text is lower-cased, every ``0x`` occurrence is removed, and the
    result is left-padded with zeros to 16 hex digits before being unpacked
    with ``struct`` format ``'>d'``.

    Returns:
        The decoded float, or ``None`` (after printing a message) when the
        digits are not valid hex or do not form exactly 8 bytes.
    """
    hex_str = hex_str.lower().replace('0x', '').zfill(16)
    try:
        raw = bytes.fromhex(hex_str)
        (number,) = struct.unpack('>d', raw)
    except (ValueError, struct.error) as e:
        print(f"Error converting hex to double for '{hex_str}': {e}")
        return None
    return number

def hex_to_int(hex_str):
    """Parse ``hex_str`` as a base-16 integer.

    Returns:
        The parsed integer, or ``None`` (after printing a message) when
        ``int(hex_str, 16)`` raises ``ValueError``.
    """
    try:
        parsed = int(hex_str, 16)
    except ValueError:
        print(f"Invalid hexadecimal address: {hex_str}")
        parsed = None
    return parsed

def load_allocation_sites(allocation_sites_csv):
    """Load allocation records from a CSV with ``start,size,stack_trace`` columns.

    Rows whose ``start`` is not valid hexadecimal or whose ``size`` is not a
    string of digits are skipped.

    Args:
        allocation_sites_csv: Path of the CSV file to read.

    Returns:
        A list of dicts sorted by ascending numeric start address.  Each dict
        has the keys ``start_hex`` (text as read), ``start`` and ``end``
        (ints, ``end = start + size``), ``size``, ``stack_trace``, and the
        accumulator fields ``total_stores``, ``zero_values``, ``max_value``,
        ``min_value`` and ``stores`` initialised to 0 / 0 / None / None / [].

    Raises:
        KeyError: If the CSV header lacks one of the expected columns.
    """
    allocation_sites = []
    with open(allocation_sites_csv, 'r', newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            # DictReader fills cells missing from a short row with None.
            start_hex = (row['start'] or '').strip()
            size_text = (row['size'] or '').strip()
            start = hex_to_int(start_hex)
            size = int(size_text) if size_text.isdigit() else None
            if start is None or size is None:
                continue
            allocation_sites.append({
                'start_hex': start_hex,
                'start': start,
                'end': start + size,
                'size': size,
                'stack_trace': (row['stack_trace'] or '').strip(),
                'total_stores': 0,
                'zero_values': 0,
                'max_value': None,
                'min_value': None,
                'stores': []
            })
    # Sort numerically: ordering by the hex *string* would put '0x10000'
    # before '0x9000' and break binary-search lookups on 'start'.
    allocation_sites.sort(key=lambda site: site['start'])
    return allocation_sites

def find_allocation_site(allocation_sites, address):
    """Return the allocation record whose ``[start, end)`` range holds ``address``.

    Args:
        allocation_sites: Sequence of dicts with integer ``'start'`` and
            ``'end'`` keys, sorted by ascending ``'start'``.
        address: Integer address to look up.

    Returns:
        The record with the greatest ``start <= address`` if ``address`` is
        also below its ``end``; otherwise ``None``.
    """
    # Binary search with bisect_right semantics, done in place so that no
    # list of start addresses has to be rebuilt on every lookup.
    lo, hi = 0, len(allocation_sites)
    while lo < hi:
        mid = (lo + hi) // 2
        if address < allocation_sites[mid]['start']:
            hi = mid
        else:
            lo = mid + 1

    index = lo - 1  # last record whose start is <= address
    if index >= 0:
        site = allocation_sites[index]
        if site['start'] <= address < site['end']:
            return site
    return None

def analyze_allocation_sites_and_stores(stores_csv, allocation_sites_csv, output_csv, output_dir):
    """Attribute every store to its allocation site and write summary files.

    Each row of ``stores_csv`` (after the header) is read as
    ``address, value``: the address is parsed with ``hex_to_int`` and the
    value with ``hex_to_double``.  Stores whose address falls inside a loaded
    allocation site update that site's store count, zero count, and min/max.
    NaN values are counted as stores but excluded from min/max, because any
    comparison with NaN is False and would otherwise freeze the running
    min/max at NaN.

    Args:
        stores_csv: Path of the stores CSV (header row, then address/value).
        allocation_sites_csv: Path passed to ``load_allocation_sites``.
        output_csv: Path of the per-site summary CSV to write, ordered by
            ascending allocation size.
        output_dir: Directory (created if missing) that receives one
            ``<start_hex>.csv`` per site listing its stores.

    Returns:
        None.  Returns early, after printing a message, when no allocation
        sites load or the stores file has no header row.
    """
    print("Loading allocation sites...")
    allocation_sites = load_allocation_sites(allocation_sites_csv)
    if not allocation_sites:
        print("No valid allocation sites found. Exiting.")
        return
    # find_allocation_site binary-searches on the numeric 'start', so the
    # list must be in ascending address order before any lookup.
    allocation_sites.sort(key=lambda site: site['start'])
    os.makedirs(output_dir, exist_ok=True)
    print(f"Total allocation sites loaded: {len(allocation_sites)}")

    with open(stores_csv, 'r', newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        # Consume the header row; a missing header means the file is empty.
        if next(reader, None) is None:
            print("stores.csv is empty. Exiting.")
            return
        for line in tqdm(reader, desc="Processing stores", unit="store"):
            if len(line) < 2:
                continue
            address_hex = line[0].strip()
            address = hex_to_int(address_hex)
            value_hex = line[1].strip()
            value = hex_to_double(value_hex)
            if address is None or value is None:
                continue
            site = find_allocation_site(allocation_sites, address)
            if site is None:
                continue
            site['total_stores'] += 1
            site['stores'].append({
                'address': address_hex,
                'value': value
            })
            if value == 0.0:
                site['zero_values'] += 1
            if math.isnan(value):
                # Keep NaN out of min/max (see docstring).
                continue
            if (site['max_value'] is None) or (value > site['max_value']):
                site['max_value'] = value
            if (site['min_value'] is None) or (value < site['min_value']):
                site['min_value'] = value

    print("Writing analysis results to CSV...")
    allocation_sites.sort(key=lambda x: x['size'])
    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        fieldnames = ['start_hex', 'start', 'end', 'size', 'total_stores', 'zero_values', 'max_value', 'min_value', 'stack_trace']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for site in allocation_sites:
            writer.writerow({
                'start_hex': site['start_hex'],
                'start': site['start'],
                'end': site['end'],
                'size': site['size'],
                'total_stores': site['total_stores'],
                'zero_values': site['zero_values'],
                'max_value': site['max_value'] if site['max_value'] is not None else '',
                'min_value': site['min_value'] if site['min_value'] is not None else '',
                'stack_trace': site['stack_trace']
            })

    print("Writing stores to separate allocation site files...")
    # NOTE(review): the file name depends only on start_hex, so two sites that
    # share a start address write to the same path and the later one wins.
    for site in allocation_sites:
        filename = site['start_hex'] + ".csv"
        file = os.path.join(output_dir, filename)
        with open(file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['address', 'value'])
            writer.writeheader()
            writer.writerows(site['stores'])
    print(f"Analysis complete. Files saved to {output_dir}.")

# Analyse stores.csv against allocation_sites.csv; write the summary CSV and
# one per-site CSV under allocation_site_stores/.
analyze_allocation_sites_and_stores(
    'stores.csv',
    'allocation_sites.csv',
    output_csv='allocation_analysis.csv',
    output_dir='allocation_site_stores',
)


Loading allocation sites...
Total allocation sites loaded: 2243


Processing stores: 607447store [00:31, 19128.45store/s]


Writing analysis results to CSV...
Writing stores to separate allocation site files...
Analysis complete. Files saved to allocation_site_stores.
